From d909529eb44155617fad828c94877cb0168e25fc Mon Sep 17 00:00:00 2001 From: rolfv Date: Fri, 7 Nov 2014 11:00:45 -0800 Subject: [PATCH 001/190] Add GPU packing and unpacking add cuda stream for submitting multiple kernels. add support for predefined datatypes. Conflicts: opal/datatype/opal_datatype_unpack.c test/datatype/ddt_test.c --- opal/datatype/Makefile.am | 6 +- opal/datatype/cuda/Makefile | 40 ++ opal/datatype/cuda/opal_datatype_cuda.cu | 78 +++ opal/datatype/cuda/opal_datatype_cuda.cuh | 42 ++ .../cuda/opal_datatype_cuda_internal.cuh | 397 ++++++++++++++ .../cuda/opal_datatype_pack_cuda_kernel.cu | 502 ++++++++++++++++++ .../cuda/opal_datatype_pack_cuda_wrapper.cu | 196 +++++++ .../cuda/opal_datatype_unpack_cuda_kernel.cu | 288 ++++++++++ .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 123 +++++ opal/datatype/opal_datatype_gpu.c | 167 ++++++ opal/datatype/opal_datatype_gpu.h | 40 ++ opal/datatype/opal_datatype_module.c | 11 + opal/datatype/opal_datatype_pack.c | 19 +- opal/datatype/opal_datatype_pack.h | 2 + opal/datatype/opal_datatype_unpack.c | 13 +- opal/include/opal_config_top.h | 2 + test/datatype/ddt_test.c | 122 ++++- 17 files changed, 2019 insertions(+), 29 deletions(-) create mode 100644 opal/datatype/cuda/Makefile create mode 100644 opal/datatype/cuda/opal_datatype_cuda.cu create mode 100644 opal/datatype/cuda/opal_datatype_cuda.cuh create mode 100644 opal/datatype/cuda/opal_datatype_cuda_internal.cuh create mode 100644 opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu create mode 100644 opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu create mode 100644 opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu create mode 100644 opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu create mode 100644 opal/datatype/opal_datatype_gpu.c create mode 100644 opal/datatype/opal_datatype_gpu.h diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am index 6002a739f20..7683c2e8786 100644 --- a/opal/datatype/Makefile.am +++ 
b/opal/datatype/Makefile.am @@ -32,7 +32,8 @@ headers = \ opal_datatype_memcpy.h \ opal_datatype_pack.h \ opal_datatype_prototypes.h \ - opal_datatype_unpack.h + opal_datatype_unpack.h \ + opal_datatype_gpu.h noinst_LTLIBRARIES = \ @@ -60,10 +61,11 @@ libdatatype_la_SOURCES = \ opal_datatype_get_count.c \ opal_datatype_module.c \ opal_datatype_optimize.c \ + opal_datatype_gpu.c \ opal_datatype_pack.c \ opal_datatype_position.c \ opal_datatype_resize.c \ - opal_datatype_unpack.c + opal_datatype_unpack.c libdatatype_la_LIBADD = libdatatype_reliable.la diff --git a/opal/datatype/cuda/Makefile b/opal/datatype/cuda/Makefile new file mode 100644 index 00000000000..d42ab556fae --- /dev/null +++ b/opal/datatype/cuda/Makefile @@ -0,0 +1,40 @@ +CC = gcc +NVCC = nvcc +ARCH = ar +ARCHFLAGS = cr +RANLIB = ranlib +STLIB ?= opal_datatype_cuda.a +DYLIB ?= opal_datatype_cuda.so +CFLAGS = -g -G -O0 +EXTLIB = -L/home/wwu12/ompi/ompi-cuda/opal/datatype/.libs -ldatatype +INC = + +SRC := \ + opal_datatype_cuda.cu \ + opal_datatype_pack_cuda_kernel.cu \ + opal_datatype_pack_cuda_wrapper.cu \ + opal_datatype_unpack_cuda_kernel.cu \ + opal_datatype_unpack_cuda_wrapper.cu \ + +OBJ := $(SRC:.cu=.o) + +.PHONY: all clean cleanall + +all: $(STLIB) $(DYLIB) + +$(STLIB): $(OBJ) + $(ARCH) $(ARCHFLAGS) $@ $(OBJ) + $(RANLIB) $@ + +$(DYLIB): $(OBJ) + $(NVCC) $(CFLAGS) $(EXTLIB) -shared --compiler-options '-fPIC' -o $(DYLIB) $(OBJ) + +%.o: %.cu + $(NVCC) $(CFLAGS) $(EXTLIB) -gencode arch=compute_35,code=sm_35 $(INC) -c --compiler-options '-fPIC' $< -o $@ + +clean: + rm -f *.o + +cleanall: clean + rm -f $(STLIB) + rm -f $(DYLIB) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu new file mode 100644 index 00000000000..ea1f3633480 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -0,0 +1,78 @@ +#include "opal_datatype_cuda_internal.cuh" +#include "opal_datatype_cuda.cuh" +#include +#include + +ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; 
+unsigned char *pBaseBuf_GPU, *gpu_src_const, *gpu_dest_const; +ddt_cuda_stream_t* cuda_streams; + +void opal_datatype_cuda_init(void) +{ + uint32_t i; + + int cuda_device = OPAL_GPU_INDEX; + cudaSetDevice(cuda_device); + + cudaMalloc((void **)&cuda_desc_d, sizeof(ddt_cuda_desc_t)); + cudaMallocHost((void **)&cuda_desc_h, sizeof(ddt_cuda_desc_t)); + printf("size cuda_desc %d\n", sizeof(ddt_cuda_desc_t)); + + printf("malloc iov\n"); + for (i = 0; i < IOV_ARRAY_SIZE; i++) { + void* iov_base; + cudaMalloc( (void **)&iov_base, sizeof(char)*IOV_LEN); + cuda_desc_h->iov[i].iov_base = iov_base; + cuda_desc_h->iov[i].iov_len = IOV_LEN; + } + cudaMalloc((void **)(&pBaseBuf_GPU), sizeof(char)*IOV_LEN); + gpu_src_const = pBaseBuf_GPU; + gpu_dest_const = (unsigned char*)cuda_desc_h->iov[0].iov_base; + + cuda_desc_h->description_max_count = 0; + cuda_desc_h->description_count = 0; + + cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); + /* init cuda stream */ + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamCreate(&(cuda_streams->opal_cuda_stream[i])); + } + cuda_streams->current_stream_id = 0; +} + +void opal_datatype_cuda_fini(void) +{ + uint32_t i; + + if (cuda_desc_d != NULL) { + cudaFree(cuda_desc_d); + cuda_desc_d = NULL; + } + if (cuda_desc_h->description != NULL) { + cudaFree(cuda_desc_h->description); + cuda_desc_h->description = NULL; + } + printf("free iov\n"); + if (cuda_desc_h != NULL) { + for (i = 0; i < IOV_ARRAY_SIZE; i++) { + cudaFree(cuda_desc_h->iov[i].iov_base); + cuda_desc_h->iov[i].iov_base = NULL; + } + + cudaFreeHost(cuda_desc_h); + cuda_desc_h = NULL; + } + + /* destory cuda stream */ + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamDestroy(cuda_streams->opal_cuda_stream[i]); + } + free(cuda_streams); +} + +void opal_cuda_sync_device(void) +{ + cudaDeviceSynchronize(); + pBaseBuf_GPU = gpu_src_const; + cuda_desc_h->iov[0].iov_base = (void*)gpu_dest_const; +} \ No newline at end of file diff --git 
a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh new file mode 100644 index 00000000000..82ab78b2ff7 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -0,0 +1,42 @@ +#ifndef OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED +#define OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED + +extern "C" +{ + +void opal_datatype_cuda_init(void); + +void opal_datatype_cuda_fini(void); + +int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +void opal_cuda_sync_device(void); +} + +#endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh new file mode 100644 index 00000000000..84fbbe856a0 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -0,0 +1,397 @@ +#ifndef OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED +#define OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED + +#include +#include + +//#define OPAL_DATATYPE_CUDA_DRY_RUN +//#define OPAL_DATATYPE_CUDA_DEBUG +//#define OPAL_DATATYPE_CUDA_KERNEL_TIME +#define OPAL_ENABLE_DEBUG 1 + +#define DT_STATIC_STACK_SIZE 5 /**< This should be sufficient for most applications */ +#define IOV_ARRAY_SIZE 10 +#define IOV_LEN 1024*1024*200 + +#define 
THREAD_PER_BLOCK 32 +#define TASK_PER_THREAD 1 +#define OPAL_GPU_INDEX 0 +#define NB_STREAMS 4 + +#define OPAL_PTRDIFF_TYPE ptrdiff_t + +/* keep the last 16 bits free for data flags */ +#define CONVERTOR_DATATYPE_MASK 0x0000FFFF +#define CONVERTOR_SEND_CONVERSION 0x00010000 +#define CONVERTOR_RECV 0x00020000 +#define CONVERTOR_SEND 0x00040000 +#define CONVERTOR_HOMOGENEOUS 0x00080000 +#define CONVERTOR_NO_OP 0x00100000 +#define CONVERTOR_WITH_CHECKSUM 0x00200000 +#define CONVERTOR_CUDA 0x00400000 +#define CONVERTOR_CUDA_ASYNC 0x00800000 +#define CONVERTOR_TYPE_MASK 0x00FF0000 +#define CONVERTOR_STATE_START 0x01000000 +#define CONVERTOR_STATE_COMPLETE 0x02000000 +#define CONVERTOR_STATE_ALLOC 0x04000000 +#define CONVERTOR_COMPLETED 0x08000000 + +#define OPAL_DATATYPE_LOOP 0 +#define OPAL_DATATYPE_END_LOOP 1 +#define OPAL_DATATYPE_LB 2 +#define OPAL_DATATYPE_UB 3 +#define OPAL_DATATYPE_FIRST_TYPE 4 /* Number of first real type */ +#define OPAL_DATATYPE_INT1 4 +#define OPAL_DATATYPE_INT2 5 +#define OPAL_DATATYPE_INT4 6 +#define OPAL_DATATYPE_INT8 7 +#define OPAL_DATATYPE_INT16 8 +#define OPAL_DATATYPE_UINT1 9 +#define OPAL_DATATYPE_UINT2 10 +#define OPAL_DATATYPE_UINT4 11 +#define OPAL_DATATYPE_UINT8 12 +#define OPAL_DATATYPE_UINT16 13 +#define OPAL_DATATYPE_FLOAT2 14 +#define OPAL_DATATYPE_FLOAT4 15 +#define OPAL_DATATYPE_FLOAT8 16 +#define OPAL_DATATYPE_FLOAT12 17 +#define OPAL_DATATYPE_FLOAT16 18 +#define OPAL_DATATYPE_FLOAT_COMPLEX 19 +#define OPAL_DATATYPE_DOUBLE_COMPLEX 20 +#define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX 21 +#define OPAL_DATATYPE_BOOL 22 +#define OPAL_DATATYPE_WCHAR 23 +#define OPAL_DATATYPE_UNAVAILABLE 24 + +/* flags for the datatypes. 
*/ +#define OPAL_DATATYPE_FLAG_UNAVAILABLE 0x0001 /**< datatypes unavailable on the build (OS or compiler dependant) */ +#define OPAL_DATATYPE_FLAG_PREDEFINED 0x0002 /**< cannot be removed: initial and predefined datatypes */ +#define OPAL_DATATYPE_FLAG_COMMITED 0x0004 /**< ready to be used for a send/recv operation */ +#define OPAL_DATATYPE_FLAG_OVERLAP 0x0008 /**< datatype is unpropper for a recv operation */ +#define OPAL_DATATYPE_FLAG_CONTIGUOUS 0x0010 /**< contiguous datatype */ +#define OPAL_DATATYPE_FLAG_NO_GAPS 0x0020 /**< no gaps around the datatype, aka OPAL_DATATYPE_FLAG_CONTIGUOUS and extent == size */ +#define OPAL_DATATYPE_FLAG_USER_LB 0x0040 /**< has a user defined LB */ +#define OPAL_DATATYPE_FLAG_USER_UB 0x0080 /**< has a user defined UB */ +#define OPAL_DATATYPE_FLAG_DATA 0x0100 /**< data or control structure */ +/* + * We should make the difference here between the predefined contiguous and non contiguous + * datatypes. The OPAL_DATATYPE_FLAG_BASIC is held by all predefined contiguous datatypes. + */ +#define OPAL_DATATYPE_FLAG_BASIC (OPAL_DATATYPE_FLAG_PREDEFINED | \ + OPAL_DATATYPE_FLAG_CONTIGUOUS | \ + OPAL_DATATYPE_FLAG_NO_GAPS | \ + OPAL_DATATYPE_FLAG_DATA | \ + OPAL_DATATYPE_FLAG_COMMITED) + +/* typedefs ***********************************************************/ + +typedef struct opal_object_t opal_object_t; +typedef struct opal_class_t opal_class_t; +typedef void (*opal_construct_t) (opal_object_t *); +typedef void (*opal_destruct_t) (opal_object_t *); + + +/* types **************************************************************/ + +/** +* Class descriptor. +* +* There should be a single instance of this descriptor for each class +* definition. 
+*/ +struct opal_class_t { + const char *cls_name; /**< symbolic name for class */ + opal_class_t *cls_parent; /**< parent class descriptor */ + opal_construct_t cls_construct; /**< class constructor */ + opal_destruct_t cls_destruct; /**< class destructor */ + int cls_initialized; /**< is class initialized */ + int cls_depth; /**< depth of class hierarchy tree */ + opal_construct_t *cls_construct_array; + /**< array of parent class constructors */ + opal_destruct_t *cls_destruct_array; + /**< array of parent class destructors */ + size_t cls_sizeof; /**< size of an object instance */ +}; + +/** + * Base object. + * + * This is special and does not follow the pattern for other classes. + */ +struct opal_object_t { +#if OPAL_ENABLE_DEBUG + /** Magic ID -- want this to be the very first item in the + struct's memory */ + uint64_t obj_magic_id; +#endif + opal_class_t *obj_class; /**< class descriptor */ + volatile int32_t obj_reference_count; /**< reference count */ +#if OPAL_ENABLE_DEBUG + const char* cls_init_file_name; /**< In debug mode store the file where the object get contructed */ + int cls_init_lineno; /**< In debug mode store the line number where the object get contructed */ +#endif /* OPAL_ENABLE_DEBUG */ +}; + + + +struct ddt_elem_id_description { + uint16_t flags; /**< flags for the record */ + uint16_t type; /**< the basic data type id */ +}; +typedef struct ddt_elem_id_description ddt_elem_id_description; + +/* the basic element. A data description is composed + * by a set of basic elements. 
+ */ +struct ddt_elem_desc { + ddt_elem_id_description common; /**< basic data description and flags */ + uint32_t count; /**< number of blocks */ + uint32_t blocklen; /**< number of elements on each block */ + OPAL_PTRDIFF_TYPE extent; /**< extent of each block (in bytes) */ + OPAL_PTRDIFF_TYPE disp; /**< displacement of the first block */ +}; +typedef struct ddt_elem_desc ddt_elem_desc_t; + +struct ddt_loop_desc { + ddt_elem_id_description common; /**< basic data description and flags */ + uint32_t loops; /**< number of elements */ + uint32_t items; /**< number of items in the loop */ + size_t unused; /**< not used right now */ + OPAL_PTRDIFF_TYPE extent; /**< extent of the whole loop */ +}; +typedef struct ddt_loop_desc ddt_loop_desc_t; + +struct ddt_endloop_desc { + ddt_elem_id_description common; /**< basic data description and flags */ + uint32_t items; /**< number of elements */ + uint32_t unused; /**< not used right now */ + size_t size; /**< real size of the data in the loop */ + OPAL_PTRDIFF_TYPE first_elem_disp; /**< the displacement of the first block in the loop */ +}; +typedef struct ddt_endloop_desc ddt_endloop_desc_t; + +union dt_elem_desc { + ddt_elem_desc_t elem; + ddt_loop_desc_t loop; + ddt_endloop_desc_t end_loop; +}; +typedef union dt_elem_desc dt_elem_desc_t; + +/* dt_type_description */ +typedef uint32_t opal_datatype_count_t; + +struct dt_type_desc_t { + opal_datatype_count_t length; /**< the maximum number of elements in the description array */ + opal_datatype_count_t used; /**< the number of used elements in the description array */ + dt_elem_desc_t* desc; +}; +typedef struct dt_type_desc_t dt_type_desc_t; + +/* + * The datatype description. + */ +#define OPAL_DATATYPE_MAX_PREDEFINED 25 +#define OPAL_DATATYPE_MAX_SUPPORTED 47 +#define OPAL_MAX_OBJECT_NAME 64 + +struct opal_datatype_t { + opal_object_t super; /**< basic superclass */ + uint16_t flags; /**< the flags */ + uint16_t id; /**< data id, normally the index in the data array. 
*/ + uint32_t bdt_used; /**< bitset of which basic datatypes are used in the data description */ + size_t size; /**< total size in bytes of the memory used by the data if + the data is put on a contiguous buffer */ + OPAL_PTRDIFF_TYPE true_lb; /**< the true lb of the data without user defined lb and ub */ + OPAL_PTRDIFF_TYPE true_ub; /**< the true ub of the data without user defined lb and ub */ + OPAL_PTRDIFF_TYPE lb; /**< lower bound in memory */ + OPAL_PTRDIFF_TYPE ub; /**< upper bound in memory */ + /* --- cacheline 1 boundary (64 bytes) --- */ + size_t nbElems; /**< total number of elements inside the datatype */ + uint32_t align; /**< data should be aligned to */ + + /* Attribute fields */ + char name[OPAL_MAX_OBJECT_NAME]; /**< name of the datatype */ + /* --- cacheline 2 boundary (128 bytes) was 8-12 bytes ago --- */ + dt_type_desc_t desc; /**< the data description */ + dt_type_desc_t opt_desc; /**< short description of the data used when conversion is useless + or in the send case (without conversion) */ + + uint32_t btypes[OPAL_DATATYPE_MAX_SUPPORTED]; + /**< basic elements count used to compute the size of the + datatype for remote nodes. The length of the array is dependent on + the maximum number of datatypes of all top layers. + Reason being is that Fortran is not at the OPAL layer. 
*/ + /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */ + + /* size: 352, cachelines: 6, members: 15 */ + /* last cacheline: 28-32 bytes */ +}; + +typedef struct opal_datatype_t opal_datatype_t; + +/* convertor and stack */ +typedef struct opal_convertor_t opal_convertor_t; + +typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); +typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata ); +typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n, opal_convertor_t* pConvertor ); + +/* The master convertor struct (defined in convertor_internal.h) */ +struct opal_convertor_master_t; + +struct dt_stack_t { + int32_t index; /**< index in the element description */ + int16_t type; /**< the type used for the last pack/unpack (original or OPAL_DATATYPE_UINT1) */ + size_t count; /**< number of times we still have to do it */ + OPAL_PTRDIFF_TYPE disp; /**< actual displacement depending on the count field */ +}; +typedef struct dt_stack_t dt_stack_t; + +typedef int32_t (*conversion_fct_t)( opal_convertor_t* pConvertor, uint32_t count, + const void* from, size_t from_len, OPAL_PTRDIFF_TYPE from_extent, + void* to, size_t to_length, OPAL_PTRDIFF_TYPE to_extent, + OPAL_PTRDIFF_TYPE *advance ); + +typedef struct opal_convertor_master_t { + struct opal_convertor_master_t* next; + uint32_t remote_arch; + uint32_t flags; + uint32_t hetero_mask; + const size_t remote_sizes[OPAL_DATATYPE_MAX_PREDEFINED]; + conversion_fct_t* pFunctions; /**< the convertor functions pointer */ +} opal_convertor_master_t; + +struct opal_convertor_t { + opal_object_t super; /**< basic superclass */ + uint32_t remoteArch; /**< the remote architecture */ + uint32_t flags; /**< the properties of this convertor */ + size_t local_size; /**< overall length data on local machine, compared to bConverted */ + size_t remote_size; /**< overall length data on remote machine, compared to bConverted 
*/ + const opal_datatype_t* pDesc; /**< the datatype description associated with the convertor */ + const dt_type_desc_t* use_desc; /**< the version used by the convertor (normal or optimized) */ + opal_datatype_count_t count; /**< the total number of full datatype elements */ + uint32_t stack_size; /**< size of the allocated stack */ + /* --- cacheline 1 boundary (64 bytes) --- */ + unsigned char* pBaseBuf; /**< initial buffer as supplied by the user */ + dt_stack_t* pStack; /**< the local stack for the actual conversion */ + convertor_advance_fct_t fAdvance; /**< pointer to the pack/unpack functions */ + struct opal_convertor_master_t* master; /**< the master convertor */ + + /* All others fields get modified for every call to pack/unpack functions */ + uint32_t stack_pos; /**< the actual position on the stack */ + uint32_t partial_length; /**< amount of data left over from the last unpack */ + size_t bConverted; /**< # of bytes already converted */ + uint32_t checksum; /**< checksum computed by pack/unpack operation */ + uint32_t csum_ui1; /**< partial checksum computed by pack/unpack operation */ + size_t csum_ui2; /**< partial checksum computed by pack/unpack operation */ + /* --- cacheline 2 boundary (128 bytes) --- */ + dt_stack_t static_stack[DT_STATIC_STACK_SIZE]; /**< local stack for small datatypes */ + /* --- cacheline 3 boundary (192 bytes) was 56 bytes ago --- */ + +#if OPAL_CUDA_SUPPORT + memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ + void * stream; /**< CUstream for async copy */ +#endif + /* size: 248, cachelines: 4, members: 20 */ + /* last cacheline: 56 bytes */ +}; + +struct iovec { + void *iov_base; /* Starting address */ + size_t iov_len; /* Length in bytes */ +}; + +typedef struct { + dt_stack_t pStack[DT_STATIC_STACK_SIZE]; + dt_elem_desc_t* description; + struct iovec iov[IOV_ARRAY_SIZE]; + uint32_t stack_pos; + uint32_t stack_size; + unsigned char* pBaseBuf; /* const */ + OPAL_PTRDIFF_TYPE lb; /* const */ + OPAL_PTRDIFF_TYPE ub; /* 
const */ + size_t bConverted; + size_t local_size; /* const */ + uint32_t out_size; + size_t max_data; + uint32_t description_count; + uint32_t description_max_count; +} ddt_cuda_desc_t; + +typedef struct { + cudaStream_t opal_cuda_stream[NB_STREAMS]; + uint32_t current_stream_id; +} ddt_cuda_stream_t; + +extern ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; +extern unsigned char* pBaseBuf_GPU; +extern ddt_cuda_stream_t* cuda_streams; + +#define SAVE_STACK( PSTACK, INDEX, TYPE, COUNT, DISP) \ +do { \ + (PSTACK)->index = (INDEX); \ + (PSTACK)->type = (TYPE); \ + (PSTACK)->count = (COUNT); \ + (PSTACK)->disp = (DISP); \ +} while(0) + +#define PUSH_STACK( PSTACK, STACK_POS, INDEX, TYPE, COUNT, DISP) \ +do { \ + dt_stack_t* pTempStack = (PSTACK) + 1; \ + if (threadIdx.x == 0) { \ + SAVE_STACK( pTempStack, (INDEX), (TYPE), (COUNT), (DISP) ); \ + } \ + __syncthreads(); \ + (STACK_POS)++; \ + (PSTACK) = pTempStack; \ +} while(0) + +#define UPDATE_INTERNAL_COUNTERS( DESCRIPTION, POSITION, ELEMENT, COUNTER ) \ + do { \ + (ELEMENT) = &((DESCRIPTION)[(POSITION)]); \ + (COUNTER) = (ELEMENT)->elem.count; \ + } while (0) + +#if defined (OPAL_DATATYPE_CUDA_DEBUG) +#define DBGPRINT(fmt, ...) printf(fmt, __VA_ARGS__) +#else +#define DBGPRINT(fmt, ...) 
+#endif + +__device__ void pack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +__device__ void unpack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +__global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc); + +__global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_desc); + +__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ); + +__global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ); + +extern "C" +{ +int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor, size_t* position ); +} + +#endif /* OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu new file mode 100644 index 00000000000..d56ebfe6954 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -0,0 +1,502 @@ +#include "opal_datatype_cuda_internal.cuh" +#include +#include + +__device__ void pack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _src_disp = (*SOURCE) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t _i, tid, num_threads; + unsigned char* _destination = *DESTINATION; +// unsigned char* _source = _src_disp; + uint32_t gap, nb_elements; + double *_source_tmp, *_destination_tmp, 
*_src_disp_tmp; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + num_threads = gridDim.x * blockDim.x; + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +// num_task_per_thread = _copy_loops / num_threads; +// residue = _copy_loops % num_threads; +// if ( ((tid < residue) && (residue != 0)) || (residue == 0) ) { +// num_task_per_thread += residue == 0 ? 0 : 1; +// start_index = tid * num_task_per_thread; +// } else { +// start_index = residue * (num_task_per_thread+1) + (tid-residue) * num_task_per_thread; +// } +// +// end_index = start_index + num_task_per_thread; +// DBGPRINT("tid %d, start %d, end %d, num_task_per_thread %d, copy_loops %d\n", tid, start_index, end_index, num_task_per_thread, _copy_loops); +// for( _i = start_index; _i < end_index; _i++ ) { +// // OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _loop->extent, (CONVERTOR)->pBaseBuf, +// // (CONVERTOR)->pDesc, (CONVERTOR)->count ); +// _source = _src_disp + _i * _loop->extent; +// _destination = *DESTINATION + _i * _end_loop->size; +// DBGPRINT("tid %d, pack 3. 
memcpy( %p, %p, %lu ) => space %lu, _i %d\n", +// tid, _destination, _source, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size), _i ); +// // MEMCPY_CSUM( *(DESTINATION), _source, _end_loop->size, (CONVERTOR) ); +// #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) +// // memcpy(_destination, _source, _end_loop->size); +// _source_tmp = (double *)_source; +// _destination_tmp = (double *)_destination; +// for (_j = 0; _j < _end_loop->size/8; _j++) +// { +// *_destination_tmp = *_source_tmp; +// _destination_tmp ++; +// _source_tmp ++; +// } +// #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ +// } + + gap = (_loop->extent - _end_loop->size) / 8; + nb_elements = _end_loop->size / 8; + _src_disp_tmp = (double*)_src_disp; + _destination_tmp = (double*)_destination; + _destination_tmp += tid; + + __syncthreads(); + + for (_i = tid; _i < _copy_loops*nb_elements; _i+=num_threads) { + _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (_i % nb_elements == 0 ) { + DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + } + // if (_i / nb_elements ==1 && tid == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + // } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + *_destination_tmp = *_source_tmp; +#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ + _destination_tmp += num_threads; + + } + *(SOURCE) = _src_disp + _copy_loops*_loop->extent - _end_loop->first_elem_disp; + *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; + + __syncthreads(); +} + +__device__ void pack_predefined_data_cuda_kernel( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + uint32_t _copy_count = *(COUNT); + size_t _copy_blength; + ddt_elem_desc_t* _elem = &((ELEM)->elem); + unsigned char* _src_disp = (*SOURCE) + _elem->disp; + uint32_t _i, tid, num_threads; + unsigned char* _destination = *DESTINATION; + uint32_t gap, nb_elements; + double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; + + _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; + if( (_copy_count * _copy_blength) > *(SPACE) ) { + _copy_count = (uint32_t)(*(SPACE) / _copy_blength); + if( 0 == _copy_count ) return; /* nothing to do */ + } + + tid = threadIdx.x + blockIdx.x * blockDim.x; + num_threads = gridDim.x * blockDim.x; + + gap = (_elem->extent - _copy_blength) / 8; + nb_elements = _copy_blength / 8; + _src_disp_tmp = (double*)_src_disp; + _destination_tmp = (double*)_destination; + _source_tmp = _src_disp_tmp + tid; + _destination_tmp += tid; + + __syncthreads(); + + for (_i = tid; _i < _copy_count*nb_elements; _i+=num_threads) { + _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (_i == 0 ) { + DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + tid, _destination_tmp, _source_tmp, (unsigned long)_copy_blength*_copy_count, (unsigned long)(*(SPACE) - _i/nb_elements * _copy_blength), _i/nb_elements, _i ); + } + // if (_i / nb_elements ==1 && tid == 0 ) { + // DBGPRINT("tid %d, pack 3. 
memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + // } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + *_destination_tmp = *_source_tmp; +#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + _destination_tmp += num_threads; + + } + + _copy_blength *= _copy_count; + *(SOURCE) = _src_disp + _elem->extent*_copy_count - _elem->disp; + *(DESTINATION) += _copy_blength; + *(SPACE) -= _copy_blength; + *(COUNT) -= _copy_count; + + __syncthreads(); +} + +__global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) +{ + dt_stack_t *pStack, *pStack_head; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_packed = 0; /* total amount packed this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; + size_t iov_len_local; + uint32_t iov_count; + uint32_t stack_pos; + struct iovec* iov; + + OPAL_PTRDIFF_TYPE lb; + OPAL_PTRDIFF_TYPE ub; + uint32_t out_size; + uint32_t tid; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + + __shared__ ddt_cuda_desc_t cuda_desc_b; + + if (threadIdx.x == 0) { + memcpy(&cuda_desc_b, cuda_desc, sizeof(ddt_cuda_desc_t)); + } + __syncthreads(); + + // load cuda descriptor from constant memory + iov = cuda_desc_b.iov; + pStack_head = cuda_desc_b.pStack; + pStack = pStack_head; + description = cuda_desc_b.description; + stack_pos = cuda_desc_b.stack_pos; + pBaseBuf = cuda_desc_b.pBaseBuf; + lb = cuda_desc_b.lb; + ub = cuda_desc_b.ub; + out_size = cuda_desc_b.out_size; + + pStack = pStack + stack_pos; + pos_desc = pStack->index; + conv_ptr = pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + 
pStack--; + stack_pos--; + pElem = &(description[pos_desc]); + +// printf("pack start pos_desc %d count_desc %d disp %ld, stack_pos %d pos_desc %d count_desc %d disp %ld\n", +// pos_desc, count_desc, (long)(conv_ptr - pBaseBuf), stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp); + + for( iov_count = 0; iov_count < out_size; iov_count++ ) { + iov_ptr = (unsigned char *) iov[iov_count].iov_base; + iov_len_local = iov[iov_count].iov_len; + DBGPRINT("iov_len_local %lu, flags %d, types %d, count %d\n", iov_len_local, description->elem.common.flags, description->elem.common.type, description->elem.count); + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + // conv_ptr, iov_ptr, iov_len_local ); + pack_predefined_data_cuda_kernel(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + if( 0 == count_desc ) { /* completed */ + conv_ptr = pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + // DO_DEBUG( opal_output( 0, "pack end_loop count %d stack_pos %d" + // " pos_desc %d disp %ld space %lu\n", + // (int)pStack->count, pConvertor->stack_pos, + // pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + if (threadIdx.x == 0) { + (pStack->count)--; + } + __syncthreads(); + + if( (pStack->count) == 0 ) { /* end of loop */ + if( 0 == stack_pos ) { + /* we lie about the size of the next element in order to + * make sure we exit the main loop. 
+ */ + out_size = iov_count; + goto complete_loop; /* completed */ + } + stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if (threadIdx.x == 0) { + if( pStack->index == -1 ) { + pStack->disp += (ub - lb); + } else { + // assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + __syncthreads(); + } + conv_ptr = pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + // DO_DEBUG( opal_output( 0, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + // (int)pStack->count, pConvertor->stack_pos, pos_desc, + // count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + pack_contiguous_loop_cuda_kernel( pElem, &count_desc, + &conv_ptr, &iov_ptr, &iov_len_local ); + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. 
*/ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + + PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + + pos_desc++; + update_loop_description: /* update the current state */ + conv_ptr = pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); + continue; + } + } + complete_loop: + if (threadIdx.x == 0) { + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + } + __syncthreads(); + total_packed += iov[iov_count].iov_len; + } + + if (tid == 0) { + cuda_desc->max_data = total_packed; + cuda_desc->out_size = iov_count; + // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ + // if( cuda_desc->bConverted == cuda_desc->local_size ) { + // cuda_desc->stack_pos = stack_pos; + // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); + // return; + // } + // /* Save the global position for the next round */ + // PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, + // conv_ptr - pBaseBuf ); + // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); + // cuda_desc->stack_pos = stack_pos; + } + __syncthreads(); + + return; +} + +// __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) +// { +// dt_stack_t *pStack, *pStack_head; /* pointer to the position on the stack */ +// uint32_t pos_desc; /* actual position in the description of the derived datatype */ +// uint32_t count_desc; /* the number of items already done in the actual pos_desc */ +// size_t total_packed = 0; /* total amount packed this time */ +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; +// size_t iov_len_local; +// uint32_t iov_count; +// uint32_t stack_pos; +// struct iovec* iov; +// +// 
OPAL_PTRDIFF_TYPE lb; +// OPAL_PTRDIFF_TYPE ub; +// uint32_t out_size; +// uint32_t tid; +// +// tid = threadIdx.x + blockIdx.x * blockDim.x; +// +// __shared__ ddt_cuda_desc_t cuda_desc_b; +// +// if (threadIdx.x == 0) { +// memcpy(&cuda_desc_b, cuda_desc, sizeof(ddt_cuda_desc_t)); +// } +// __syncthreads(); +// +// +// // load cuda descriptor from constant memory +// iov = cuda_desc_b.iov; +// pStack_head = cuda_desc_b.pStack; +// pStack = pStack_head; +// description = cuda_desc_b.description; +// stack_pos = cuda_desc_b.stack_pos; +// pBaseBuf = cuda_desc_b.pBaseBuf; +// lb = cuda_desc_b.lb; +// ub = cuda_desc_b.ub; +// out_size = cuda_desc_b.out_size; +// +// pStack = pStack + stack_pos; +// pos_desc = pStack->index; +// conv_ptr = pBaseBuf + pStack->disp; +// count_desc = (uint32_t)pStack->count; +// pStack--; +// stack_pos--; +// pElem = &(description[pos_desc]); +// +// // printf("pack start pos_desc %d count_desc %d disp %ld, stack_pos %d pos_desc %d count_desc %d disp %ld\n", +// // pos_desc, count_desc, (long)(conv_ptr - pBaseBuf), stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp); +// +// if (threadIdx.x == 0) { +// for( iov_count = 0; iov_count < out_size; iov_count++ ) { +// iov_ptr = (unsigned char *) iov[iov_count].iov_base; +// iov_len_local = iov[iov_count].iov_len; +// DBGPRINT("iov_len_local %lu, flags %d, types %d, count %d\n", iov_len_local, description->elem.common.flags, description->elem.common.type, description->elem.count); +// while( 1 ) { +// while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { +// /* now here we have a basic datatype */ +// // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, +// // conv_ptr, iov_ptr, iov_len_local ); +// if( 0 == count_desc ) { /* completed */ +// conv_ptr = pBaseBuf + pStack->disp; +// pos_desc++; /* advance to the next data */ +// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +// continue; +// } +// goto complete_loop; +// } +// if( 
OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ +// // DO_DEBUG( opal_output( 0, "pack end_loop count %d stack_pos %d" +// // " pos_desc %d disp %ld space %lu\n", +// // (int)pStack->count, pConvertor->stack_pos, +// // pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); +// +// if( --(pStack->count) == 0 ) { /* end of loop */ +// if( 0 == stack_pos ) { +// /* we lie about the size of the next element in order to +// * make sure we exit the main loop. +// */ +// out_size = iov_count; +// goto complete_loop; /* completed */ +// } +// stack_pos--; +// pStack--; +// pos_desc++; +// } else { +// pos_desc = pStack->index + 1; +// if( pStack->index == -1 ) { +// pStack->disp += (ub - lb); +// } else { +// // assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); +// pStack->disp += description[pStack->index].loop.extent; +// } +// +// } +// conv_ptr = pBaseBuf + pStack->disp; +// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +// // DO_DEBUG( opal_output( 0, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", +// // (int)pStack->count, pConvertor->stack_pos, pos_desc, +// // count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); +// } +// if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { +// OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; +// if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { +// // pack_contiguous_loop_cuda_kernel( pElem, &count_desc, +// // &conv_ptr, &iov_ptr, &iov_len_local ); +// count_desc = 0; +// if( 0 == count_desc ) { /* completed */ +// pos_desc += pElem->loop.items + 1; +// goto update_loop_description; +// } +// /* Save the stack with the correct last_count value. 
*/ +// } +// local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; +// +// PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, +// pStack->disp + local_disp); +// +// pos_desc++; +// update_loop_description: /* update the current state */ +// conv_ptr = pBaseBuf + pStack->disp; +// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +// // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); +// continue; +// } +// } +// complete_loop: +// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ +// total_packed += iov[iov_count].iov_len; +// } +// +// } +// __syncthreads(); +// if (tid == 0) { +// cuda_desc->max_data = total_packed; +// cuda_desc->out_size = iov_count; +// // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ +// // if( cuda_desc->bConverted == cuda_desc->local_size ) { +// // cuda_desc->stack_pos = stack_pos; +// // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); +// // return; +// // } +// // /* Save the global position for the next round */ +// // PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, +// // conv_ptr - pBaseBuf ); +// // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); +// // cuda_desc->stack_pos = stack_pos; +// } +// return; +// } + +__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ) +{ + uint32_t _i, tid, num_threads; + uint32_t gap, nb_elements; + double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + num_threads = gridDim.x * blockDim.x; + + gap = (extent - size) / 8; + nb_elements = size / 8; + _src_disp_tmp = (double*)source; + _destination_tmp = (double*)destination; + _source_tmp = _src_disp_tmp + tid; + _destination_tmp += tid; + + for (_i 
= tid; _i < copy_loops*nb_elements; _i+=num_threads) { + _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (_i % nb_elements == 0 ) { + DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => _i %d, actual _i %d, count %d\n", + tid, _destination_tmp, _source_tmp, (unsigned long)size, _i/nb_elements, _i, copy_loops ); + } + // if (_i / nb_elements ==1 && tid == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + // } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + *_destination_tmp = *_source_tmp; +#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + _destination_tmp += num_threads; + } +} \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu new file mode 100644 index 00000000000..3b04bf025e8 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -0,0 +1,196 @@ +#include "opal_datatype_cuda_internal.cuh" +#include "opal_datatype_cuda.cuh" + +#include + +int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i; + dt_elem_desc_t* description; + const opal_datatype_t *pData = pConvertor->pDesc; + uint32_t tasks_per_block, num_blocks; + dt_stack_t* pStack; + + description = pConvertor->use_desc->desc; + + cuda_desc_h->stack_pos = pConvertor->stack_pos; +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + cuda_desc_h->pBaseBuf = pConvertor->pBaseBuf; +#else + cuda_desc_h->pBaseBuf = pBaseBuf_GPU; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + cuda_desc_h->lb = pData->lb; + cuda_desc_h->ub = pData->ub; + cuda_desc_h->out_size = *out_size; + cuda_desc_h->max_data = 
*max_data; + cuda_desc_h->bConverted = pConvertor->bConverted; + cuda_desc_h->local_size = pConvertor->local_size; + cuda_desc_h->stack_size = pConvertor->stack_size; + + for (i = 0; i < pConvertor->stack_size; i++) { + cuda_desc_h->pStack[i] = pConvertor->pStack[i]; + } + if (cuda_desc_h->description_max_count != 0) { + if (cuda_desc_h->description_max_count >= (pConvertor->use_desc->used+1)) { + cuda_desc_h->description_count = pConvertor->use_desc->used+1; + } else { + cudaFree(cuda_desc_h->description); + cuda_desc_h->description = NULL; + cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); + cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; + cuda_desc_h->description_count = pConvertor->use_desc->used+1; + } + + } else { + cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); + cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; + cuda_desc_h->description_count = pConvertor->use_desc->used+1; + } + cudaMemcpy(cuda_desc_h->description, description, sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1), cudaMemcpyHostToDevice); + + // for (i = 0; i < pConvertor->use_desc->used+1; i++) { + // cuda_desc_h->description[i] = description[i]; + // } + + DBGPRINT("stack_size %d\n", pConvertor->stack_size); + + DBGPRINT("flags %d, types %d, count %d\n", description->elem.common.flags, description->elem.common.type, description->elem.count); + + for (i = 0; i < *out_size; i++) { +#if defined (OPAL_DATATYPE_CUDA_DRY_RUN) + cuda_desc_h->iov[i].iov_base = iov[i].iov_base; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + cuda_desc_h->iov[i].iov_len = iov[i].iov_len; + } + + cudaMemcpy(cuda_desc_d, cuda_desc_h, sizeof(ddt_cuda_desc_t), cudaMemcpyHostToDevice); + + pStack = pConvertor->pStack + pConvertor->stack_pos; + tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + num_blocks = ((uint32_t)pStack->count + tasks_per_block - 1) / 
tasks_per_block; + printf("launch kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*2*THREAD_PER_BLOCK); + opal_generic_simple_pack_cuda_kernel<<<192,4*THREAD_PER_BLOCK>>>(cuda_desc_d); +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + size_t position = pConvertor->pDesc->size; + opal_convertor_set_position_nocheck(pConvertor, &position); +#endif + cudaDeviceSynchronize(); + +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + return -99; +#else + // /* copy stack and description data back to CPU */ + // cudaMemcpy(cuda_desc_h, cuda_desc_d, sizeof(ddt_cuda_desc_t), cudaMemcpyDeviceToHost); + // + // for (i = 0; i < pConvertor->stack_size; i++) { + // pConvertor->pStack[i] = cuda_desc_h->pStack[i]; + // } + // + // pConvertor->stack_pos = cuda_desc_h->stack_pos; + // *out_size = cuda_desc_h->out_size; + // *max_data = cuda_desc_h->max_data; + // pConvertor->bConverted = cuda_desc_h->bConverted; + // pConvertor->local_size = cuda_desc_h->local_size; + // + // for (i = 0; i < *out_size; i++) { + // iov[i].iov_len = cuda_desc_h->iov[i].iov_len; + // } + // + if( pConvertor->bConverted == pConvertor->local_size ) { + // pConvertor->flags |= CONVERTOR_COMPLETED; + return 1; + } + + return 0; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + +} + +void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _source = (*SOURCE) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _destination = *(DESTINATION); + + printf("I am in pack_contiguous_loop_cuda\n"); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + _source = pBaseBuf_GPU; + 
_destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; +#endif + + tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaDeviceSynchronize(); +} + + +void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + uint32_t _copy_count = *(COUNT); + size_t _copy_blength; + ddt_elem_desc_t* _elem = &((ELEM)->elem); + unsigned char* _source = (*SOURCE) + _elem->disp; + uint32_t num_blocks, tasks_per_block; + unsigned char* _destination = *(DESTINATION); + + _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; + if( (_copy_count * _copy_blength) > *(SPACE) ) { + _copy_count = (uint32_t)(*(SPACE) / _copy_blength); + if( 0 == _copy_count ) return; /* nothing to do */ + } + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + _source = pBaseBuf_GPU; + _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; +#endif + + tasks_per_block = THREAD_PER_BLOCK*4; + num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + + DBGPRINT("num_blocks %d, thread %d\n", num_blocks, tasks_per_block); + DBGPRINT( "GPU pack 1. 
memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); + + pack_contiguous_loop_cuda_kernel_global<<<1, THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + _copy_blength *= _copy_count; + *(SOURCE) = _source + _elem->extent*_copy_count - _elem->disp; + *(DESTINATION) += _copy_blength; + *(SPACE) -= _copy_blength; + *(COUNT) -= _copy_count; +#endif + + pBaseBuf_GPU += _elem->extent*_copy_count; + cuda_desc_h->iov[0].iov_base = (unsigned char*)cuda_desc_h->iov[0].iov_base + _copy_blength; + // cudaDeviceSynchronize(); +} + diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu new file mode 100644 index 00000000000..f59b2bb0e00 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -0,0 +1,288 @@ +#include "opal_datatype_cuda_internal.cuh" +#include +#include + +__device__ void unpack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _dst_disp = (*DESTINATION) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t _i, tid, num_threads; + unsigned char* _source = *SOURCE; +// unsigned char* _source = _src_disp; + uint32_t gap, nb_elements; + double *_source_tmp, *_destination_tmp, *_dst_disp_tmp;; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + num_threads = gridDim.x * blockDim.x; + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / 
_end_loop->size); + + gap = (_loop->extent - _end_loop->size) / 8; + nb_elements = _end_loop->size / 8; + _dst_disp_tmp = (double*)_dst_disp; + _source_tmp = (double*)_source; + _destination_tmp = _dst_disp_tmp + tid; + _source_tmp += tid; + + __syncthreads(); + for (_i = tid; _i < _copy_loops*nb_elements; _i+=num_threads) { + _destination_tmp = _dst_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (_i % nb_elements == 0 ) { + DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + } + // if (_i / nb_elements ==1 && tid == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + // } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + *_destination_tmp = *_source_tmp; +#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ + _source_tmp += num_threads; +// _source_tmp += num_threads; + + } + *(DESTINATION) = _dst_disp + _copy_loops*_loop->extent - _end_loop->first_elem_disp; + *(SOURCE) = *(SOURCE) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; + + __syncthreads(); +} + +__global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) +{ + dt_stack_t* pStack, *pStack_head; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_unpacked = 0; /* total size unpacked this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; + size_t iov_len_local; + uint32_t iov_count; + uint32_t stack_pos; + struct iovec* iov; + + OPAL_PTRDIFF_TYPE lb; + OPAL_PTRDIFF_TYPE ub; + uint32_t out_size; + uint32_t tid; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + + __shared__ ddt_cuda_desc_t cuda_desc_b; + + if (threadIdx.x == 0) { + memcpy(&cuda_desc_b, cuda_desc, sizeof(ddt_cuda_desc_t)); + } + __syncthreads(); + + // load cuda descriptor from constant memory + iov = cuda_desc_b.iov; + pStack_head = cuda_desc_b.pStack; + pStack = pStack_head; + description = cuda_desc_b.description; + stack_pos = cuda_desc_b.stack_pos; + pBaseBuf = cuda_desc_b.pBaseBuf; + lb = cuda_desc_b.lb; + ub = cuda_desc_b.ub; + out_size = cuda_desc_b.out_size; + + /* For the first step we have to add both displacement to the source. After in the + * main while loop we will set back the source_base to the correct value. 
This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ + pStack = pStack + stack_pos; + pos_desc = pStack->index; + conv_ptr = pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + pStack--; + stack_pos--; + pElem = &(description[pos_desc]); + + + for( iov_count = 0; iov_count < out_size; iov_count++ ) { + iov_ptr = (unsigned char *) iov[iov_count].iov_base; + iov_len_local = iov[iov_count].iov_len; + // if( 0 != pConvertor->partial_length ) { + // size_t element_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; + // size_t missing_length = element_length - pConvertor->partial_length; + // + // assert( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ); + // COMPUTE_CSUM( iov_ptr, missing_length, pConvertor ); + // opal_unpack_partial_datatype( pConvertor, pElem, + // iov_ptr, + // pConvertor->partial_length, element_length - pConvertor->partial_length, + // &conv_ptr ); + // --count_desc; + // if( 0 == count_desc ) { + // conv_ptr = pConvertor->pBaseBuf + pStack->disp; + // pos_desc++; /* advance to the next data */ + // UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + // } + // iov_ptr += missing_length; + // iov_len_local -= missing_length; + // pConvertor->partial_length = 0; /* nothing more inside */ + // } + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + // UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + // iov_ptr, conv_ptr, iov_len_local ); + if( 0 == count_desc ) { /* completed */ + conv_ptr = pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + // assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); + if( 0 != iov_len_local ) { + unsigned char* temp = conv_ptr; + /* We have some partial data here. 
Let's copy it into the convertor + * and keep it hot until the next round. + */ + // assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size ); + // COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor ); + // + // opal_unpack_partial_datatype( pConvertor, pElem, + // iov_ptr, 0, iov_len_local, + // &temp ); + // + // pConvertor->partial_length = (uint32_t)iov_len_local; + iov_len_local = 0; + } + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + // DO_DEBUG( opal_output( 0, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + // (int)pStack->count, pConvertor->stack_pos, pos_desc, + // (long)pStack->disp, (unsigned long)iov_len_local ); ); + if (threadIdx.x == 0) { + (pStack->count)--; + } + __syncthreads(); + + if( pStack->count == 0 ) { /* end of loop */ + if( 0 == stack_pos ) { + /* Do the same thing as when the loop is completed */ + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_unpacked += iov[iov_count].iov_len; + iov_count++; /* go to the next */ + goto complete_conversion; + } + stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if (threadIdx.x == 0) { + if( pStack->index == -1 ) { + pStack->disp += (ub - lb); + } else { + //assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + __syncthreads(); + } + conv_ptr = pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + // DO_DEBUG( opal_output( 0, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + // (int)pStack->count, pConvertor->stack_pos, pos_desc, + // (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & 
OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + unpack_contiguous_loop_cuda_kernel( pElem, &count_desc, + &iov_ptr, &conv_ptr, &iov_len_local ); + count_desc = 0; + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. */ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + conv_ptr = pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); + continue; + } + } + complete_loop: + if (threadIdx.x == 0) { + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + } + __syncthreads(); + total_unpacked += iov[iov_count].iov_len; + } + complete_conversion: + if (tid == 0) { + cuda_desc->max_data = total_unpacked; + // pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ + cuda_desc->out_size = iov_count; + // if( pConvertor->bConverted == pConvertor->remote_size ) { + // pConvertor->flags |= CONVERTOR_COMPLETED; + // return 1; + // } + // /* Save the global position for the next round */ + // PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_UINT1, count_desc, + // conv_ptr - pConvertor->pBaseBuf ); + // DO_DEBUG( opal_output( 0, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + // pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + } +} + +__global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ) +{ + uint32_t _i, tid, num_threads; + uint32_t gap, nb_elements; + double *_source_tmp, *_destination_tmp, 
*_dst_disp_tmp;; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + num_threads = gridDim.x * blockDim.x; + + gap = (extent - size) / 8; + nb_elements = size / 8; + _dst_disp_tmp = (double*)destination; + _source_tmp = (double*)source; + _destination_tmp = _dst_disp_tmp + tid; + _source_tmp += tid; + + for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { + _destination_tmp = _dst_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (_i % nb_elements == 0 ) { + DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => _i %d, actual _i %d\n", + tid, _destination_tmp, _source_tmp, (unsigned long)size, _i/nb_elements, _i ); + } + // if (_i / nb_elements ==1 && tid == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + // } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + *_destination_tmp = *_source_tmp; +#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ + _source_tmp += num_threads; + } +} \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu new file mode 100644 index 00000000000..7181f3cd362 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -0,0 +1,123 @@ +#include "opal_datatype_cuda_internal.cuh" +#include "opal_datatype_cuda.cuh" + +#include + +int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i; + dt_elem_desc_t* description; + const opal_datatype_t *pData = pConvertor->pDesc; + uint32_t tasks_per_block, num_blocks; + dt_stack_t* pStack; + + description = pConvertor->use_desc->desc; + + cuda_desc_h->stack_pos = pConvertor->stack_pos; +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + cuda_desc_h->pBaseBuf = pConvertor->pBaseBuf; +#else + cuda_desc_h->pBaseBuf = pBaseBuf_GPU; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + cuda_desc_h->lb = pData->lb; + cuda_desc_h->ub = pData->ub; + cuda_desc_h->out_size = *out_size; + cuda_desc_h->max_data = *max_data; + cuda_desc_h->bConverted = pConvertor->bConverted; + cuda_desc_h->local_size = pConvertor->local_size; + cuda_desc_h->stack_size = pConvertor->stack_size; + + for (i = 0; i < pConvertor->stack_size; i++) { + cuda_desc_h->pStack[i] = pConvertor->pStack[i]; + } + for (i = 0; i < pConvertor->use_desc->used+1; i++) { + cuda_desc_h->description[i] = description[i]; + } + + DBGPRINT("stack_size %d\n", pConvertor->stack_size); + + DBGPRINT("flags %d, types %d, count %d\n", description->elem.common.flags, description->elem.common.type, description->elem.count); + + for (i = 0; i < *out_size; i++) { +#if defined (OPAL_DATATYPE_CUDA_DRY_RUN) + cuda_desc_h->iov[i].iov_base = iov[i].iov_base; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + cuda_desc_h->iov[i].iov_len = iov[i].iov_len; + } + + cudaMemcpy(cuda_desc_d, 
cuda_desc_h, sizeof(ddt_cuda_desc_t), cudaMemcpyHostToDevice); + + pStack = pConvertor->pStack + pConvertor->stack_pos; + tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + num_blocks = ((uint32_t)pStack->count + tasks_per_block - 1) / tasks_per_block; + printf("launch kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*4*THREAD_PER_BLOCK); + opal_generic_simple_unpack_cuda_kernel<<<2*num_blocks,2*THREAD_PER_BLOCK>>>(cuda_desc_d); +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + size_t position = pConvertor->pDesc->size; + opal_convertor_set_position_nocheck(pConvertor, &position); +#endif + cudaDeviceSynchronize(); + +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + return -99; +#else + // /* copy stack and description data back to CPU */ + // cudaMemcpy(cuda_desc_h, cuda_desc_d, sizeof(ddt_cuda_desc_t), cudaMemcpyDeviceToHost); + // + // for (i = 0; i < pConvertor->stack_size; i++) { + // pConvertor->pStack[i] = cuda_desc_h->pStack[i]; + // } + // + // pConvertor->stack_pos = cuda_desc_h->stack_pos; + // *out_size = cuda_desc_h->out_size; + // *max_data = cuda_desc_h->max_data; + // pConvertor->bConverted = cuda_desc_h->bConverted; + // pConvertor->local_size = cuda_desc_h->local_size; + // + // for (i = 0; i < *out_size; i++) { + // iov[i].iov_len = cuda_desc_h->iov[i].iov_len; + // } + // + if( pConvertor->bConverted == pConvertor->local_size ) { + // pConvertor->flags |= CONVERTOR_COMPLETED; + return 1; + } + + return 0; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ +} + +void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _destination = (*DESTINATION) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* 
_source = *(SOURCE); + + printf("I am in unpack_contiguous_loop_cuda\n"); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + + _destination = pBaseBuf_GPU; + _source = (unsigned char*)cuda_desc_h->iov[0].iov_base; + + tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + + *(DESTINATION) = _destination - _end_loop->first_elem_disp; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; + + cudaDeviceSynchronize(); +} \ No newline at end of file diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c new file mode 100644 index 00000000000..e77a4f77325 --- /dev/null +++ b/opal/datatype/opal_datatype_gpu.c @@ -0,0 +1,167 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2006 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include +#include + +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + +#if OPAL_ENABLE_DEBUG +#include "opal/util/output.h" + +#define DO_DEBUG(INST) if( opal_pack_debug ) { INST } +#else +#define DO_DEBUG(INST) +#endif /* OPAL_ENABLE_DEBUG */ + +#include "opal/datatype/opal_datatype_gpu.h" + +static void *opal_datatype_cuda_handle = NULL; + +void (*opal_datatype_cuda_init_p)(void) = NULL; + +void (*opal_datatype_cuda_fini_p)(void) = NULL; + +int32_t (*opal_generic_simple_pack_function_cuda_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) = NULL; + +int32_t (*opal_generic_simple_unpack_function_cuda_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) = NULL; + +void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) = NULL; + +void (*unpack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) = NULL; + +void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) = NULL; + +void (*opal_cuda_sync_device_p)(void) = NULL; + +int32_t opal_datatype_gpu_init(void) +{ + char *error; + char *lib = "/home/wwu12/ompi/ompi-cuda/opal/datatype/cuda/opal_datatype_cuda.so"; + + if (opal_datatype_cuda_handle == NULL) { + opal_datatype_cuda_handle = dlopen(lib, RTLD_LAZY); + if (!opal_datatype_cuda_handle) { + fprintf(stderr, "%s\n", dlerror()); + opal_datatype_cuda_handle = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_datatype_cuda_init_p) = dlsym(opal_datatype_cuda_handle, "opal_datatype_cuda_init"); + if ((error = dlerror()) != NULL) { + 
fprintf(stderr, "opal_datatype_cuda_init error: %s\n", error); + opal_datatype_cuda_init_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_datatype_cuda_fini_p) = dlsym(opal_datatype_cuda_handle, "opal_datatype_cuda_fini"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_datatype_cuda_fini error: %s\n", error); + opal_datatype_cuda_fini_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_generic_simple_pack_function_cuda_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_pack_function_cuda"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_generic_simple_pack_function_cuda error: %s\n", error); + opal_generic_simple_pack_function_cuda_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_generic_simple_unpack_function_cuda_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_unpack_function_cuda"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_generic_simple_unpack_function_cuda error: %s\n", error); + opal_generic_simple_unpack_function_cuda_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&pack_contiguous_loop_cuda_p) = dlsym(opal_datatype_cuda_handle, "pack_contiguous_loop_cuda"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "pack_contiguous_loop_cuda error: %s\n", error); + pack_contiguous_loop_cuda_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&unpack_contiguous_loop_cuda_p) = dlsym(opal_datatype_cuda_handle, "unpack_contiguous_loop_cuda"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "unpack_contiguous_loop_cuda error: %s\n", error); + unpack_contiguous_loop_cuda_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&pack_predefined_data_cuda_p) = dlsym(opal_datatype_cuda_handle, "pack_predefined_data_cuda"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "pack_predefined_data_cuda error: %s\n", error); + pack_predefined_data_cuda_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_cuda_sync_device_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_sync_device"); 
+ if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_cuda_sync_device error: %s\n", error); + opal_cuda_sync_device_p = NULL; + return OPAL_ERROR; + } + + (*opal_datatype_cuda_init_p)(); + printf("cuda init done\n"); + } + return OPAL_SUCCESS; +} +int32_t opal_datatype_gpu_fini(void) +{ + if (opal_datatype_cuda_handle != NULL) { + (*opal_datatype_cuda_fini_p)(); + dlclose(opal_datatype_cuda_handle); + opal_datatype_cuda_handle = NULL; + opal_datatype_cuda_init_p = NULL; + opal_datatype_cuda_fini_p = NULL; + opal_generic_simple_pack_function_cuda_p = NULL; + opal_generic_simple_unpack_function_cuda_p = NULL; + pack_contiguous_loop_cuda_p = NULL; + unpack_contiguous_loop_cuda_p = NULL; + pack_predefined_data_cuda_p = NULL; + opal_cuda_sync_device_p = NULL; + printf("cuda fini done\n"); + } + return OPAL_SUCCESS; +} diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h new file mode 100644 index 00000000000..385d7cdb73c --- /dev/null +++ b/opal/datatype/opal_datatype_gpu.h @@ -0,0 +1,40 @@ +#ifndef OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED +#define OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED + +int32_t opal_datatype_gpu_init(void); +int32_t opal_datatype_gpu_fini(void); + +extern void (*opal_datatype_cuda_init_p)(void); + +extern void (*opal_datatype_cuda_fini_p)(void); + +extern int32_t (*opal_generic_simple_pack_function_cuda_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +extern int32_t (*opal_generic_simple_unpack_function_cuda_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +extern void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +extern void (*unpack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +extern void 
(*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +extern void (*opal_cuda_sync_device_p)(void); +#endif /* OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 7de8fae5b08..520105d8de9 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -33,6 +33,7 @@ #include "opal/datatype/opal_datatype.h" #include "opal/datatype/opal_convertor_internal.h" #include "opal/mca/base/mca_base_var.h" +#include "opal/datatype/opal_datatype_gpu.h" /* by default the debuging is turned off */ int opal_datatype_dfd = -1; @@ -225,6 +226,12 @@ int32_t opal_datatype_init( void ) datatype->desc.desc[1].end_loop.first_elem_disp = datatype->desc.desc[0].elem.disp; datatype->desc.desc[1].end_loop.size = datatype->size; } + +#if defined (OPAL_DATATYPE_CUDA) + if (opal_datatype_gpu_init() != OPAL_SUCCESS) { + opal_datatype_gpu_fini(); + } +#endif /* defined OPAL_DATATYPE_CUDA */ return OPAL_SUCCESS; } @@ -248,6 +255,10 @@ int32_t opal_datatype_finalize( void ) /* clear all master convertors */ opal_convertor_destroy_masters(); +#if defined (OPAL_DATATYPE_CUDA) + opal_datatype_gpu_fini(); +#endif /* defined OPAL_DATATYPE_CUDA */ + return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 45f1213b811..9dc0666eb4e 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -37,6 +37,7 @@ #include "opal/datatype/opal_datatype_checksum.h" #include "opal/datatype/opal_datatype_pack.h" #include "opal/datatype/opal_datatype_prototypes.h" +#include "opal/datatype/opal_datatype_gpu.h" #if defined(CHECKSUM) #define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig_checksum @@ -287,6 +288,13 @@ opal_generic_simple_pack_function( opal_convertor_t* 
pConvertor, (void*)pConvertor, (void*)pConvertor->pBaseBuf, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); + if (opal_generic_simple_pack_function_cuda_p != NULL) { + int32_t rvalue = (*opal_generic_simple_pack_function_cuda_p)( pConvertor, iov, out_size, max_data); + if (rvalue != -99) { /* -99 is DRY RUN, to verify the result with CPU packing*/ + return rvalue; + } + } + description = pConvertor->use_desc->desc; /* For the first step we have to add both displacement to the source. After in the @@ -312,8 +320,9 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ - PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - conv_ptr, iov_ptr, iov_len_local ); + (*pack_predefined_data_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + // conv_ptr, iov_ptr, iov_len_local ); if( 0 == count_desc ) { /* completed */ conv_ptr = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ @@ -356,8 +365,9 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - PACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, - conv_ptr, iov_ptr, iov_len_local ); + (*pack_contiguous_loop_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + //PACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, + // conv_ptr, iov_ptr, iov_len_local ); if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -379,6 +389,7 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_packed += iov[iov_count].iov_len; } + 
(*opal_cuda_sync_device_p)(); *max_data = total_packed; pConvertor->bConverted += total_packed; /* update the already converted bytes */ *out_size = iov_count; diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h index c02ecf86ec5..b011f434472 100644 --- a/opal/datatype/opal_datatype_pack.h +++ b/opal/datatype/opal_datatype_pack.h @@ -51,6 +51,8 @@ static inline void pack_predefined_data( opal_convertor_t* CONVERTOR, (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu\n", *(DESTINATION), _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); ); + printf("pack 1. memcpy( %p, %p, %lu ) => space %lu\n", + *(DESTINATION), _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); MEMCPY_CSUM( *(DESTINATION), _source, _copy_blength, (CONVERTOR) ); _source += _copy_blength; *(DESTINATION) += _copy_blength; diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index e5c05e14e2d..f2c57593bcc 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -27,6 +27,7 @@ #include "opal/datatype/opal_convertor_internal.h" #include "opal/datatype/opal_datatype_internal.h" +#include "opal/datatype/opal_datatype_gpu.h" #if OPAL_ENABLE_DEBUG #include "opal/util/output.h" @@ -275,6 +276,13 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, DO_DEBUG( opal_output( 0, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u )\n", (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); +// if (opal_generic_simple_unpack_function_cuda_p != NULL) { +// int32_t rvalue = (*opal_generic_simple_unpack_function_cuda_p)( pConvertor, iov, out_size, max_data); +// if (rvalue != -99) { /* -99 is DRY RUN, to verify the result with CPU packing*/ +// return rvalue; +// } +// } + description = pConvertor->use_desc->desc; /* For the first step we have to add both displacement 
to the source. After in the @@ -379,8 +387,9 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - UNPACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, - iov_ptr, conv_ptr, iov_len_local ); + // UNPACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, + // iov_ptr, conv_ptr, iov_len_local ); + (*unpack_contiguous_loop_cuda_p)(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; diff --git a/opal/include/opal_config_top.h b/opal/include/opal_config_top.h index 1ce5267c389..2f5ad1adec2 100644 --- a/opal/include/opal_config_top.h +++ b/opal/include/opal_config_top.h @@ -19,6 +19,8 @@ #error "opal_config_top.h should only be included from opal_config.h" #endif +#define OPAL_DATATYPE_CUDA + /* The only purpose of this file is to undef the PACKAGE_ macros that are put in by autoconf/automake projects. Specifically, if you include a .h file from another project that defines these diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index 0afac9b49ec..12b4b31fc15 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -341,7 +341,7 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk int main( int argc, char* argv[] ) { ompi_datatype_t *pdt, *pdt1, *pdt2, *pdt3; - int rc, length = 500; + int rc, length = 500, i; opal_init_util(&argc, &argv); ompi_datatype_init(); @@ -350,7 +350,7 @@ int main( int argc, char* argv[] ) * By default simulate homogeneous architectures. 
*/ remote_arch = opal_local_arch; - printf( "\n\n#\n * TEST INVERSED VECTOR\n #\n\n" ); +/* printf( "\n\n#\n * TEST INVERSED VECTOR\n #\n\n" ); pdt = create_inversed_vector( &ompi_mpi_int.dt, 10 ); if( outputFlags & CHECK_PACK_UNPACK ) { local_copy_ddt_count(pdt, 100); @@ -364,15 +364,17 @@ int main( int argc, char* argv[] ) local_copy_with_convertor(pdt, 1, 956); } OBJ_RELEASE( pdt ); assert( pdt == NULL ); - +*/ printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); - pdt = upper_matrix(100); + pdt = upper_matrix(4000); if( outputFlags & CHECK_PACK_UNPACK ) { - local_copy_ddt_count(pdt, 1); - local_copy_with_convertor(pdt, 1, 48); + for (i = 1; i <= 4; i++) { +// local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor(pdt, 1, 1024*1024*200); + } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); - + /* mpich_typeub(); mpich_typeub2(); mpich_typeub3(); @@ -476,26 +478,104 @@ int main( int argc, char* argv[] ) local_copy_with_convertor( pdt, 4500, 12 ); local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 12 ); OBJ_RELEASE( pdt ); assert( pdt == NULL ); + }*/ + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (4000 times 512 double stride 640)\n" ); +#if 0 + pdt = create_vector_type( MPI_DOUBLE, 4000, 512, 640 ); + opal_datatype_resize(&pdt->super, 0, pdt->super.size+sizeof(MPI_DOUBLE)); + ompi_datatype_create_contiguous( 1, pdt, &pdt1 ); +#else + pdt = create_vector_type( MPI_DOUBLE, 4000, 512, 640 ); + // opal_datatype_resize(&pdt->super, 0, pdt->super.size+sizeof(MPI_DOUBLE)); + // ompi_datatype_create_contiguous( 4000, pdt, &pdt1 ); +#endif +// ompi_datatype_dump( pdt ); + // ompi_datatype_commit(&pdt1); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*30 ); + } } printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (4000 times 384 double stride 512)\n" ); + pdt = 
create_vector_type( MPI_DOUBLE, 4000, 384, 512 ); + opal_datatype_resize(&pdt->super, 0, pdt->super.size+sizeof(MPI_DOUBLE)); + ompi_datatype_create_contiguous( 1, pdt, &pdt1 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 ); + } + } printf( ">>--------------------------------------------<<\n" ); - printf( "Vector data-type (450 times 10 double stride 11)\n" ); - pdt = create_vector_type( MPI_DOUBLE, 450, 10, 11 ); - ompi_datatype_dump( pdt ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( "Vector data-type (4000 times 256 double stride 384)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 4000, 256, 384 ); +// ompi_datatype_dump( pdt ); if( outputFlags & CHECK_PACK_UNPACK ) { - local_copy_ddt_count(pdt, 1); - local_copy_with_convertor( pdt, 1, 12 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); - local_copy_with_convertor( pdt, 1, 82 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); - local_copy_with_convertor( pdt, 1, 6000 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); - local_copy_with_convertor( pdt, 1, 36000 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 36000 ); + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // 
local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*10 ); + } } printf( ">>--------------------------------------------<<\n" ); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - + + printf( "Vector data-type (4000 times 128 double stride 256)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 4000, 128, 256 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*5 ); + } + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( "Vector data-type (2000 times 3 double stride 4)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 2000, 3, 4 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*4 ); + } + } + printf( ">>--------------------------------------------<<\n" ); + 
OBJ_RELEASE( pdt ); assert( pdt == NULL ); + /* printf( ">>--------------------------------------------<<\n" ); pdt = test_struct_char_double(); if( outputFlags & CHECK_PACK_UNPACK ) { @@ -541,7 +621,7 @@ int main( int argc, char* argv[] ) printf( ">>--------------------------------------------<<\n" ); OBJ_RELEASE( pdt1 ); assert( pdt1 == NULL ); OBJ_RELEASE( pdt2 ); assert( pdt2 == NULL ); - +*/ /* clean-ups all data allocations */ ompi_datatype_finalize(); From e3463fa0da86043b6ff9259dd0d2d8243184e689 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 14 Nov 2014 14:03:34 -0500 Subject: [PATCH 002/190] indexed datatype new, bonus stask support. Add support for iovec and for pipeline iovec. a new way to compute nb_block and thread_per_block Conflicts: test/datatype/Makefile.am --- opal/datatype/cuda/Makefile | 2 +- opal/datatype/cuda/opal_config.h | 2792 +++++++++++++++++ opal/datatype/cuda/opal_datatype_cuda.cu | 117 +- opal/datatype/cuda/opal_datatype_cuda.cuh | 10 + .../cuda/opal_datatype_cuda_internal.cuh | 383 +-- .../cuda/opal_datatype_orig_internal.h | 646 ++++ .../cuda/opal_datatype_pack_cuda_kernel.cu | 518 +-- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 437 ++- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 67 +- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 252 +- opal/datatype/opal_convertor.c | 23 +- opal/datatype/opal_datatype_cuda.c | 1 + opal/datatype/opal_datatype_gpu.c | 26 + opal/datatype/opal_datatype_gpu.h | 20 +- opal/datatype/opal_datatype_module.c | 6 - opal/datatype/opal_datatype_pack.c | 44 +- opal/datatype/opal_datatype_pack.h | 2 - opal/datatype/opal_datatype_prototypes.h | 16 + opal/datatype/opal_datatype_unpack.c | 39 +- test/datatype/Makefile.am | 13 +- test/datatype/ddt_lib.c | 33 +- test/datatype/ddt_lib.h | 7 +- test/datatype/ddt_test.c | 477 ++- 23 files changed, 5310 insertions(+), 621 deletions(-) create mode 100644 opal/datatype/cuda/opal_config.h create mode 100644 opal/datatype/cuda/opal_datatype_orig_internal.h diff 
--git a/opal/datatype/cuda/Makefile b/opal/datatype/cuda/Makefile index d42ab556fae..6be10afd0fd 100644 --- a/opal/datatype/cuda/Makefile +++ b/opal/datatype/cuda/Makefile @@ -5,7 +5,7 @@ ARCHFLAGS = cr RANLIB = ranlib STLIB ?= opal_datatype_cuda.a DYLIB ?= opal_datatype_cuda.so -CFLAGS = -g -G -O0 +CFLAGS = -g -G -O0 EXTLIB = -L/home/wwu12/ompi/ompi-cuda/opal/datatype/.libs -ldatatype INC = diff --git a/opal/datatype/cuda/opal_config.h b/opal/datatype/cuda/opal_config.h new file mode 100644 index 00000000000..19fa55f52ed --- /dev/null +++ b/opal/datatype/cuda/opal_config.h @@ -0,0 +1,2792 @@ +/* opal/include/opal_config.h. Generated from opal_config.h.in by configure. */ +/* opal/include/opal_config.h.in. Generated from configure.ac by autoheader. */ + +/* -*- c -*- + * + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2014 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * Function: - OS, CPU and compiler dependent configuration + */ + +#ifndef OPAL_CONFIG_H +#define OPAL_CONFIG_H + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* enable openib BTL failover */ +#define BTL_OPENIB_FAILOVER_ENABLED 0 + +/* Whether the openib BTL malloc hooks are enabled */ +#define BTL_OPENIB_MALLOC_HOOKS_ENABLED 1 + +/* rdmacm without IB_AF addressing support */ +/* #undef BTL_OPENIB_RDMACM_IB_ADDR */ + +/* BLCR cr_request_file check */ +/* #undef CRS_BLCR_HAVE_CR_REQUEST */ + +/* BLCR cr_request_checkpoint check */ +/* #undef CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT */ + +/* BLCRs cr_checkpoint_info.requester member availability */ +/* #undef CRS_BLCR_HAVE_INFO_REQUESTER */ + +/* Version of event */ +/* #undef EVENT_EXTERNAL_EVENT_VERSION */ + +/* Define to 1 if you have the header file. */ +#define HAVE_AIO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_ALPS_APINFO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_INET_H 1 + +/* Define to 1 if you have the `asprintf' function. */ +#define HAVE_ASPRINTF 1 + +/* Define to 1 if the system has the type `CACHE_DESCRIPTOR'. */ +/* #undef HAVE_CACHE_DESCRIPTOR */ + +/* Define to 1 if the system has the type `CACHE_RELATIONSHIP'. */ +/* #undef HAVE_CACHE_RELATIONSHIP */ + +/* Define to 1 if you have the `clz' function. */ +/* #undef HAVE_CLZ */ + +/* Define to 1 if you have the `clzl' function. */ +/* #undef HAVE_CLZL */ + +/* Define to 1 if you have the header file. */ +#define HAVE_CL_CL_EXT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_COMPLEX_H 1 + +/* Define to 1 if you have the `cpuset_setaffinity' function. */ +/* #undef HAVE_CPUSET_SETAFFINITY */ + +/* Define to 1 if you have the `cpuset_setid' function. 
*/ +/* #undef HAVE_CPUSET_SETID */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CRIU_CRIU_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CRT_EXTERNS_H */ + +/* Define to 1 if we have -lcuda */ +/* #undef HAVE_CUDA */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CUDA_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CUDA_RUNTIME_API_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CURL_CURL_H */ + +/* Define to 1 if you have the `dbm_open' function. */ +/* #undef HAVE_DBM_OPEN */ + +/* Define to 1 if you have the `dbopen' function. */ +/* #undef HAVE_DBOPEN */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_DB_H */ + +/* Define to 1 if you have the declaration of `AF_INET6', and to 0 if you + don't. */ +#define HAVE_DECL_AF_INET6 1 + +/* Define to 1 if you have the declaration of `AF_UNSPEC', and to 0 if you + don't. */ +#define HAVE_DECL_AF_UNSPEC 1 + +/* Define to 1 if you have the declaration of `CL_DEVICE_TOPOLOGY_AMD', and to + 0 if you don't. */ +#define HAVE_DECL_CL_DEVICE_TOPOLOGY_AMD 0 + +/* Define to 1 if you have the declaration of `CTL_HW', and to 0 if you don't. + */ +#define HAVE_DECL_CTL_HW 0 + +/* Define to 1 if you have the declaration of `fabsf', and to 0 if you don't. + */ +#define HAVE_DECL_FABSF 1 + +/* Define to 1 if you have the declaration of `HW_NCPU', and to 0 if you + don't. */ +#define HAVE_DECL_HW_NCPU 0 + +/* Define to 1 if you have the declaration of `HZ', and to 0 if you don't. */ +#define HAVE_DECL_HZ 1 + +/* Define to 1 if you have the declaration of `IBV_ACCESS_ALLOCATE_MR', and to + 0 if you don't. */ +/* #undef HAVE_DECL_IBV_ACCESS_ALLOCATE_MR */ + +/* Define to 1 if you have the declaration of + `IBV_ACCESS_SHARED_MR_USER_READ', and to 0 if you don't. */ +/* #undef HAVE_DECL_IBV_ACCESS_SHARED_MR_USER_READ */ + +/* Define to 1 if you have the declaration of `IBV_ACCESS_SO', and to 0 if you + don't. 
*/ +/* #undef HAVE_DECL_IBV_ACCESS_SO */ + +/* Define to 1 if you have the declaration of `IBV_EVENT_CLIENT_REREGISTER', + and to 0 if you don't. */ +/* #undef HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER */ + +/* Define to 1 if you have the declaration of `IBV_EVENT_GID_CHANGE', and to 0 + if you don't. */ +/* #undef HAVE_DECL_IBV_EVENT_GID_CHANGE */ + +/* Define to 1 if you have the declaration of `ibv_event_type_str', and to 0 + if you don't. */ +/* #undef HAVE_DECL_IBV_EVENT_TYPE_STR */ + +/* Define to 1 if you have the declaration of `IBV_EXP_ACCESS_ALLOCATE_MR', + and to 0 if you don't. */ +/* #undef HAVE_DECL_IBV_EXP_ACCESS_ALLOCATE_MR */ + +/* Define to 1 if you have the declaration of + `IBV_EXP_ACCESS_SHARED_MR_USER_READ', and to 0 if you don't. */ +/* #undef HAVE_DECL_IBV_EXP_ACCESS_SHARED_MR_USER_READ */ + +/* Define to 1 if you have the declaration of `IBV_LINK_LAYER_ETHERNET', and + to 0 if you don't. */ +/* #undef HAVE_DECL_IBV_LINK_LAYER_ETHERNET */ + +/* Define to 1 if you have the declaration of `IBV_NODE_USNIC', and to 0 if + you don't. */ +/* #undef HAVE_DECL_IBV_NODE_USNIC */ + +/* Define to 1 if you have the declaration of `IBV_TRANSPORT_USNIC', and to 0 + if you don't. */ +/* #undef HAVE_DECL_IBV_TRANSPORT_USNIC */ + +/* Define to 1 if you have the declaration of `IBV_TRANSPORT_USNIC_UDP', and + to 0 if you don't. */ +/* #undef HAVE_DECL_IBV_TRANSPORT_USNIC_UDP */ + +/* Define to 1 if you have the declaration of + `nvmlDeviceGetMaxPcieLinkGeneration', and to 0 if you don't. */ +/* #undef HAVE_DECL_NVMLDEVICEGETMAXPCIELINKGENERATION */ + +/* Define to 1 if you have the declaration of `PCI_LOOKUP_NO_NUMBERS', and to + 0 if you don't. */ +/* #undef HAVE_DECL_PCI_LOOKUP_NO_NUMBERS */ + +/* Define to 1 if you have the declaration of `PF_INET6', and to 0 if you + don't. */ +#define HAVE_DECL_PF_INET6 1 + +/* Define to 1 if you have the declaration of `PF_UNSPEC', and to 0 if you + don't. 
*/ +#define HAVE_DECL_PF_UNSPEC 1 + +/* Define to 1 if you have the declaration of `pthread_getaffinity_np', and to + 0 if you don't. */ +#define HAVE_DECL_PTHREAD_GETAFFINITY_NP 1 + +/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to + 0 if you don't. */ +#define HAVE_DECL_PTHREAD_SETAFFINITY_NP 1 + +/* Define to 1 if you have the declaration of `RLIMIT_AS', and to 0 if you + don't. */ +#define HAVE_DECL_RLIMIT_AS 1 + +/* Define to 1 if you have the declaration of `RLIMIT_CORE', and to 0 if you + don't. */ +#define HAVE_DECL_RLIMIT_CORE 1 + +/* Define to 1 if you have the declaration of `RLIMIT_FSIZE', and to 0 if you + don't. */ +#define HAVE_DECL_RLIMIT_FSIZE 1 + +/* Define to 1 if you have the declaration of `RLIMIT_MEMLOCK', and to 0 if + you don't. */ +#define HAVE_DECL_RLIMIT_MEMLOCK 1 + +/* Define to 1 if you have the declaration of `RLIMIT_NOFILE', and to 0 if you + don't. */ +#define HAVE_DECL_RLIMIT_NOFILE 1 + +/* Define to 1 if you have the declaration of `RLIMIT_NPROC', and to 0 if you + don't. */ +#define HAVE_DECL_RLIMIT_NPROC 1 + +/* Define to 1 if you have the declaration of `RLIMIT_STACK', and to 0 if you + don't. */ +#define HAVE_DECL_RLIMIT_STACK 1 + +/* Define to 1 if you have the declaration of `sbrk', and to 0 if you don't. + */ +#define HAVE_DECL_SBRK 1 + +/* Define to 1 if you have the declaration of `strtoull', and to 0 if you + don't. */ +#define HAVE_DECL_STRTOULL 1 + +/* Define to 1 if you have the declaration of `_SC_LARGE_PAGESIZE', and to 0 + if you don't. */ +#define HAVE_DECL__SC_LARGE_PAGESIZE 0 + +/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_CONF', and to 0 + if you don't. */ +#define HAVE_DECL__SC_NPROCESSORS_CONF 1 + +/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_ONLN', and to 0 + if you don't. */ +#define HAVE_DECL__SC_NPROCESSORS_ONLN 1 + +/* Define to 1 if you have the declaration of `_SC_NPROC_CONF', and to 0 if + you don't. 
*/ +#define HAVE_DECL__SC_NPROC_CONF 0 + +/* Define to 1 if you have the declaration of `_SC_NPROC_ONLN', and to 0 if + you don't. */ +#define HAVE_DECL__SC_NPROC_ONLN 0 + +/* Define to 1 if you have the declaration of `_SC_PAGESIZE', and to 0 if you + don't. */ +#define HAVE_DECL__SC_PAGESIZE 1 + +/* Define to 1 if you have the declaration of `_SC_PAGE_SIZE', and to 0 if you + don't. */ +#define HAVE_DECL__SC_PAGE_SIZE 1 + +/* Define to 1 if you have the declaration of `__func__', and to 0 if you + don't. */ +#define HAVE_DECL___FUNC__ 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DIRENT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you have the `dlsym' function. */ +#define HAVE_DLSYM 1 + +/* Define to 1 if the system has the type `double _Complex'. */ +#define HAVE_DOUBLE__COMPLEX 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ERR_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_EVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_EXECINFO_H 1 + +/* Define to 1 if you have the `execve' function. */ +#define HAVE_EXECVE 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_FCA_API_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the `ffs' function. */ +#define HAVE_FFS 1 + +/* Define to 1 if you have the `ffsl' function. */ +#define HAVE_FFSL 1 + +/* Define to 1 if the system has the type `float _Complex'. */ +#define HAVE_FLOAT__COMPLEX 1 + +/* Define to 1 if you have the `fls' function. */ +/* #undef HAVE_FLS */ + +/* Define to 1 if you have the `flsl' function. */ +/* #undef HAVE_FLSL */ + +/* Define to 1 if you have the `fork' function. */ +#define HAVE_FORK 1 + +/* Define to 1 if you have the `getpagesize' function. */ +#define HAVE_GETPAGESIZE 1 + +/* Define to 1 if you have the `getpwuid' function. 
*/ +#define HAVE_GETPWUID 1 + +/* Define to 1 if you have the `GNI_GetJobResInfo' function. */ +/* #undef HAVE_GNI_GETJOBRESINFO */ + +/* Define to 1 if the system has the type `GROUP_AFFINITY'. */ +/* #undef HAVE_GROUP_AFFINITY */ + +/* Define to 1 if the system has the type `GROUP_RELATIONSHIP'. */ +/* #undef HAVE_GROUP_RELATIONSHIP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_GRP_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_HCOLL_API_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_HOSTLIB_H */ + +/* Define to 1 if you have the `host_info' function. */ +/* #undef HAVE_HOST_INFO */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_HWLOC_H */ + +/* Define to 1 if you have the `ibv_create_xrc_rcv_qp' function. */ +/* #undef HAVE_IBV_CREATE_XRC_RCV_QP */ + +/* Define to 1 if you have the `ibv_fork_init' function. */ +/* #undef HAVE_IBV_FORK_INIT */ + +/* Define to 1 if you have the `ibv_get_device_list' function. */ +/* #undef HAVE_IBV_GET_DEVICE_LIST */ + +/* Define to 1 if you have the `ibv_resize_cq' function. */ +/* #undef HAVE_IBV_RESIZE_CQ */ + +/* Define to 1 if you have the header file. */ +#define HAVE_IFADDRS_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_INFINIBAND_DRIVER_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_INFINIBAND_VERBS_H */ + +/* Define to 1 if the system has the type `int128_t'. */ +/* #undef HAVE_INT128_T */ + +/* Define to 1 if the system has the type `int16_t'. */ +#define HAVE_INT16_T 1 + +/* Define to 1 if the system has the type `int32_t'. */ +#define HAVE_INT32_T 1 + +/* Define to 1 if the system has the type `int64_t'. */ +#define HAVE_INT64_T 1 + +/* Define to 1 if the system has the type `int8_t'. */ +#define HAVE_INT8_T 1 + +/* Define to 1 if the system has the type `intptr_t'. */ +#define HAVE_INTPTR_T 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_IOLIB_H */ + +/* Define to 1 if you have the `isatty' function. */ +#define HAVE_ISATTY 1 + +/* Define to 1 if the system has the type `KAFFINITY'. */ +/* #undef HAVE_KAFFINITY */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_KNEM_IO_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_KSTAT_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LIBCR_H */ + +/* Define to 1 if you have the `event' library (-levent). */ +/* #undef HAVE_LIBEVENT */ + +/* Define to 1 if you have the `event_pthreads' library (-levent_pthreads). */ +/* #undef HAVE_LIBEVENT_PTHREADS */ + +/* Define to 1 if we have -lgdi32 */ +/* #undef HAVE_LIBGDI32 */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LIBGEN_H 1 + +/* Define to 1 if we have -lkstat */ +/* #undef HAVE_LIBKSTAT */ + +/* Define to 1 if we have -llgrp */ +/* #undef HAVE_LIBLGRP */ + +/* Define to 1 if you have the `pci' library (-lpci). */ +/* #undef HAVE_LIBPCI */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LIBUTIL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LIMITS_H 1 + +/* Define to 1 if the system has the type `LOGICAL_PROCESSOR_RELATIONSHIP'. */ +/* #undef HAVE_LOGICAL_PROCESSOR_RELATIONSHIP */ + +/* Define to 1 if the system has the type `long double'. */ +#define HAVE_LONG_DOUBLE 1 + +/* Define to 1 if the system has the type `long double _Complex'. */ +#define HAVE_LONG_DOUBLE__COMPLEX 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LSF_LSBATCH_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LSF_LSF_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LTDL_H */ + +/* Define to 1 if you have the header file. 
*/ +/* #undef HAVE_LUSTRE_LIBLUSTREAPI_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACH_MACH_HOST_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACH_MACH_INIT_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACH_MACH_TIME_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MALLOC_H 1 + +/* Define to 1 if you have the `memalign' function. */ +#define HAVE_MEMALIGN 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `mkfifo' function. */ +#define HAVE_MKFIFO 1 + +/* Define to 1 if you have the `mmap' function. */ +#define HAVE_MMAP 1 + +/* Define to 1 if the system has the type `mode_t'. */ +#define HAVE_MODE_T 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MTCP_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MXM_API_MXM_API_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_NDBM_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_NETDB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_IN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_TCP_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_NETLINK_NETLINK_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_NET_IF_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_NET_UIO_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_NUMAIF_H */ + +/* Define to 1 if the system has the type `NUMA_NODE_RELATIONSHIP'. */ +/* #undef HAVE_NUMA_NODE_RELATIONSHIP */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_NVCTRL_NVCTRL_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_NVML_H */ + +/* Define to 1 if you have the `on_exit' function. */ +#define HAVE_ON_EXIT 1 + +/* Define to 1 if you have the `openat' function. 
*/ +#define HAVE_OPENAT 1 + +/* Define to 1 if you have the `openpty' function. */ +#define HAVE_OPENPTY 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PCI_PCI_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PICL_H */ + +/* Define to 1 if you have the `pipe' function. */ +#define HAVE_PIPE 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PLFS_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PMAPI_H */ + +/* Define to 1 if you have the `pm_cycles' function. */ +/* #undef HAVE_PM_CYCLES */ + +/* Define to 1 if you have the header file. */ +#define HAVE_POLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PORTALS4_H */ + +/* Define to 1 if you have the `posix_memalign' function. */ +#define HAVE_POSIX_MEMALIGN 1 + +/* Define to 1 if you have the `printstack' function. */ +/* #undef HAVE_PRINTSTACK */ + +/* Define to 1 if the system has the type `PROCESSOR_CACHE_TYPE'. */ +/* #undef HAVE_PROCESSOR_CACHE_TYPE */ + +/* Define to 1 if the system has the type `PROCESSOR_GROUP_INFO'. */ +/* #undef HAVE_PROCESSOR_GROUP_INFO */ + +/* Define to 1 if the system has the type `PROCESSOR_RELATIONSHIP'. */ +/* #undef HAVE_PROCESSOR_RELATIONSHIP */ + +/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_BLOCK'. */ +/* #undef HAVE_PSAPI_WORKING_SET_EX_BLOCK */ + +/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_INFORMATION'. + */ +/* #undef HAVE_PSAPI_WORKING_SET_EX_INFORMATION */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PSM_H */ + +/* Define to 1 if you have the `pthread_condattr_setpshared' function. */ +#define HAVE_PTHREAD_CONDATTR_SETPSHARED 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PTHREAD_H 1 + +/* Define to 1 if you have the `pthread_mutexattr_setpshared' function. */ +#define HAVE_PTHREAD_MUTEXATTR_SETPSHARED 1 + +/* Define to 1 if you have the header file. 
*/ +/* #undef HAVE_PTHREAD_NP_H */ + +/* Define to 1 if the system has the type `pthread_t'. */ +#define HAVE_PTHREAD_T 1 + +/* Define to 1 if the system has the type `ptrdiff_t'. */ +#define HAVE_PTRDIFF_T 1 + +/* Define to 1 if you have the `ptsname' function. */ +#define HAVE_PTSNAME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PTY_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PVFS2_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_PWD_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_RDMA_RDMA_CMA_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_RDMA_RSOCKET_H */ + +/* Define to 1 if you have the `regcmp' function. */ +/* #undef HAVE_REGCMP */ + +/* Define to 1 if you have the `regexec' function. */ +#define HAVE_REGEXEC 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_REGEX_H 1 + +/* Define to 1 if you have the `regfree' function. */ +#define HAVE_REGFREE 1 + +/* Define to 1 if the system has the type `RelationProcessorPackage'. */ +/* #undef HAVE_RELATIONPROCESSORPACKAGE */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SCHED_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SCIF_H 1 + +/* Define to 1 if you have the `setenv' function. */ +#define HAVE_SETENV 1 + +/* Define to 1 if you have the `setlocale' function. */ +#define HAVE_SETLOCALE 1 + +/* Define to 1 if you have the `setpgid' function. */ +#define HAVE_SETPGID 1 + +/* Define to 1 if you have the `setsid' function. */ +#define HAVE_SETSID 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SHLWAPI_H */ + +/* Define to 1 if `si_band' is a member of `siginfo_t'. */ +#define HAVE_SIGINFO_T_SI_BAND 1 + +/* Define to 1 if `si_fd' is a member of `siginfo_t'. */ +#define HAVE_SIGINFO_T_SI_FD 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SIGNAL_H 1 + +/* Define to 1 if you have the `snprintf' function. 
*/ +#define HAVE_SNPRINTF 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SN_XPMEM_H */ + +/* Define to 1 if you have the `socketpair' function. */ +#define HAVE_SOCKETPAIR 1 + +/* Define to 1 if the system has the type `socklen_t'. */ +#define HAVE_SOCKLEN_T 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SOCKLIB_H */ + +/* Define to 1 if the system has the type `ssize_t'. */ +#define HAVE_SSIZE_T 1 + +/* Define to 1 if you have the `statfs' function. */ +#define HAVE_STATFS 1 + +/* Define to 1 if you have the `statvfs' function. */ +#define HAVE_STATVFS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDARG_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDBOOL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDDEF_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the `strftime' function. */ +#define HAVE_STRFTIME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strncasecmp' function. */ +#define HAVE_STRNCASECMP 1 + +/* Define to 1 if you have the `strncpy_s' function. */ +/* #undef HAVE_STRNCPY_S */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_STROPTS_H */ + +/* Define to 1 if you have the `strsignal' function. */ +#define HAVE_STRSIGNAL 1 + +/* Define to 1 if `d_type' is a member of `struct dirent'. */ +#define HAVE_STRUCT_DIRENT_D_TYPE 1 + +/* Define to 1 if `transport_type' is a member of `struct ibv_device'. */ +/* #undef HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE */ + +/* Define to 1 if `ifr_hwaddr' is a member of `struct ifreq'. */ +#define HAVE_STRUCT_IFREQ_IFR_HWADDR 1 + +/* Define to 1 if `ifr_mtu' is a member of `struct ifreq'. 
*/ +#define HAVE_STRUCT_IFREQ_IFR_MTU 1 + +/* Define to 1 if the system has the type `struct sockaddr_in'. */ +#define HAVE_STRUCT_SOCKADDR_IN 1 + +/* Define to 1 if the system has the type `struct sockaddr_in6'. */ +#define HAVE_STRUCT_SOCKADDR_IN6 1 + +/* Define to 1 if `sa_len' is a member of `struct sockaddr'. */ +/* #undef HAVE_STRUCT_SOCKADDR_SA_LEN */ + +/* Define to 1 if the system has the type `struct sockaddr_storage'. */ +#define HAVE_STRUCT_SOCKADDR_STORAGE 1 + +/* Define to 1 if the system has the type `struct sockaddr_un'. */ +#define HAVE_STRUCT_SOCKADDR_UN 1 + +/* Define to 1 if `f_fstypename' is a member of `struct statfs'. */ +/* #undef HAVE_STRUCT_STATFS_F_FSTYPENAME */ + +/* Define to 1 if `f_type' is a member of `struct statfs'. */ +#define HAVE_STRUCT_STATFS_F_TYPE 1 + +/* Define to 1 if `f_basetype' is a member of `struct statvfs'. */ +/* #undef HAVE_STRUCT_STATVFS_F_BASETYPE */ + +/* Define to 1 if `f_fstypename' is a member of `struct statvfs'. */ +/* #undef HAVE_STRUCT_STATVFS_F_FSTYPENAME */ + +/* Define to 1 if you have the `syscall' function. */ +#define HAVE_SYSCALL 1 + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to '1' if sysctl is present and usable */ +#define HAVE_SYSCTL 1 + +/* Define to '1' if sysctlbyname is present and usable */ +/* #undef HAVE_SYSCTLBYNAME */ + +/* Define to 1 if you have the `syslog' function. */ +#define HAVE_SYSLOG 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYSLOG_H 1 + +/* Define to 1 if the system has the type + `SYSTEM_LOGICAL_PROCESSOR_INFORMATION'. */ +/* #undef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION */ + +/* Define to 1 if the system has the type + `SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX'. */ +/* #undef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_CPUSET_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_FCNTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_IOCTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_IPC_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_LGRP_USER_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_MMAN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_MOUNT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_POLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PRCTL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_QUEUE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SELECT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SHM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SOCKET_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SOCKIO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STATFS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STATVFS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYNCH_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSCTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_TREE_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UIO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UN_H 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_UTSNAME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_VFS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_WAIT_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_TARGETCONDITIONALS_H */ + +/* Define to 1 if you have the `tcgetpgrp' function. */ +#define HAVE_TCGETPGRP 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_TERMIOS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_TIME_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_TM_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_UCONTEXT_H 1 + +/* Define to 1 if the system has the type `uint128_t'. */ +/* #undef HAVE_UINT128_T */ + +/* Define to 1 if the system has the type `uint16_t'. */ +#define HAVE_UINT16_T 1 + +/* Define to 1 if the system has the type `uint32_t'. */ +#define HAVE_UINT32_T 1 + +/* Define to 1 if the system has the type `uint64_t'. */ +#define HAVE_UINT64_T 1 + +/* Define to 1 if the system has the type `uint8_t'. */ +#define HAVE_UINT8_T 1 + +/* Define to 1 if the system has the type `uintptr_t'. */ +#define HAVE_UINTPTR_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ULIMIT_H 1 + +/* Define to 1 if you have the `uname' function. */ +#define HAVE_UNAME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* whether unix byteswap routines -- htonl, htons, nothl, ntohs -- are + available */ +#define HAVE_UNIX_BYTESWAP 1 + +/* Define to 1 if you have the `usleep' function. */ +#define HAVE_USLEEP 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_UTIL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_UTMP_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_VALGRIND_VALGRIND_H */ + +/* Define to 1 if you have the `vasprintf' function. */ +#define HAVE_VASPRINTF 1 + +/* Define to 1 if you have the `vsnprintf' function. 
*/ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 if you have the `vsyslog' function. */ +#define HAVE_VSYSLOG 1 + +/* Define to 1 if you have the `waitpid' function. */ +#define HAVE_WAITPID 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_X11_KEYSYM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_X11_XLIB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_X11_XUTIL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_XPMEM_H */ + +/* Define to 1 if you have the `_NSGetEnviron' function. */ +/* #undef HAVE__NSGETENVIRON */ + +/* Define to 1 if the system has the type `__float128'. */ +#define HAVE___FLOAT128 1 + +/* Define to 1 if you have the `__mmap' function. */ +/* #undef HAVE___MMAP */ + +/* Define to 1 if you have the `__munmap' function. */ +/* #undef HAVE___MUNMAP */ + +/* Define to 1 on AIX */ +/* #undef HWLOC_AIX_SYS */ + +/* Define to 1 on BlueGene/Q */ +/* #undef HWLOC_BGQ_SYS */ + +/* Whether C compiler supports symbol visibility or not */ +#define HWLOC_C_HAVE_VISIBILITY 1 + +/* Define to 1 on Darwin */ +/* #undef HWLOC_DARWIN_SYS */ + +/* Whether we are in debugging mode or not */ +/* #undef HWLOC_DEBUG */ + +/* Version of hwloc */ +/* #undef HWLOC_EXTERNAL_HWLOC_VERSION */ + +/* Define to 1 on *FREEBSD */ +/* #undef HWLOC_FREEBSD_SYS */ + +/* Whether your compiler has __attribute__ or not */ +#define HWLOC_HAVE_ATTRIBUTE 1 + +/* Whether your compiler has __attribute__ aligned or not */ +#define HWLOC_HAVE_ATTRIBUTE_ALIGNED 1 + +/* Whether your compiler has __attribute__ always_inline or not */ +#define HWLOC_HAVE_ATTRIBUTE_ALWAYS_INLINE 1 + +/* Whether your compiler has __attribute__ cold or not */ +#define HWLOC_HAVE_ATTRIBUTE_COLD 1 + +/* Whether your compiler has __attribute__ const or not */ +#define HWLOC_HAVE_ATTRIBUTE_CONST 1 + +/* Whether your compiler has __attribute__ deprecated or not */ +#define HWLOC_HAVE_ATTRIBUTE_DEPRECATED 1 + +/* Whether your compiler has 
__attribute__ format or not */ +#define HWLOC_HAVE_ATTRIBUTE_FORMAT 1 + +/* Whether your compiler has __attribute__ hot or not */ +#define HWLOC_HAVE_ATTRIBUTE_HOT 1 + +/* Whether your compiler has __attribute__ malloc or not */ +#define HWLOC_HAVE_ATTRIBUTE_MALLOC 1 + +/* Whether your compiler has __attribute__ may_alias or not */ +#define HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 1 + +/* Whether your compiler has __attribute__ nonnull or not */ +#define HWLOC_HAVE_ATTRIBUTE_NONNULL 1 + +/* Whether your compiler has __attribute__ noreturn or not */ +#define HWLOC_HAVE_ATTRIBUTE_NORETURN 1 + +/* Whether your compiler has __attribute__ no_instrument_function or not */ +#define HWLOC_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION 1 + +/* Whether your compiler has __attribute__ packed or not */ +#define HWLOC_HAVE_ATTRIBUTE_PACKED 1 + +/* Whether your compiler has __attribute__ pure or not */ +#define HWLOC_HAVE_ATTRIBUTE_PURE 1 + +/* Whether your compiler has __attribute__ sentinel or not */ +#define HWLOC_HAVE_ATTRIBUTE_SENTINEL 1 + +/* Whether your compiler has __attribute__ unused or not */ +#define HWLOC_HAVE_ATTRIBUTE_UNUSED 1 + +/* Whether your compiler has __attribute__ warn unused result or not */ +#define HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 1 + +/* Whether your compiler has __attribute__ weak alias or not */ +#define HWLOC_HAVE_ATTRIBUTE_WEAK_ALIAS 1 + +/* Define to 1 if your `ffs' function is known to be broken. */ +/* #undef HWLOC_HAVE_BROKEN_FFS */ + +/* Define to 1 if you have the `clz' function. */ +/* #undef HWLOC_HAVE_CLZ */ + +/* Define to 1 if you have the `clzl' function. */ +/* #undef HWLOC_HAVE_CLZL */ + +/* Define to 1 if the CPU_SET macro works */ +#define HWLOC_HAVE_CPU_SET 1 + +/* Define to 1 if the CPU_SET_S macro works */ +#define HWLOC_HAVE_CPU_SET_S 1 + +/* Define to 1 if you have the `cudart' SDK. 
*/ +/* #undef HWLOC_HAVE_CUDART */ + +/* Define to 1 if function `clz' is declared by system headers */ +/* #undef HWLOC_HAVE_DECL_CLZ */ + +/* Define to 1 if function `clzl' is declared by system headers */ +/* #undef HWLOC_HAVE_DECL_CLZL */ + +/* Define to 1 if function `ffs' is declared by system headers */ +#define HWLOC_HAVE_DECL_FFS 1 + +/* Define to 1 if function `ffsl' is declared by system headers */ +#define HWLOC_HAVE_DECL_FFSL 1 + +/* Define to 1 if function `fls' is declared by system headers */ +/* #undef HWLOC_HAVE_DECL_FLS */ + +/* Define to 1 if function `flsl' is declared by system headers */ +/* #undef HWLOC_HAVE_DECL_FLSL */ + +/* Define to 1 if function `strncasecmp' is declared by system headers */ +#define HWLOC_HAVE_DECL_STRNCASECMP 1 + +/* Define to 1 if you have the `ffs' function. */ +#define HWLOC_HAVE_FFS 1 + +/* Define to 1 if you have the `ffsl' function. */ +#define HWLOC_HAVE_FFSL 1 + +/* Define to 1 if you have the `fls' function. */ +/* #undef HWLOC_HAVE_FLS */ + +/* Define to 1 if you have the `flsl' function. */ +/* #undef HWLOC_HAVE_FLSL */ + +/* Define to 1 if you have the GL module components. */ +/* #undef HWLOC_HAVE_GL */ + +/* Define to 1 if you have the `libpciaccess' library. */ +/* #undef HWLOC_HAVE_LIBPCIACCESS */ + +/* Define to 1 if you have the `libxml2' library. */ +/* #undef HWLOC_HAVE_LIBXML2 */ + +/* Define to 1 if building the Linux PCI component */ +#define HWLOC_HAVE_LINUXPCI 1 + +/* Define to 1 if mbind is available. */ +/* #undef HWLOC_HAVE_MBIND */ + +/* Define to 1 if migrate_pages is available. */ +/* #undef HWLOC_HAVE_MIGRATE_PAGES */ + +/* Define to 1 if you have the `NVML' library. */ +/* #undef HWLOC_HAVE_NVML */ + +/* Define to 1 if glibc provides the old prototype (without length) of + sched_setaffinity() */ +/* #undef HWLOC_HAVE_OLD_SCHED_SETAFFINITY */ + +/* Define to 1 if you have the `OpenCL' library. 
*/ +/* #undef HWLOC_HAVE_OPENCL */ + +/* Define to 1 if `libpci' struct pci_dev has a `device_class' field. */ +/* #undef HWLOC_HAVE_PCIDEV_DEVICE_CLASS */ + +/* Define to 1 if `libpci' struct pci_dev has a `domain' field. */ +/* #undef HWLOC_HAVE_PCIDEV_DOMAIN */ + +/* Define to 1 if you have the pciutils `libpci' library. */ +/* #undef HWLOC_HAVE_PCIUTILS */ + +/* Define to 1 if `libpci' has the `pci_find_cap' function. */ +/* #undef HWLOC_HAVE_PCI_FIND_CAP */ + +/* Define to 1 if the hwloc library should support dynamically-loaded plugins + */ +/* #undef HWLOC_HAVE_PLUGINS */ + +/* `Define to 1 if you have pthread_getthrds_np' */ +/* #undef HWLOC_HAVE_PTHREAD_GETTHRDS_NP */ + +/* Define to 1 if pthread mutexes are available */ +#define HWLOC_HAVE_PTHREAD_MUTEX 1 + +/* Define to 1 if glibc provides a prototype of sched_setaffinity() */ +#define HWLOC_HAVE_SCHED_SETAFFINITY 1 + +/* Define to 1 if set_mempolicy is available. */ +/* #undef HWLOC_HAVE_SET_MEMPOLICY */ + +/* Define to 1 if you have the header file. */ +#define HWLOC_HAVE_STDINT_H 1 + +/* Define to 1 if you have the `windows.h' header. */ +/* #undef HWLOC_HAVE_WINDOWS_H */ + +/* Define to 1 if X11 headers including Xutil.h and keysym.h are available. 
*/ +#define HWLOC_HAVE_X11_KEYSYM 1 + +/* Define to 1 if you have x86 cpuid */ +#define HWLOC_HAVE_X86_CPUID 1 + +/* Define to 1 if the _syscall3 macro works */ +/* #undef HWLOC_HAVE__SYSCALL3 */ + +/* Define to 1 on HP-UX */ +/* #undef HWLOC_HPUX_SYS */ + +/* Version of hwloc */ +#define HWLOC_HWLOC191_HWLOC_VERSION "internal v1.9.1" + +/* Define to 1 on Irix */ +/* #undef HWLOC_IRIX_SYS */ + +/* Define to 1 on Linux */ +#define HWLOC_LINUX_SYS 1 + +/* Define to 1 on *NETBSD */ +/* #undef HWLOC_NETBSD_SYS */ + +/* Define to 1 on OSF */ +/* #undef HWLOC_OSF_SYS */ + +/* The size of `unsigned int', as computed by sizeof */ +#define HWLOC_SIZEOF_UNSIGNED_INT 4 + +/* The size of `unsigned long', as computed by sizeof */ +#define HWLOC_SIZEOF_UNSIGNED_LONG 8 + +/* Define to 1 on Solaris */ +/* #undef HWLOC_SOLARIS_SYS */ + +/* The hwloc symbol prefix */ +#define HWLOC_SYM_PREFIX opal_hwloc191_ + +/* The hwloc symbol prefix in all caps */ +#define HWLOC_SYM_PREFIX_CAPS OPAL_HWLOC191_ + +/* Whether we need to re-define all the hwloc public symbols or not */ +#define HWLOC_SYM_TRANSFORM 1 + +/* Define to 1 on unsupported systems */ +/* #undef HWLOC_UNSUPPORTED_SYS */ + +/* Define to 1 on WINDOWS */ +/* #undef HWLOC_WIN_SYS */ + +/* Define to 1 on x86_32 */ +/* #undef HWLOC_X86_32_ARCH */ + +/* Define to 1 on x86_64 */ +#define HWLOC_X86_64_ARCH 1 + +/* Define to the sub-directory in which libtool stores uninstalled libraries. 
+ */ +#define LT_OBJDIR ".libs/" + +/* Header to include for event implementation */ +#define MCA_event_IMPLEMENTATION_HEADER "opal/mca/event/libevent2021/libevent2021.h" + +/* Header to include for hwloc implementation */ +#define MCA_hwloc_IMPLEMENTATION_HEADER "opal/mca/hwloc/hwloc191/hwloc191.h" + +/* Location of external hwloc header */ +/* #undef MCA_hwloc_external_header */ + +/* Location of external hwloc header */ +/* #undef MCA_hwloc_external_openfabrics_header */ + +/* Complete set of command line arguments given to ROMIOs configure script */ +#define MCA_io_romio_COMPLETE_CONFIGURE_FLAGS " FROM_OMPI=yes CC='gcc -std=gnu99' CFLAGS='-g -Wall -Wundef -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wstrict-prototypes -Wcomment -pedantic -Werror-implicit-function-declaration -finline-functions -fno-strict-aliasing -pthread' CPPFLAGS=' -I/home/wwu12/ompi/ompi-cuda/opal/mca/hwloc/hwloc191/hwloc/include -I/home/wwu12/ompi/ompi-cuda/opal/mca/event/libevent2021/libevent -I/home/wwu12/ompi/ompi-cuda/opal/mca/event/libevent2021/libevent/include' FFLAGS='' LDFLAGS=' ' --enable-shared --disable-static --prefix=/home/wwu12/ompi/build-cuda --disable-aio" + +/* Set of user-defined configure flags given to ROMIOs configure script via + --with-io-romio-flags */ +#define MCA_io_romio_USER_CONFIGURE_FLAGS "" + +/* Header to include for memcpy implementation */ +#define MCA_memcpy_IMPLEMENTATION_HEADER "opal/mca/memcpy/base/memcpy_base_default.h" + +/* Header to include for parts of the memory implementation */ +#define MCA_memory_IMPLEMENTATION_HEADER "opal/mca/memory/base/empty.h" + +/* Defined to 1 if ompi:mtl should use direct calls instead of components */ +#define MCA_ompi_mtl_DIRECT_CALL 0 + +/* name of component to use for direct calls, if MCA_ompi_mtl_DIRECT_CALL is 1 + */ +#define MCA_ompi_mtl_DIRECT_CALL_COMPONENT + +/* Header ompi:mtl includes to be direct called */ +#define MCA_ompi_mtl_DIRECT_CALL_HEADER "" + +/* Defined to 1 if ompi:pml should use direct 
calls instead of components */ +#define MCA_ompi_pml_DIRECT_CALL 0 + +/* name of component to use for direct calls, if MCA_ompi_pml_DIRECT_CALL is 1 + */ +#define MCA_ompi_pml_DIRECT_CALL_COMPONENT + +/* Header ompi:pml includes to be direct called */ +#define MCA_ompi_pml_DIRECT_CALL_HEADER "" + +/* Defined to 1 if oshmem:memheap should use direct calls instead of + components */ +#define MCA_oshmem_memheap_DIRECT_CALL 0 + +/* name of component to use for direct calls, if + MCA_oshmem_memheap_DIRECT_CALL is 1 */ +#define MCA_oshmem_memheap_DIRECT_CALL_COMPONENT + +/* Header oshmem:memheap includes to be direct called */ +#define MCA_oshmem_memheap_DIRECT_CALL_HEADER "" + +/* Defined to 1 if oshmem:spml should use direct calls instead of components + */ +#define MCA_oshmem_spml_DIRECT_CALL 0 + +/* name of component to use for direct calls, if MCA_oshmem_spml_DIRECT_CALL + is 1 */ +#define MCA_oshmem_spml_DIRECT_CALL_COMPONENT + +/* Header oshmem:spml includes to be direct called */ +#define MCA_oshmem_spml_DIRECT_CALL_HEADER "" + +/* Header to include for rte implementation */ +#define MCA_rte_IMPLEMENTATION_HEADER "ompi/mca/rte/orte/rte_orte.h" + +/* Header to include for timer implementation */ +#define MCA_timer_IMPLEMENTATION_HEADER "opal/mca/timer/linux/timer_linux.h" + +/* Whether ptmalloc2 is supported on this system or not */ +#define MEMORY_LINUX_PTMALLOC2 1 + +/* Whether ummunotify is supported on this system or not */ +#define MEMORY_LINUX_UMMUNOTIFY 0 + +/* Whether we can use M-PAGE supported since MOFED 1.8 */ +#define MPAGE_ENABLE 0 + +/* create_flags field is part of ibv_exp_reg_mr_in */ +#define MPAGE_HAVE_IBV_EXP_REG_MR_CREATE_FLAGS 0 + +/* exp_access field is part of ibv_exp_reg_shared_mr_in */ +#define MPAGE_HAVE_SMR_EXP_ACCESS 0 + +/* Maximum value for an MPI_Count */ +#define MPI_COUNT_MAX 0x7fffffffffffffffll + +/* Whether we want to check MPI parameters always, never, or decide at + run-time */ +#define MPI_PARAM_CHECK ompi_mpi_param_check + 
+/* Alignment of Fortran CHARACTER */ +#define OMPI_ALIGNMENT_FORTRAN_CHARACTER 1 + +/* Alignment of Fortran COMPLEX */ +#define OMPI_ALIGNMENT_FORTRAN_COMPLEX 4 + +/* Alignment of Fortran COMPLEX*16 */ +#define OMPI_ALIGNMENT_FORTRAN_COMPLEX16 8 + +/* Alignment of Fortran COMPLEX*32 */ +#define OMPI_ALIGNMENT_FORTRAN_COMPLEX32 4 + +/* Alignment of Fortran COMPLEX*4 */ +#define OMPI_ALIGNMENT_FORTRAN_COMPLEX4 4 + +/* Alignment of Fortran COMPLEX*8 */ +#define OMPI_ALIGNMENT_FORTRAN_COMPLEX8 4 + +/* Alignment of Fortran DOUBLE COMPLEX */ +#define OMPI_ALIGNMENT_FORTRAN_DOUBLE_COMPLEX 8 + +/* Alignment of Fortran DOUBLE PRECISION */ +#define OMPI_ALIGNMENT_FORTRAN_DOUBLE_PRECISION 8 + +/* Alignment of Fortran INTEGER */ +#define OMPI_ALIGNMENT_FORTRAN_INTEGER 4 + +/* Alignment of Fortran INTEGER*1 */ +#define OMPI_ALIGNMENT_FORTRAN_INTEGER1 1 + +/* Alignment of Fortran INTEGER*16 */ +#define OMPI_ALIGNMENT_FORTRAN_INTEGER16 4 + +/* Alignment of Fortran INTEGER*2 */ +#define OMPI_ALIGNMENT_FORTRAN_INTEGER2 2 + +/* Alignment of Fortran INTEGER*4 */ +#define OMPI_ALIGNMENT_FORTRAN_INTEGER4 4 + +/* Alignment of Fortran INTEGER*8 */ +#define OMPI_ALIGNMENT_FORTRAN_INTEGER8 8 + +/* Alignment of Fortran LOGICAL */ +#define OMPI_ALIGNMENT_FORTRAN_LOGICAL 4 + +/* Alignment of Fortran LOGICAL*1 */ +#define OMPI_ALIGNMENT_FORTRAN_LOGICAL1 1 + +/* Alignment of Fortran LOGICAL*2 */ +#define OMPI_ALIGNMENT_FORTRAN_LOGICAL2 2 + +/* Alignment of Fortran LOGICAL*4 */ +#define OMPI_ALIGNMENT_FORTRAN_LOGICAL4 4 + +/* Alignment of Fortran LOGICAL*8 */ +#define OMPI_ALIGNMENT_FORTRAN_LOGICAL8 8 + +/* Alignment of Fortran REAL */ +#define OMPI_ALIGNMENT_FORTRAN_REAL 4 + +/* Alignment of Fortran REAL*16 */ +#define OMPI_ALIGNMENT_FORTRAN_REAL16 4 + +/* Alignment of Fortran REAL*2 */ +#define OMPI_ALIGNMENT_FORTRAN_REAL2 4 + +/* Alignment of Fortran REAL*4 */ +#define OMPI_ALIGNMENT_FORTRAN_REAL4 4 + +/* Alignment of Fortran REAL*8 */ +#define OMPI_ALIGNMENT_FORTRAN_REAL8 8 + +/* Whether we 
want MPI C++ support or not */ +#define OMPI_BUILD_CXX_BINDINGS 0 + +/* Whether we built the 'use mpi_f08' prototype subarray-based implementation + or not (i.e., whether to build the use-mpi-f08-desc prototype or the + regular use-mpi-f08 implementation) */ +#define OMPI_BUILD_FORTRAN_F08_SUBARRAYS 0 + +/* Whether we will build the MPI Fortran mpif.h bindings or not */ +#define OMPI_BUILD_FORTRAN_MPIFH_BINDINGS 1 + +/* For ompi_info: Whether we will build the MPI Fortran "use mpi_f08" bindings + or not */ +#define OMPI_BUILD_FORTRAN_USEMPIF08_BINDINGS 0 + +/* Whether we will build the MPI Fortran "use mpi" bindings or not */ +#define OMPI_BUILD_FORTRAN_USEMPI_BINDINGS 1 + +/* OMPI underlying C++ compiler */ +#define OMPI_CXX "g++" + +/* Whether C++ compiler supports __builtin_expect */ +#define OMPI_CXX_HAVE_BUILTIN_EXPECT 0 + +/* Whether C++ compiler supports __builtin_prefetch */ +#define OMPI_CXX_HAVE_BUILTIN_PREFETCH 0 + +/* Whether a const_cast on a 2-d array will work with the C++ compiler */ +#define OMPI_CXX_SUPPORTS_2D_CONST_CAST 0 + +/* Enable contributed software package libompitrace */ +#define OMPI_ENABLE_CONTRIB_libompitrace 1 + +/* Enable contributed software package vt */ +#define OMPI_ENABLE_CONTRIB_vt 1 + +/* Whether we want MPI profiling or not */ +#define OMPI_ENABLE_MPI_PROFILING 1 + +/* Enable MPI_THREAD_MULTIPLE */ +#define OMPI_ENABLE_THREAD_MULTIPLE 0 + +/* Underlying Fortran compiler */ +#define OMPI_FC "gfortran" + +/* Absolutey path to the underlying Fortran compiler found by configure */ +#define OMPI_FC_ABSOLUTE "/usr/bin/gfortran" + +/* Whether the mpif.h interface supports the MPI_SIZEOF interface or not */ +#define OMPI_FORTRAN_BUILD_SIZEOF 0 + +/* Whether fortran symbols are all caps or not */ +#define OMPI_FORTRAN_CAPS 0 + +/* Whether fortran symbols have a trailing double underscore or not */ +#define OMPI_FORTRAN_DOUBLE_UNDERSCORE 0 + +/* How many bytes the mpi_f08 TYPE(MPI_) handles will be */ +#define 
OMPI_FORTRAN_F08_HANDLE_SIZE 4 + +/* Max handle value for fortran MPI handles, effectively min(INT_MAX, max + fortran INTEGER value) */ +#define OMPI_FORTRAN_HANDLE_MAX 2147483647 + +/* For mpi-f08-interfaces-callbacks.f90 and ompi_info: whether the compiler + supports the "abstract" keyword or not */ +#define OMPI_FORTRAN_HAVE_ABSTRACT 0 + +/* For ompi/mpi/fortran/use-mpi-f08/blah.F90 and blah.h and ompi_info: whether + the compiler supports the "asynchronous" keyword or not */ +#define OMPI_FORTRAN_HAVE_ASYNCHRONOUS 0 + +/* For ompi_info: Whether the compiler supports all forms of BIND(C) that we + need */ +#define OMPI_FORTRAN_HAVE_BIND_C 0 + +/* For ompi_info: Whether the compiler supports SUBROUTINE ... BIND(C) or not + */ +#define OMPI_FORTRAN_HAVE_BIND_C_SUB 0 + +/* For ompi_info: Whether the compiler supports TYPE, BIND(C) or not */ +#define OMPI_FORTRAN_HAVE_BIND_C_TYPE 0 + +/* For ompi_info: Whether the compiler supports TYPE, BIND(C, NAME="name") or + not */ +#define OMPI_FORTRAN_HAVE_BIND_C_TYPE_NAME 0 + +/* For ompi_info: Whether the Fortran compiler supports the Fortran 2008 + "assumed rank" syntax or not */ +#define OMPI_FORTRAN_HAVE_F08_ASSUMED_RANK 0 + +/* Whether the Fortran compiler supports ignore TKR functionality or not */ +#define OMPI_FORTRAN_HAVE_IGNORE_TKR 0 + +/* Whether the compiler supports INTERFACE or not */ +#define OMPI_FORTRAN_HAVE_INTERFACE 1 + +/* For ompi_info: Whether the compiler supports ISO_C_BINDING or not */ +#define OMPI_FORTRAN_HAVE_ISO_C_BINDING 1 + +/* Whether the compiler supports ISO_FORTRAN_ENV or not */ +#define OMPI_FORTRAN_HAVE_ISO_FORTRAN_ENV 0 + +/* For ompi_info: whether the Fortran compiler supports optional arguments or + not */ +#define OMPI_FORTRAN_HAVE_OPTIONAL_ARGS 0 + +/* For mpi-f08-types.f90 and ompi_info: whether the compiler supports the + "private" keyword or not (used in MPI_Status) */ +#define OMPI_FORTRAN_HAVE_PRIVATE 0 + +/* For ompi/mpi/fortran/use-mpi-f08/blah.F90 and blah.h and ompi_info: 
whether + the compiler supports the "procedure" keyword or not */ +#define OMPI_FORTRAN_HAVE_PROCEDURE 0 + +/* For mpi-f08-types.f90 and .F90 and ompi_info: whether the compiler supports + the "protected" keyword or not */ +#define OMPI_FORTRAN_HAVE_PROTECTED 0 + +/* Whether the compiler supports STORAGE_SIZE on relevant types */ +#define OMPI_FORTRAN_HAVE_STORAGE_SIZE 0 + +/* Pre declaration for FORTRAN ignore parameter TKR behavior */ +#define OMPI_FORTRAN_IGNORE_TKR_PREDECL "" + +/* Type declaration for FORTRAN ignore parameter TKR behavior */ +#define OMPI_FORTRAN_IGNORE_TKR_TYPE + +/* Max dimension rank of Fortran arrays */ +#define OMPI_FORTRAN_MAX_ARRAY_RANK 7 + +/* Whether the mpi_f08 implementation is using wrapper routines ("bad" Fortran + compiler) or weak symbols ("good" Fortran compiler) for the F08 interface + definition implementations */ +#define OMPI_FORTRAN_NEED_WRAPPER_ROUTINES 0 + +/* Whether fortran symbols have no trailing underscore or not */ +#define OMPI_FORTRAN_PLAIN 0 + +/* Whether fortran symbols have a trailing underscore or not */ +#define OMPI_FORTRAN_SINGLE_UNDERSCORE 1 + +/* Value to load to the MPI_SUBARRAYS_SUPPORTED compile-time constant */ +#define OMPI_FORTRAN_SUBARRAYS_SUPPORTED .FALSE. + +/* Fortran value for LOGICAL .TRUE. 
value */ +#define OMPI_FORTRAN_VALUE_TRUE 1 + +/* Greek - alpha, beta, etc - release number of Open MPI */ +#define OMPI_GREEK_VERSION "a1" + +/* Wether we want sparse process groups */ +#define OMPI_GROUP_SPARSE 0 + +/* Whether or not we have compiled with C++ exceptions support */ +#define OMPI_HAVE_CXX_EXCEPTION_SUPPORT 0 + +/* Whether we have Fortran CHARACTER or not */ +#define OMPI_HAVE_FORTRAN_CHARACTER 1 + +/* Whether we have Fortran COMPLEX or not */ +#define OMPI_HAVE_FORTRAN_COMPLEX 1 + +/* Whether we have Fortran COMPLEX*16 or not */ +#define OMPI_HAVE_FORTRAN_COMPLEX16 1 + +/* Whether we have Fortran COMPLEX*32 or not */ +#define OMPI_HAVE_FORTRAN_COMPLEX32 0 + +/* Whether we have Fortran COMPLEX*4 or not */ +#define OMPI_HAVE_FORTRAN_COMPLEX4 0 + +/* Whether we have Fortran COMPLEX*8 or not */ +#define OMPI_HAVE_FORTRAN_COMPLEX8 1 + +/* Whether we have Fortran DOUBLE COMPLEX or not */ +#define OMPI_HAVE_FORTRAN_DOUBLE_COMPLEX 1 + +/* Whether we have Fortran DOUBLE PRECISION or not */ +#define OMPI_HAVE_FORTRAN_DOUBLE_PRECISION 1 + +/* Whether we have Fortran INTEGER or not */ +#define OMPI_HAVE_FORTRAN_INTEGER 1 + +/* Whether we have Fortran INTEGER*1 or not */ +#define OMPI_HAVE_FORTRAN_INTEGER1 1 + +/* Whether we have Fortran INTEGER*16 or not */ +#define OMPI_HAVE_FORTRAN_INTEGER16 0 + +/* Whether we have Fortran INTEGER*2 or not */ +#define OMPI_HAVE_FORTRAN_INTEGER2 1 + +/* Whether we have Fortran INTEGER*4 or not */ +#define OMPI_HAVE_FORTRAN_INTEGER4 1 + +/* Whether we have Fortran INTEGER*8 or not */ +#define OMPI_HAVE_FORTRAN_INTEGER8 1 + +/* Whether we have Fortran LOGICAL or not */ +#define OMPI_HAVE_FORTRAN_LOGICAL 1 + +/* Whether we have Fortran LOGICAL*1 or not */ +#define OMPI_HAVE_FORTRAN_LOGICAL1 1 + +/* Whether we have Fortran LOGICAL*2 or not */ +#define OMPI_HAVE_FORTRAN_LOGICAL2 1 + +/* Whether we have Fortran LOGICAL*4 or not */ +#define OMPI_HAVE_FORTRAN_LOGICAL4 1 + +/* Whether we have Fortran LOGICAL*8 or not */ +#define 
OMPI_HAVE_FORTRAN_LOGICAL8 1 + +/* Whether we have Fortran REAL or not */ +#define OMPI_HAVE_FORTRAN_REAL 1 + +/* Whether we have Fortran REAL*16 or not */ +#define OMPI_HAVE_FORTRAN_REAL16 0 + +/* Whether we have Fortran REAL*2 or not */ +#define OMPI_HAVE_FORTRAN_REAL2 0 + +/* Whether we have Fortran REAL*4 or not */ +#define OMPI_HAVE_FORTRAN_REAL4 1 + +/* Whether we have Fortran REAL*8 or not */ +#define OMPI_HAVE_FORTRAN_REAL8 1 + +/* Fortrn KIND number for CHARACTER */ +#define OMPI_KIND_FORTRAN_CHARACTER C_SIGNED_CHAR + +/* Fortrn KIND number for COMPLEX */ +#define OMPI_KIND_FORTRAN_COMPLEX C_FLOAT_COMPLEX + +/* Fortrn KIND number for COMPLEX*16 */ +#define OMPI_KIND_FORTRAN_COMPLEX16 C_DOUBLE_COMPLEX + +/* Fortrn KIND number for COMPLEX*32 */ +#define OMPI_KIND_FORTRAN_COMPLEX32 0 + +/* Fortrn KIND number for COMPLEX*4 */ +#define OMPI_KIND_FORTRAN_COMPLEX4 0 + +/* Fortrn KIND number for COMPLEX*8 */ +#define OMPI_KIND_FORTRAN_COMPLEX8 C_FLOAT_COMPLEX + +/* Fortrn KIND number for DOUBLE COMPLEX */ +#define OMPI_KIND_FORTRAN_DOUBLE_COMPLEX C_DOUBLE_COMPLEX + +/* Fortrn KIND number for DOUBLE PRECISION */ +#define OMPI_KIND_FORTRAN_DOUBLE_PRECISION C_DOUBLE + +/* Fortrn KIND number for INTEGER */ +#define OMPI_KIND_FORTRAN_INTEGER C_INT + +/* Fortrn KIND number for INTEGER*1 */ +#define OMPI_KIND_FORTRAN_INTEGER1 C_SIGNED_CHAR + +/* Fortrn KIND number for INTEGER*16 */ +#define OMPI_KIND_FORTRAN_INTEGER16 0 + +/* Fortrn KIND number for INTEGER*2 */ +#define OMPI_KIND_FORTRAN_INTEGER2 C_SHORT + +/* Fortrn KIND number for INTEGER*4 */ +#define OMPI_KIND_FORTRAN_INTEGER4 C_INT + +/* Fortrn KIND number for INTEGER*8 */ +#define OMPI_KIND_FORTRAN_INTEGER8 C_LONG_LONG + +/* Fortrn KIND number for LOGICAL */ +#define OMPI_KIND_FORTRAN_LOGICAL C_INT + +/* Fortrn KIND number for LOGICAL*1 */ +#define OMPI_KIND_FORTRAN_LOGICAL1 C_SIGNED_CHAR + +/* Fortrn KIND number for LOGICAL*2 */ +#define OMPI_KIND_FORTRAN_LOGICAL2 C_SHORT + +/* Fortrn KIND number for LOGICAL*4 */ 
+#define OMPI_KIND_FORTRAN_LOGICAL4 C_INT + +/* Fortrn KIND number for LOGICAL*8 */ +#define OMPI_KIND_FORTRAN_LOGICAL8 C_LONG_LONG + +/* Fortrn KIND number for REAL */ +#define OMPI_KIND_FORTRAN_REAL C_FLOAT + +/* Fortrn KIND number for REAL*16 */ +#define OMPI_KIND_FORTRAN_REAL16 0 + +/* Fortrn KIND number for REAL*2 */ +#define OMPI_KIND_FORTRAN_REAL2 0 + +/* Fortrn KIND number for REAL*4 */ +#define OMPI_KIND_FORTRAN_REAL4 C_FLOAT + +/* Fortrn KIND number for REAL*8 */ +#define OMPI_KIND_FORTRAN_REAL8 C_DOUBLE + +/* Major release number of Open MPI */ +#define OMPI_MAJOR_VERSION 1 + +/* Minor release number of Open MPI */ +#define OMPI_MINOR_VERSION 9 + +/* MPI Extensions included in libmpi */ +#define OMPI_MPIEXT_COMPONENTS "" + +/* Type of MPI_Aint */ +#define OMPI_MPI_AINT_TYPE ptrdiff_t + +/* Contributed software packages built with Open MPI */ +#define OMPI_MPI_CONTRIBS "vt, libompitrace" + +/* Size of the MPI_Count datatype */ +#define OMPI_MPI_COUNT_SIZE 8 + +/* Type of the MPI_Count datatype */ +#define OMPI_MPI_COUNT_TYPE long long + +/* Size of the MPI_Offset */ +#define OMPI_MPI_OFFSET_SIZE 8 + +/* Type of MPI_Offset */ +#define OMPI_MPI_OFFSET_TYPE long long + +/* Enable flow control for Portals4 MTL */ +#define OMPI_MTL_PORTALS4_FLOW_CONTROL 1 + +/* MPI datatype corresponding to MPI_Offset */ +#define OMPI_OFFSET_DATATYPE MPI_LONG_LONG + +/* Whether we want to check MPI parameters never or possible (an integer + constant) */ +#define OMPI_PARAM_CHECK 1 + +/* Index into endpoint array for BML */ +#define OMPI_PROC_ENDPOINT_TAG_BML 0 + +/* Maximum number of endpoint entries to be attached to an ompi_proc_t */ +#define OMPI_PROC_ENDPOINT_TAG_MAX 1 + +/* Index into endpoint array for MTL */ +/* #undef OMPI_PROC_ENDPOINT_TAG_MTL */ + +/* Index into endpoint array for PML */ +/* #undef OMPI_PROC_ENDPOINT_TAG_PML */ + +/* Index into endpoint array for PORTALS4 */ +/* #undef OMPI_PROC_ENDPOINT_TAG_PORTALS4 */ + +/* Whether OMPI should provide MPI File 
interface */ +#define OMPI_PROVIDE_MPI_FILE_INTERFACE 1 + +/* Whether Fortran REAL*16 matches the bit format of the equivalent C type */ +#define OMPI_REAL16_MATCHES_C 0 + +/* Release date of Open MPI */ +#define OMPI_RELEASE_DATE "Unreleased developer copy" + +/* Release release number of Open MPI */ +#define OMPI_RELEASE_VERSION 0 + +/* The repository version Open MPI */ +#define OMPI_REPO_REV "dev-267-g51b4521" + +/* Defined to 1 if the OMPI runtime component is ORTE */ +#define OMPI_RTE_ORTE 1 + +/* Size of Fortran CHARACTER */ +#define OMPI_SIZEOF_FORTRAN_CHARACTER 1 + +/* Size of Fortran COMPLEX */ +#define OMPI_SIZEOF_FORTRAN_COMPLEX 8 + +/* Size of Fortran COMPLEX*16 */ +#define OMPI_SIZEOF_FORTRAN_COMPLEX16 16 + +/* Size of Fortran COMPLEX*32 */ +#define OMPI_SIZEOF_FORTRAN_COMPLEX32 4 + +/* Size of Fortran COMPLEX*4 */ +#define OMPI_SIZEOF_FORTRAN_COMPLEX4 4 + +/* Size of Fortran COMPLEX*8 */ +#define OMPI_SIZEOF_FORTRAN_COMPLEX8 8 + +/* Size of Fortran DOUBLE COMPLEX */ +#define OMPI_SIZEOF_FORTRAN_DOUBLE_COMPLEX 16 + +/* Size of Fortran DOUBLE PRECISION */ +#define OMPI_SIZEOF_FORTRAN_DOUBLE_PRECISION 8 + +/* Size of Fortran INTEGER */ +#define OMPI_SIZEOF_FORTRAN_INTEGER 4 + +/* Size of Fortran INTEGER*1 */ +#define OMPI_SIZEOF_FORTRAN_INTEGER1 1 + +/* Size of Fortran INTEGER*16 */ +#define OMPI_SIZEOF_FORTRAN_INTEGER16 16 + +/* Size of Fortran INTEGER*2 */ +#define OMPI_SIZEOF_FORTRAN_INTEGER2 2 + +/* Size of Fortran INTEGER*4 */ +#define OMPI_SIZEOF_FORTRAN_INTEGER4 4 + +/* Size of Fortran INTEGER*8 */ +#define OMPI_SIZEOF_FORTRAN_INTEGER8 8 + +/* Size of Fortran LOGICAL */ +#define OMPI_SIZEOF_FORTRAN_LOGICAL 4 + +/* Size of Fortran LOGICAL*1 */ +#define OMPI_SIZEOF_FORTRAN_LOGICAL1 1 + +/* Size of Fortran LOGICAL*2 */ +#define OMPI_SIZEOF_FORTRAN_LOGICAL2 2 + +/* Size of Fortran LOGICAL*4 */ +#define OMPI_SIZEOF_FORTRAN_LOGICAL4 4 + +/* Size of Fortran LOGICAL*8 */ +#define OMPI_SIZEOF_FORTRAN_LOGICAL8 8 + +/* Size of Fortran REAL */ +#define 
OMPI_SIZEOF_FORTRAN_REAL 4 + +/* Size of Fortran REAL*16 */ +#define OMPI_SIZEOF_FORTRAN_REAL16 4 + +/* Size of Fortran REAL*2 */ +#define OMPI_SIZEOF_FORTRAN_REAL2 4 + +/* Size of Fortran REAL*4 */ +#define OMPI_SIZEOF_FORTRAN_REAL4 4 + +/* Size of Fortran REAL*8 */ +#define OMPI_SIZEOF_FORTRAN_REAL8 8 + +/* Tarball filename version string of Open MPI */ +#define OMPI_TARBALL_VERSION "gitclone" + +/* Complete release number of Open MPI */ +#define OMPI_VERSION "0" + +/* do we want java mpi bindings */ +#define OMPI_WANT_JAVA_BINDINGS 0 + +/* do we want to try to work around C++ bindings SEEK_* issue? */ +#define OMPI_WANT_MPI_CXX_SEEK 1 + +/* Enable warnings when using deprecated MPI functions */ +#define OMPI_WANT_MPI_INTERFACE_WARNING 1 + +/* if the peruse interface should be enabled */ +#define OMPI_WANT_PERUSE 0 + +/* Alignment of type _Bool */ +#define OPAL_ALIGNMENT_BOOL 1 + +/* Alignment of type char */ +#define OPAL_ALIGNMENT_CHAR 1 + +/* Alignment of type bool */ +#define OPAL_ALIGNMENT_CXX_BOOL 1 + +/* Alignment of type double */ +#define OPAL_ALIGNMENT_DOUBLE 8 + +/* Alignment of type double _Complex */ +#define OPAL_ALIGNMENT_DOUBLE_COMPLEX 8 + +/* Alignment of type float */ +#define OPAL_ALIGNMENT_FLOAT 4 + +/* Alignment of type float _Complex */ +#define OPAL_ALIGNMENT_FLOAT_COMPLEX 4 + +/* Alignment of type int */ +#define OPAL_ALIGNMENT_INT 4 + +/* Alignment of type int128_t */ +/* #undef OPAL_ALIGNMENT_INT128 */ + +/* Alignment of type int16_t */ +#define OPAL_ALIGNMENT_INT16 2 + +/* Alignment of type int32_t */ +#define OPAL_ALIGNMENT_INT32 4 + +/* Alignment of type int64_t */ +#define OPAL_ALIGNMENT_INT64 8 + +/* Alignment of type int8_t */ +#define OPAL_ALIGNMENT_INT8 1 + +/* Alignment of type long */ +#define OPAL_ALIGNMENT_LONG 8 + +/* Alignment of type long double */ +#define OPAL_ALIGNMENT_LONG_DOUBLE 16 + +/* Alignment of type long double _Complex */ +#define OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX 16 + +/* Alignment of type long long */ 
+#define OPAL_ALIGNMENT_LONG_LONG 8 + +/* Alignment of type short */ +#define OPAL_ALIGNMENT_SHORT 2 + +/* Alignment of type size_t */ +#define OPAL_ALIGNMENT_SIZE_T 8 + +/* Alignment of type void * */ +#define OPAL_ALIGNMENT_VOID_P 8 + +/* Alignment of type wchar_t */ +#define OPAL_ALIGNMENT_WCHAR 4 + +/* Alignment of type __float128 */ +#define OPAL_ALIGNMENT___FLOAT128 16 + +/* set to 1 if word-size integers must be aligned to word-size padding to + prevent bus errors */ +#define OPAL_ALIGN_WORD_SIZE_INTEGERS 0 + +/* OMPI architecture string */ +#define OPAL_ARCH "x86_64-unknown-linux-gnu" + +/* Assembly align directive expects logarithmic value */ +#define OPAL_ASM_ALIGN_LOG + +/* What ARM assembly version to use */ +/* #undef OPAL_ASM_ARM_VERSION */ + +/* Assembly directive for exporting symbols */ +#define OPAL_ASM_GLOBAL ".globl" + +/* Assembly prefix for gsym labels */ +#define OPAL_ASM_GSYM "" + +/* Assembly suffix for labels */ +#define OPAL_ASM_LABEL_SUFFIX ":" + +/* Assembly prefix for lsym labels */ +#define OPAL_ASM_LSYM ".L" + +/* Do we need to give a .size directive */ +#define OPAL_ASM_SIZE "1" + +/* Whether we can do 64bit assembly operations or not. 
Should not be used + outside of the assembly header files */ +#define OPAL_ASM_SUPPORT_64BIT 1 + +/* Assembly directive for setting text section */ +#define OPAL_ASM_TEXT ".text" + +/* How to set function type in .type directive */ +#define OPAL_ASM_TYPE "@" + +/* Architecture type of assembly to use for atomic operations and CMA */ +#define OPAL_ASSEMBLY_ARCH OPAL_AMD64 + +/* Whether to use builtin atomics */ +#define OPAL_ASSEMBLY_BUILTIN OPAL_BUILTIN_NO + +/* Format of assembly file */ +#define OPAL_ASSEMBLY_FORMAT "default-.text-.globl-:--.L-@-1-0-1-1-1" + +/* Enable flow control for Portals4 BTL */ +#define OPAL_BTL_PORTALS4_FLOW_CONTROL 0 + +/* If CMA support can be enabled */ +#define OPAL_BTL_SM_HAVE_CMA 0 + +/* If knem support can be enabled */ +#define OPAL_BTL_SM_HAVE_KNEM 0 + +/* define to 1 if usnic BTL unit tests are enabled, 0 otherwise */ +#define OPAL_BTL_USNIC_UNIT_TESTS 0 + +/* If CMA support can be enabled within vader */ +#define OPAL_BTL_VADER_HAVE_CMA 0 + +/* If KNEM support can be enabled within vader */ +#define OPAL_BTL_VADER_HAVE_KNEM 0 + +/* If XPMEM support can be enabled within vader */ +#define OPAL_BTL_VADER_HAVE_XPMEM 0 + +/* The compiler $lower which OMPI was built with */ +#define OPAL_BUILD_PLATFORM_COMPILER_FAMILYID 1 + +/* The compiler $lower which OMPI was built with */ +#define OPAL_BUILD_PLATFORM_COMPILER_FAMILYNAME GNU + +/* The compiler $lower which OMPI was built with */ +#define OPAL_BUILD_PLATFORM_COMPILER_VERSION 263175 + +/* The compiler $lower which OMPI was built with */ +#define OPAL_BUILD_PLATFORM_COMPILER_VERSION_STR 4.4.7 + +/* OMPI underlying C compiler */ +#define OPAL_CC "gcc" + +/* Use static const char[] strings for C files */ +#define OPAL_CC_USE_CONST_CHAR_IDENT 0 + +/* Use #ident strings for C files */ +#define OPAL_CC_USE_IDENT 1 + +/* Use #pragma comment for C files */ +#define OPAL_CC_USE_PRAGMA_COMMENT + +/* Use #pragma ident strings for C files */ +#define OPAL_CC_USE_PRAGMA_IDENT 0 + +/* Need CMA 
syscalls defined */ +/* #undef OPAL_CMA_NEED_SYSCALL_DEFS */ + +/* Whether we have CUDA GDR support available */ +#define OPAL_CUDA_GDR_SUPPORT 1 + +/* Whether we have CUDA cuPointerGetAttributes function available */ +#define OPAL_CUDA_GET_ATTRIBUTES 0 + +/* Whether we want cuda device pointer support */ +#define OPAL_CUDA_SUPPORT 1 + +/* Whether we have CUDA 4.1 support available */ +#define OPAL_CUDA_SUPPORT_41 1 + +/* Whether we have CUDA CU_POINTER_ATTRIBUTE_SYNC_MEMOPS support available */ +#define OPAL_CUDA_SYNC_MEMOPS 1 + +/* OPAL underlying C++ compiler */ +#define OPAL_CXX "g++" + +/* Use static const char[] strings for C++ files */ +/* #undef OPAL_CXX_USE_CONST_CHAR_IDENT */ + +/* Use #ident strings for C++ files */ +/* #undef OPAL_CXX_USE_IDENT */ + +/* Use #pragma comment for C++ files */ +/* #undef OPAL_CXX_USE_PRAGMA_COMMENT */ + +/* Use #pragma ident strings for C++ files */ +/* #undef OPAL_CXX_USE_PRAGMA_IDENT */ + +/* Whether C compiler supports DEC style inline assembly */ +#define OPAL_C_DEC_INLINE_ASSEMBLY 0 + +/* Whether C compiler supports GCC style inline assembly */ +#define OPAL_C_GCC_INLINE_ASSEMBLY 1 + +/* Whether C compiler supports __builtin_clz */ +#define OPAL_C_HAVE_BUILTIN_CLZ 1 + +/* Whether C compiler supports __builtin_expect */ +#define OPAL_C_HAVE_BUILTIN_EXPECT 1 + +/* Whether C compiler supports __builtin_prefetch */ +#define OPAL_C_HAVE_BUILTIN_PREFETCH 1 + +/* Whether C compiler supports symbol visibility or not */ +#define OPAL_C_HAVE_VISIBILITY 1 + +/* Whether C compiler supports XLC style inline assembly */ +#define OPAL_C_XLC_INLINE_ASSEMBLY 0 + +/* Whether we want checkpoint/restart enabled debugging functionality or not + */ +#define OPAL_ENABLE_CRDEBUG 0 + +/* Whether we want developer-level debugging code or not */ +#define OPAL_ENABLE_DEBUG 1 + +/* Enable features required for dynamic SL support */ +#define OPAL_ENABLE_DYNAMIC_SL 0 + +/* Enable fault tolerance general components and logic */ +#define 
OPAL_ENABLE_FT 0 + +/* Enable fault tolerance checkpoint/restart components and logic */ +#define OPAL_ENABLE_FT_CR 0 + +/* Enable fault tolerance thread in Open PAL */ +#define OPAL_ENABLE_FT_THREAD 0 + +/* Disable getpwuid support (default: enabled) */ +#define OPAL_ENABLE_GETPWUID 1 + +/* Enable features required for heterogeneous support */ +#define OPAL_ENABLE_HETEROGENEOUS_SUPPORT 0 + +/* Enable IPv6 support, but only if the underlying system supports it */ +#define OPAL_ENABLE_IPV6 0 + +/* Whether we want the memory profiling or not */ +#define OPAL_ENABLE_MEM_DEBUG 1 + +/* Whether we want the memory profiling or not */ +#define OPAL_ENABLE_MEM_PROFILE 1 + +/* Whether we should enable thread support within the OPAL code base */ +#define OPAL_ENABLE_MULTI_THREADS 1 + +/* Whether we want BTL progress threads enabled */ +#define OPAL_ENABLE_PROGRESS_THREADS 0 + +/* Whether user wants PTY support or not */ +#define OPAL_ENABLE_PTY_SUPPORT 1 + +/* Whether we want developer-level timing framework or not */ +#define OPAL_ENABLE_TIMING 0 + +/* Greek - alpha, beta, etc - release number of Open Portable Access Layer */ +#define OPAL_GREEK_VERSION "a1" + +/* Whether there is an atomic assembly file available */ +#define OPAL_HAVE_ASM_FILE 1 + +/* Whether your compiler has __attribute__ or not */ +#define OPAL_HAVE_ATTRIBUTE 1 + +/* Whether your compiler has __attribute__ aligned or not */ +#define OPAL_HAVE_ATTRIBUTE_ALIGNED 1 + +/* Whether your compiler has __attribute__ always_inline or not */ +#define OPAL_HAVE_ATTRIBUTE_ALWAYS_INLINE 1 + +/* Whether your compiler has __attribute__ cold or not */ +#define OPAL_HAVE_ATTRIBUTE_COLD 1 + +/* Whether your compiler has __attribute__ const or not */ +#define OPAL_HAVE_ATTRIBUTE_CONST 1 + +/* Whether your compiler has __attribute__ deprecated or not */ +#define OPAL_HAVE_ATTRIBUTE_DEPRECATED 1 + +/* Whether your compiler has __attribute__ deprecated with optional argument + */ +#define 
OPAL_HAVE_ATTRIBUTE_DEPRECATED_ARGUMENT 0 + +/* Whether your compiler has __attribute__ destructor or not */ +#define OPAL_HAVE_ATTRIBUTE_DESTRUCTOR 1 + +/* Whether your compiler has __attribute__ format or not */ +#define OPAL_HAVE_ATTRIBUTE_FORMAT 1 + +/* Whether your compiler has __attribute__ format and it works on function + pointers */ +#define OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR 1 + +/* Whether your compiler has __attribute__ hot or not */ +#define OPAL_HAVE_ATTRIBUTE_HOT 1 + +/* Whether your compiler has __attribute__ malloc or not */ +#define OPAL_HAVE_ATTRIBUTE_MALLOC 1 + +/* Whether your compiler has __attribute__ may_alias or not */ +#define OPAL_HAVE_ATTRIBUTE_MAY_ALIAS 1 + +/* Whether your compiler has __attribute__ noinline or not */ +#define OPAL_HAVE_ATTRIBUTE_NOINLINE 1 + +/* Whether your compiler has __attribute__ nonnull or not */ +#define OPAL_HAVE_ATTRIBUTE_NONNULL 1 + +/* Whether your compiler has __attribute__ noreturn or not */ +#define OPAL_HAVE_ATTRIBUTE_NORETURN 1 + +/* Whether your compiler has __attribute__ noreturn and it works on function + pointers */ +#define OPAL_HAVE_ATTRIBUTE_NORETURN_FUNCPTR 1 + +/* Whether your compiler has __attribute__ no_instrument_function or not */ +#define OPAL_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION 1 + +/* Whether your compiler has __attribute__ packed or not */ +#define OPAL_HAVE_ATTRIBUTE_PACKED 1 + +/* Whether your compiler has __attribute__ pure or not */ +#define OPAL_HAVE_ATTRIBUTE_PURE 1 + +/* Whether your compiler has __attribute__ sentinel or not */ +#define OPAL_HAVE_ATTRIBUTE_SENTINEL 1 + +/* Whether your compiler has __attribute__ unused or not */ +#define OPAL_HAVE_ATTRIBUTE_UNUSED 1 + +/* Whether your compiler has __attribute__ visibility or not */ +#define OPAL_HAVE_ATTRIBUTE_VISIBILITY 1 + +/* Whether your compiler has __attribute__ warn unused result or not */ +#define OPAL_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 1 + +/* Whether your compiler has __attribute__ weak alias or not */ +#define 
OPAL_HAVE_ATTRIBUTE_WEAK_ALIAS 1 + +/* whether backtrace_execinfo is found and available */ +#define OPAL_HAVE_BACKTRACE_EXECINFO 1 + +/* whether qsort is broken or not */ +#define OPAL_HAVE_BROKEN_QSORT 0 + +/* whether ceil is found and available */ +#define OPAL_HAVE_CEIL 1 + +/* Enable features required for ConnectX XRC support */ +#define OPAL_HAVE_CONNECTX_XRC 0 + +/* whether crs_blcr is found and available */ +/* #undef OPAL_HAVE_CRS_BLCR */ + +/* whether dirname is found and available */ +#define OPAL_HAVE_DIRNAME 1 + +/* whether fbtl_posix is found and available */ +#define OPAL_HAVE_FBTL_POSIX 1 + +/* whether gethostbyname is found and available */ +#define OPAL_HAVE_GETHOSTBYNAME 1 + +/* Whether we have hwloc support or not */ +#define OPAL_HAVE_HWLOC 1 + +/* do we have Java support */ +#define OPAL_HAVE_JAVA_SUPPORT 1 + +/* Do not use outside of mpi.h. Define to 1 if the system has the type `long + long'. */ +#define OPAL_HAVE_LONG_LONG 1 + +/* Whether libltdl appears to have the lt_dladvise interface */ +#define OPAL_HAVE_LTDL_ADVISE 0 + +/* whether openpty is found and available */ +#define OPAL_HAVE_OPENPTY 1 + +/* Do we have POSIX threads */ +#define OPAL_HAVE_POSIX_THREADS 1 + +/* If PTHREADS implementation supports PTHREAD_MUTEX_ERRORCHECK */ +#define OPAL_HAVE_PTHREAD_MUTEX_ERRORCHECK 1 + +/* If PTHREADS implementation supports PTHREAD_MUTEX_ERRORCHECK_NP */ +#define OPAL_HAVE_PTHREAD_MUTEX_ERRORCHECK_NP 1 + +/* Whether RDMA CM is available or not */ +/* #undef OPAL_HAVE_RDMACM */ + +/* Enable RDMAoE support */ +/* #undef OPAL_HAVE_RDMAOE */ + +/* Whether we have SA_RESTART in or not */ +#define OPAL_HAVE_SA_RESTART 1 + +/* whether sched_yield is found and available */ +#define OPAL_HAVE_SCHED_YIELD 1 + +/* whether shmem_posix is found and available */ +#define OPAL_HAVE_SHMEM_POSIX 1 + +/* whether socket is found and available */ +#define OPAL_HAVE_SOCKET 1 + +/* Whether or not we have solaris */ +#define OPAL_HAVE_SOLARIS 0 + +/* Do not use 
outside of mpi.h. Define to 1 if you have the + header file. */ +/* #undef OPAL_HAVE_SYS_SYNCH_H */ + +/* Do not use outside of mpi.h. Define to 1 if you have the + header file. */ +#define OPAL_HAVE_SYS_TIME_H 1 + +/* Whether UD CM is available or not */ +/* #undef OPAL_HAVE_UDCM */ + +/* Whether we have __va_copy or not */ +#define OPAL_HAVE_UNDERSCORE_VA_COPY 1 + +/* Whether we have va_copy or not */ +#define OPAL_HAVE_VA_COPY 1 + +/* Whether we have weak symbols or not */ +#define OPAL_HAVE_WEAK_SYMBOLS 1 + +/* Whether our event component has working event operations or not (if not, + then assumedly it only has working timers and signals) */ +#define OPAL_HAVE_WORKING_EVENTOPS 1 + +/* whether yp_all_nsl is found and available */ +#define OPAL_HAVE_YP_ALL_NSL 1 + +/* Define to 1 ifyou have the declaration of _SC_NPROCESSORS_ONLN, and to 0 + otherwise */ +#define OPAL_HAVE__SC_NPROCESSORS_ONLN 1 + +/* Number of arguments to ibv_create_cq */ +/* #undef OPAL_IBV_CREATE_CQ_ARGS */ + +/* ident string for Open MPI */ +#define OPAL_IDENT_STRING "1.9.0a1" + +/* Whether we are using the internal libltdl or not */ +#define OPAL_LIBLTDL_INTERNAL 1 + +/* Major release number of Open Portable Access Layer */ +#define OPAL_MAJOR_VERSION 1 + +/* Maximum length of datarep strings (default is 128) */ +#define OPAL_MAX_DATAREP_STRING 128 + +/* Maximum length of error strings (default is 256) */ +#define OPAL_MAX_ERROR_STRING 256 + +/* Maximum length of info keys (default is 36) */ +#define OPAL_MAX_INFO_KEY 36 + +/* Maximum length of info vals (default is 256) */ +#define OPAL_MAX_INFO_VAL 256 + +/* Maximum length of object names (default is 64) */ +#define OPAL_MAX_OBJECT_NAME 64 + +/* Maximum length of port names (default is 1024) */ +#define OPAL_MAX_PORT_NAME 1024 + +/* Maximum length of processor names (default is 256) */ +#define OPAL_MAX_PROCESSOR_NAME 256 + +/* MCA cmd line identifier */ +#define OPAL_MCA_CMD_LINE_ID "mca" + +/* MCA prefix string for envars */ +#define 
OPAL_MCA_PREFIX "OMPI_MCA_" + +/* Whether any opal memory mca components were found */ +#define OPAL_MEMORY_HAVE_COMPONENT 1 + +/* Minor release number of Open Portable Access Layer */ +#define OPAL_MINOR_VERSION 9 + +/* Whether the C compiler supports "bool" without any other help (such as + ) */ +#define OPAL_NEED_C_BOOL 1 + +/* Add padding bytes to the openib BTL control header */ +#define OPAL_OPENIB_PAD_HDR 0 + +/* package/branding string for Open MPI */ +#define OPAL_PACKAGE_STRING "Open MPI wwu12@bunsen.icl.utk.edu Distribution" + +/* Log base 2 of the maximum size in bytes of a memory descriptor. Set to 0 if + MD can bind all of memory. */ +#define OPAL_PORTALS4_MAX_MD_SIZE 0 + +/* Log base 2 of the maximum size in bytes of the user virtual address space. + Set to 0 if MD can bind all of memory. */ +#define OPAL_PORTALS4_MAX_VA_SIZE 0 + +/* Whether r notation is used for ppc registers */ +/* #undef OPAL_POWERPC_R_REGISTERS */ + +/* type to use for ptrdiff_t */ +#define OPAL_PTRDIFF_TYPE ptrdiff_t + +/* Release date of Open Portable Access Layer */ +#define OPAL_RELEASE_DATE "Unreleased developer copy" + +/* Release release number of Open Portable Access Layer */ +#define OPAL_RELEASE_VERSION 0 + +/* The repository version Open Portable Access Layer */ +#define OPAL_REPO_REV "dev-267-g51b4521" + +/* Whether we have shared memory support for mmap or not */ +#define OPAL_SHMEM_MMAP 1 + +/* Whether we have shared memory support for POSIX or not */ +#define OPAL_SHMEM_POSIX 1 + +/* Whether we have shared memory support for SYSV or not */ +#define OPAL_SHMEM_SYSV 1 + +/* Do not use outside of mpi.h. Define to 1 if you have the ANSI C header + files. 
*/ +#define OPAL_STDC_HEADERS 1 + +/* Tarball filename version string of Open Portable Access Layer */ +#define OPAL_TARBALL_VERSION "gitclone" + +/* Whether to use or not */ +#define OPAL_USE_STDBOOL_H 1 + +/* Complete release number of Open Portable Access Layer */ +#define OPAL_VERSION "0" + +/* Enable per-user config files */ +#define OPAL_WANT_HOME_CONFIG_FILES 1 + +/* Whether to include support for libltdl or not */ +#define OPAL_WANT_LIBLTDL 1 + +/* if the memory and buffer checking should be enabled */ +#define OPAL_WANT_MEMCHECKER 0 + +/* if want pretty-print stack trace feature */ +#define OPAL_WANT_PRETTY_PRINT_STACKTRACE 1 + +/* whether we want to have smp locks in atomic ops or not */ +#define OPAL_WANT_SMP_LOCKS 1 + +/* Specific ps command to use in orte-clean */ +#define ORTE_CLEAN_PS_CMD "ps -A -o fname,pid,user" + +/* Whether we want static ports enabled */ +#define ORTE_ENABLE_STATIC_PORTS 1 + +/* Greek - alpha, beta, etc - release number of Open MPI Run-Time Environment + */ +#define ORTE_GREEK_VERSION "a1" + +/* Major release number of Open MPI Run-Time Environment */ +#define ORTE_MAJOR_VERSION 1 + +/* Minor release number of Open MPI Run-Time Environment */ +#define ORTE_MINOR_VERSION 9 + +/* Release date of Open MPI Run-Time Environment */ +#define ORTE_RELEASE_DATE "Unreleased developer copy" + +/* Release release number of Open MPI Run-Time Environment */ +#define ORTE_RELEASE_VERSION 0 + +/* The repository version Open MPI Run-Time Environment */ +#define ORTE_REPO_REV "dev-267-g51b4521" + +/* Tarball filename version string of Open MPI Run-Time Environment */ +#define ORTE_TARBALL_VERSION "gitclone" + +/* Complete release number of Open MPI Run-Time Environment */ +#define ORTE_VERSION "0" + +/* Whether we want orterun to effect "--prefix $prefix" by default */ +#define ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT 0 + +/* Greek - alpha, beta, etc - release number of Open SHMEM */ +#define OSHMEM_GREEK_VERSION "a1" + +/* mxm support is available */ 
+/* #undef OSHMEM_HAS_ATOMIC_MXM */ + +/* Major release number of Open SHMEM */ +#define OSHMEM_MAJOR_VERSION 1 + +/* Minor release number of Open SHMEM */ +#define OSHMEM_MINOR_VERSION 9 + +/* Whether we want to check OSHMEM parameters always or never */ +#define OSHMEM_PARAM_CHECK 1 + +/* Release date of Open SHMEM */ +#define OSHMEM_RELEASE_DATE "Unreleased developer copy" + +/* Release release number of Open SHMEM */ +#define OSHMEM_RELEASE_VERSION 0 + +/* The repository version Open SHMEM */ +#define OSHMEM_REPO_REV "dev-267-g51b4521" + +/* Whether user wants OSHMEM in compatibility mode or not */ +#define OSHMEM_SPEC_COMPAT 1 + +/* Whether we have shared memory support for mmap or not */ +#define OSHMEM_SSHMEM_MMAP 1 + +/* Whether we have shared memory support for SYSV or not */ +#define OSHMEM_SSHMEM_SYSV 1 + +/* Whether we have shared memory support for verbs or not */ +#define OSHMEM_SSHMEM_VERBS 0 + +/* Tarball filename version string of Open SHMEM */ +#define OSHMEM_TARBALL_VERSION "gitclone" + +/* Complete release number of Open SHMEM */ +#define OSHMEM_VERSION "0" + +/* do we want java oshmem bindings */ +#define OSHMEM_WANT_JAVA_BINDINGS 0 + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "http://www.open-mpi.org/community/help/" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "Open MPI" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "Open MPI gitclone" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "openmpi" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "gitclone" + +/* The size of `bool', as computed by sizeof. */ +#define SIZEOF_BOOL 1 + +/* The size of `char', as computed by sizeof. */ +#define SIZEOF_CHAR 1 + +/* The size of `double', as computed by sizeof. 
*/ +#define SIZEOF_DOUBLE 8 + +/* The size of `double _Complex', as computed by sizeof. */ +#define SIZEOF_DOUBLE__COMPLEX 16 + +/* The size of `float', as computed by sizeof. */ +#define SIZEOF_FLOAT 4 + +/* The size of `float _Complex', as computed by sizeof. */ +#define SIZEOF_FLOAT__COMPLEX 8 + +/* The size of `int', as computed by sizeof. */ +#define SIZEOF_INT 4 + +/* The size of `long', as computed by sizeof. */ +#define SIZEOF_LONG 8 + +/* The size of `long double', as computed by sizeof. */ +#define SIZEOF_LONG_DOUBLE 16 + +/* The size of `long double _Complex', as computed by sizeof. */ +#define SIZEOF_LONG_DOUBLE__COMPLEX 32 + +/* The size of `long long', as computed by sizeof. */ +#define SIZEOF_LONG_LONG 8 + +/* The size of `pid_t', as computed by sizeof. */ +#define SIZEOF_PID_T 4 + +/* The size of `ptrdiff_t', as computed by sizeof. */ +#define SIZEOF_PTRDIFF_T 8 + +/* The size of `short', as computed by sizeof. */ +#define SIZEOF_SHORT 2 + +/* The size of `size_t', as computed by sizeof. */ +#define SIZEOF_SIZE_T 8 + +/* The size of `ssize_t', as computed by sizeof. */ +#define SIZEOF_SSIZE_T 8 + +/* The size of `unsigned int', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_INT 4 + +/* The size of `unsigned long', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_LONG 8 + +/* The size of `void *', as computed by sizeof. */ +#define SIZEOF_VOID_P 8 + +/* The size of `wchar_t', as computed by sizeof. */ +#define SIZEOF_WCHAR_T 4 + +/* The size of `_Bool', as computed by sizeof. */ +#define SIZEOF__BOOL 1 + +/* The size of `__float128', as computed by sizeof. */ +#define SIZEOF___FLOAT128 16 + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Enable extensions on HP-UX. */ +#ifndef _HPUX_SOURCE +# define _HPUX_SOURCE 1 +#endif + + +/* Whether to use the legacy Solaris munmap prototype or not */ +/* #undef USE_SOLARIS_LEGACY_MUNMAP_PROTOTYPE */ + +/* Enable extensions on AIX 3, Interix. 
*/ +#ifndef _ALL_SOURCE +# define _ALL_SOURCE 1 +#endif +/* Enable GNU extensions on systems that have them. */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif +/* Enable threading extensions on Solaris. */ +#ifndef _POSIX_PTHREAD_SEMANTICS +# define _POSIX_PTHREAD_SEMANTICS 1 +#endif +/* Enable extensions on HP NonStop. */ +#ifndef _TANDEM_SOURCE +# define _TANDEM_SOURCE 1 +#endif +/* Enable general extensions on Solaris. */ +#ifndef __EXTENSIONS__ +# define __EXTENSIONS__ 1 +#endif + + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Additional CFLAGS to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_CFLAGS "-pthread " + +/* Additional CFLAGS_PREFIX to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_CFLAGS_PREFIX "" + +/* Additional CXXFLAGS to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_CXXFLAGS "-pthread " + +/* Additional CXXFLAGS_PREFIX to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_CXXFLAGS_PREFIX "" + +/* Additional FCFLAGS to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_FCFLAGS "-pthread -I${libdir}" + +/* Additional FCFLAGS to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_FCFLAGS_PREFIX "" + +/* Additional LDFLAGS to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_LDFLAGS " -Wl,-rpath -Wl,@{libdir} -Wl,--enable-new-dtags" + +/* Additional LIBS to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_LIBS "-lm -ldl -lutil " + +/* Whether the wrapper compilers add rpath flags by default */ +#define WRAPPER_RPATH_SUPPORT "runpath" + +/* Define to 1 if the X Window System is missing or not being used. 
*/ +/* #undef X_DISPLAY_MISSING */ + +/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a + `char[]'. */ +#define YYTEXT_POINTER 1 + +/* Enable GNU extensions on systems that have them. */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif + +/* Are we building for HP-UX? */ +#define _HPUX_SOURCE 1 + +/* Define to 1 if on MINIX. */ +/* #undef _MINIX */ + +/* Define to 2 if the system does not provide POSIX.1 features except with + this defined. */ +/* #undef _POSIX_1_SOURCE */ + +/* Define to 1 if you need to in order for `stat' and other things to work. */ +/* #undef _POSIX_SOURCE */ + +/* Define this to the process ID type */ +#define hwloc_pid_t pid_t + +/* Define this to the thread ID type */ +#define hwloc_thread_t pthread_t + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +#define inline __inline__ +#endif + +/* A bogus type that allows us to have sentinel type values that are still + valid */ +#define ompi_fortran_bogus_type_t int + +/* C type corresponding to Fortran CHARACTER */ +#define ompi_fortran_character_t char + +/* C type corresponding to Fortran COMPLEX*16 */ +/* #undef ompi_fortran_complex16_t */ + +/* C type corresponding to Fortran COMPLEX*32 */ +/* #undef ompi_fortran_complex32_t */ + +/* C type corresponding to Fortran COMPLEX*4 */ +/* #undef ompi_fortran_complex4_t */ + +/* C type corresponding to Fortran COMPLEX*8 */ +/* #undef ompi_fortran_complex8_t */ + +/* C type corresponding to Fortran COMPLEX */ +/* #undef ompi_fortran_complex_t */ + +/* C type corresponding to Fortran DOUBLE COMPLEX */ +/* #undef ompi_fortran_double_complex_t */ + +/* C type corresponding to Fortran DOUBLE PRECISION */ +#define ompi_fortran_double_precision_t double + +/* C type corresponding to Fortran INTEGER*16 */ +#define ompi_fortran_integer16_t + +/* C type corresponding to Fortran INTEGER*1 */ +#define 
ompi_fortran_integer1_t char + +/* C type corresponding to Fortran INTEGER*2 */ +#define ompi_fortran_integer2_t short + +/* C type corresponding to Fortran INTEGER*4 */ +#define ompi_fortran_integer4_t int + +/* C type corresponding to Fortran INTEGER*8 */ +#define ompi_fortran_integer8_t long long + +/* C type corresponding to Fortran INTEGER */ +#define ompi_fortran_integer_t int + +/* C type corresponding to Fortran LOGICAL*1 */ +#define ompi_fortran_logical1_t char + +/* C type corresponding to Fortran LOGICAL*2 */ +#define ompi_fortran_logical2_t short + +/* C type corresponding to Fortran LOGICAL*4 */ +#define ompi_fortran_logical4_t int + +/* C type corresponding to Fortran LOGICAL*8 */ +#define ompi_fortran_logical8_t long long + +/* C type corresponding to Fortran LOGICAL */ +#define ompi_fortran_logical_t int + +/* C type corresponding to Fortran REAL*16 */ +#define ompi_fortran_real16_t ompi_fortran_bogus_type_t + +/* C type corresponding to Fortran REAL*2 */ +#define ompi_fortran_real2_t ompi_fortran_bogus_type_t + +/* C type corresponding to Fortran REAL*4 */ +#define ompi_fortran_real4_t float + +/* C type corresponding to Fortran REAL*8 */ +#define ompi_fortran_real8_t double + +/* C type corresponding to Fortran REAL */ +#define ompi_fortran_real_t float + +/* Define to the equivalent of the C99 'restrict' keyword, or to + nothing if this is not supported. Do not define if restrict is + supported directly. */ +#define restrict __restrict +/* Work around a bug in Sun C++: it does not support _Restrict or + __restrict__, even though the corresponding Sun C compiler ends up with + "#define restrict _Restrict" or "#define restrict __restrict__" in the + previous line. Perhaps some future version of Sun C++ will work with + restrict; if so, hopefully it defines __RESTRICT like Sun C does. 
*/ +#if defined __SUNPRO_CC && !defined __RESTRICT +# define _Restrict +# define __restrict__ +#endif + +#endif /* OPAL_CONFIG_H */ + diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index ea1f3633480..105ba2bfeba 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -2,10 +2,56 @@ #include "opal_datatype_cuda.cuh" #include #include +#include + +/* + * NOTE: The order of this array *MUST* match what is listed in datatype.h + * (use of designated initializers should relax this restrictions some) + */ +OPAL_DECLSPEC const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PREDEFINED] = { + OPAL_DATATYPE_LOOP_SIZE, + OPAL_DATATYPE_END_LOOP_SIZE, + OPAL_DATATYPE_LB_SIZE, + OPAL_DATATYPE_UB_SIZE, + OPAL_DATATYPE_INT1_SIZE, + OPAL_DATATYPE_INT2_SIZE, + OPAL_DATATYPE_INT4_SIZE, + OPAL_DATATYPE_INT8_SIZE, + OPAL_DATATYPE_INT16_SIZE, /* Yes, double-machine word integers are available */ + OPAL_DATATYPE_UINT1_SIZE, + OPAL_DATATYPE_UINT2_SIZE, + OPAL_DATATYPE_UINT4_SIZE, + OPAL_DATATYPE_UINT8_SIZE, + OPAL_DATATYPE_UINT16_SIZE, /* Yes, double-machine word integers are available */ + OPAL_DATATYPE_FLOAT2_SIZE, + OPAL_DATATYPE_FLOAT4_SIZE, + OPAL_DATATYPE_FLOAT8_SIZE, + OPAL_DATATYPE_FLOAT12_SIZE, + OPAL_DATATYPE_FLOAT16_SIZE, + OPAL_DATATYPE_FLOAT_COMPLEX_SIZE, + OPAL_DATATYPE_DOUBLE_COMPLEX_SIZE, + OPAL_DATATYPE_LONG_DOUBLE_COMPLEX_SIZE, + OPAL_DATATYPE_BOOL_SIZE, + OPAL_DATATYPE_WCHAR_SIZE, + OPAL_DATATYPE_UNAVAILABLE_SIZE, +}; + +/***** my variables ********/ ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; unsigned char *pBaseBuf_GPU, *gpu_src_const, *gpu_dest_const; +unsigned char *ddt_cuda_pack_buffer, *ddt_cuda_unpack_buffer; ddt_cuda_stream_t* cuda_streams; +struct iovec cuda_iov[CUDA_NB_IOV]; +uint32_t cuda_iov_count; +ddt_cuda_description_dist_t description_dist_h[CUDA_MAX_NB_BLOCKS]; +ddt_cuda_description_dist_t* description_dist_d; +ddt_cuda_iov_dist_t 
cuda_iov_dist_h[NB_STREAMS][CUDA_MAX_NB_BLOCKS]; +ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; +dt_elem_desc_t* description_d; +uint8_t opal_datatype_cuda_debug; + +//uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; void opal_datatype_cuda_init(void) { @@ -18,26 +64,57 @@ void opal_datatype_cuda_init(void) cudaMallocHost((void **)&cuda_desc_h, sizeof(ddt_cuda_desc_t)); printf("size cuda_desc %d\n", sizeof(ddt_cuda_desc_t)); - printf("malloc iov\n"); - for (i = 0; i < IOV_ARRAY_SIZE; i++) { - void* iov_base; - cudaMalloc( (void **)&iov_base, sizeof(char)*IOV_LEN); - cuda_desc_h->iov[i].iov_base = iov_base; - cuda_desc_h->iov[i].iov_len = IOV_LEN; - } - cudaMalloc((void **)(&pBaseBuf_GPU), sizeof(char)*IOV_LEN); + // printf("malloc iov\n"); + // for (i = 0; i < IOV_ARRAY_SIZE; i++) { + // void* iov_base; + // cudaMalloc( (void **)&iov_base, sizeof(char)*IOV_LEN); + // cuda_desc_h->iov[i].iov_base = iov_base; + // cuda_desc_h->iov[i].iov_len = IOV_LEN; + // } + printf("malloc cuda packing buffer\n"); + cudaMalloc((void **)(&ddt_cuda_pack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); + cudaMemset(ddt_cuda_pack_buffer, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); + printf("malloc cuda unpacking buffer\n"); + cudaMalloc((void **)(&ddt_cuda_unpack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); + cudaMemset(ddt_cuda_unpack_buffer, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); + + cuda_desc_h->iov[0].iov_base = ddt_cuda_pack_buffer; + cuda_desc_h->iov[0].iov_len = DT_CUDA_BUFFER_SIZE; + + cudaMalloc((void **)(&pBaseBuf_GPU), sizeof(char)*DT_CUDA_BUFFER_SIZE); gpu_src_const = pBaseBuf_GPU; gpu_dest_const = (unsigned char*)cuda_desc_h->iov[0].iov_base; cuda_desc_h->description_max_count = 0; cuda_desc_h->description_count = 0; - cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); /* init cuda stream */ + cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); for (i = 0; i < NB_STREAMS; i++) { cudaStreamCreate(&(cuda_streams->opal_cuda_stream[i])); 
} cuda_streams->current_stream_id = 0; + + /* init cuda_iov */ + cuda_iov_count = CUDA_NB_IOV; + + /* init description dist array */ + cudaMalloc((void **)(&description_dist_d), sizeof(ddt_cuda_description_dist_t)*CUDA_MAX_NB_BLOCKS); + cuda_desc_h->description_dist = description_dist_d; + + /* only for iov version */ + for (i = 0; i < NB_STREAMS; i++) { + cudaMalloc((void **)(&cuda_iov_dist_d[i]), sizeof(ddt_cuda_iov_dist_t)*CUDA_MAX_NB_BLOCKS); + } + + opal_datatype_cuda_debug = 1; + + // /* init size for double, float, char */ + // ALIGNMENT_DOUBLE = sizeof(double); + // ALIGNMENT_FLOAT = sizeof(float); + // ALIGNMENT_CHAR = sizeof(char); + + } void opal_datatype_cuda_fini(void) @@ -52,6 +129,10 @@ void opal_datatype_cuda_fini(void) cudaFree(cuda_desc_h->description); cuda_desc_h->description = NULL; } + if (cuda_desc_h->description_dist != NULL) { + cudaFree(cuda_desc_h->description_dist); + cuda_desc_h->description_dist = NULL; + } printf("free iov\n"); if (cuda_desc_h != NULL) { for (i = 0; i < IOV_ARRAY_SIZE; i++) { @@ -68,6 +149,11 @@ void opal_datatype_cuda_fini(void) cudaStreamDestroy(cuda_streams->opal_cuda_stream[i]); } free(cuda_streams); + + /* only for iov version */ + for (i = 0; i < NB_STREAMS; i++) { + cudaFree(cuda_iov_dist_d[i]); + } } void opal_cuda_sync_device(void) @@ -75,4 +161,15 @@ void opal_cuda_sync_device(void) cudaDeviceSynchronize(); pBaseBuf_GPU = gpu_src_const; cuda_desc_h->iov[0].iov_base = (void*)gpu_dest_const; -} \ No newline at end of file +} + +void opal_cuda_output(int output_id, const char *format, ...) 
+{ + if (output_id >= 0 && output_id <= OPAL_DATATYPE_CUDA_DEBUG_LEVEL) { + va_list arglist; + fprintf( stderr, "[Debug %d]: ", output_id ); + va_start(arglist, format); + vfprintf(stderr, format, arglist); + va_end(arglist); + } +} diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 82ab78b2ff7..ebaad5a06fc 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -12,11 +12,21 @@ int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + +int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + +int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 84fbbe856a0..b510a2f5808 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -3,306 +3,48 @@ #include #include +#include -//#define OPAL_DATATYPE_CUDA_DRY_RUN -//#define OPAL_DATATYPE_CUDA_DEBUG +#include "opal_datatype_orig_internal.h" + + +/* OPAL_CUDA */ +// #define OPAL_DATATYPE_CUDA_DRY_RUN +#define OPAL_DATATYPE_CUDA_DEBUG //#define OPAL_DATATYPE_CUDA_KERNEL_TIME -#define OPAL_ENABLE_DEBUG 1 +#define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 0 +#define OPAL_DATATYPE_CUDA_IOV +#define OPAL_DATATYPE_CUDA_TIMING + -#define DT_STATIC_STACK_SIZE 5 /**< This should be sufficient for most applications */ -#define IOV_ARRAY_SIZE 10 -#define IOV_LEN 1024*1024*200 +#define 
IOV_ARRAY_SIZE 1 +#define DT_CUDA_BUFFER_SIZE 1024*1024*200 #define THREAD_PER_BLOCK 32 -#define TASK_PER_THREAD 1 +#define CUDA_WARP_SIZE 32 +#define TASK_PER_THREAD 2 #define OPAL_GPU_INDEX 0 #define NB_STREAMS 4 +#define CUDA_NB_IOV 4096 +#define CUDA_IOV_LEN 1024*1204 +#define CUDA_MAX_NB_BLOCKS 1024 +#define CUDA_IOV_MAX_TASK_PER_BLOCK 200 +#define ALIGNMENT_DOUBLE 8 +#define ALIGNMENT_FLOAT 4 +#define ALIGNMENT_CHAR 1 -#define OPAL_PTRDIFF_TYPE ptrdiff_t - -/* keep the last 16 bits free for data flags */ -#define CONVERTOR_DATATYPE_MASK 0x0000FFFF -#define CONVERTOR_SEND_CONVERSION 0x00010000 -#define CONVERTOR_RECV 0x00020000 -#define CONVERTOR_SEND 0x00040000 -#define CONVERTOR_HOMOGENEOUS 0x00080000 -#define CONVERTOR_NO_OP 0x00100000 -#define CONVERTOR_WITH_CHECKSUM 0x00200000 -#define CONVERTOR_CUDA 0x00400000 -#define CONVERTOR_CUDA_ASYNC 0x00800000 -#define CONVERTOR_TYPE_MASK 0x00FF0000 -#define CONVERTOR_STATE_START 0x01000000 -#define CONVERTOR_STATE_COMPLETE 0x02000000 -#define CONVERTOR_STATE_ALLOC 0x04000000 -#define CONVERTOR_COMPLETED 0x08000000 - -#define OPAL_DATATYPE_LOOP 0 -#define OPAL_DATATYPE_END_LOOP 1 -#define OPAL_DATATYPE_LB 2 -#define OPAL_DATATYPE_UB 3 -#define OPAL_DATATYPE_FIRST_TYPE 4 /* Number of first real type */ -#define OPAL_DATATYPE_INT1 4 -#define OPAL_DATATYPE_INT2 5 -#define OPAL_DATATYPE_INT4 6 -#define OPAL_DATATYPE_INT8 7 -#define OPAL_DATATYPE_INT16 8 -#define OPAL_DATATYPE_UINT1 9 -#define OPAL_DATATYPE_UINT2 10 -#define OPAL_DATATYPE_UINT4 11 -#define OPAL_DATATYPE_UINT8 12 -#define OPAL_DATATYPE_UINT16 13 -#define OPAL_DATATYPE_FLOAT2 14 -#define OPAL_DATATYPE_FLOAT4 15 -#define OPAL_DATATYPE_FLOAT8 16 -#define OPAL_DATATYPE_FLOAT12 17 -#define OPAL_DATATYPE_FLOAT16 18 -#define OPAL_DATATYPE_FLOAT_COMPLEX 19 -#define OPAL_DATATYPE_DOUBLE_COMPLEX 20 -#define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX 21 -#define OPAL_DATATYPE_BOOL 22 -#define OPAL_DATATYPE_WCHAR 23 -#define OPAL_DATATYPE_UNAVAILABLE 24 - -/* flags for the 
datatypes. */ -#define OPAL_DATATYPE_FLAG_UNAVAILABLE 0x0001 /**< datatypes unavailable on the build (OS or compiler dependant) */ -#define OPAL_DATATYPE_FLAG_PREDEFINED 0x0002 /**< cannot be removed: initial and predefined datatypes */ -#define OPAL_DATATYPE_FLAG_COMMITED 0x0004 /**< ready to be used for a send/recv operation */ -#define OPAL_DATATYPE_FLAG_OVERLAP 0x0008 /**< datatype is unpropper for a recv operation */ -#define OPAL_DATATYPE_FLAG_CONTIGUOUS 0x0010 /**< contiguous datatype */ -#define OPAL_DATATYPE_FLAG_NO_GAPS 0x0020 /**< no gaps around the datatype, aka OPAL_DATATYPE_FLAG_CONTIGUOUS and extent == size */ -#define OPAL_DATATYPE_FLAG_USER_LB 0x0040 /**< has a user defined LB */ -#define OPAL_DATATYPE_FLAG_USER_UB 0x0080 /**< has a user defined UB */ -#define OPAL_DATATYPE_FLAG_DATA 0x0100 /**< data or control structure */ -/* - * We should make the difference here between the predefined contiguous and non contiguous - * datatypes. The OPAL_DATATYPE_FLAG_BASIC is held by all predefined contiguous datatypes. - */ -#define OPAL_DATATYPE_FLAG_BASIC (OPAL_DATATYPE_FLAG_PREDEFINED | \ - OPAL_DATATYPE_FLAG_CONTIGUOUS | \ - OPAL_DATATYPE_FLAG_NO_GAPS | \ - OPAL_DATATYPE_FLAG_DATA | \ - OPAL_DATATYPE_FLAG_COMMITED) - -/* typedefs ***********************************************************/ - -typedef struct opal_object_t opal_object_t; -typedef struct opal_class_t opal_class_t; -typedef void (*opal_construct_t) (opal_object_t *); -typedef void (*opal_destruct_t) (opal_object_t *); - - -/* types **************************************************************/ - -/** -* Class descriptor. -* -* There should be a single instance of this descriptor for each class -* definition. 
-*/ -struct opal_class_t { - const char *cls_name; /**< symbolic name for class */ - opal_class_t *cls_parent; /**< parent class descriptor */ - opal_construct_t cls_construct; /**< class constructor */ - opal_destruct_t cls_destruct; /**< class destructor */ - int cls_initialized; /**< is class initialized */ - int cls_depth; /**< depth of class hierarchy tree */ - opal_construct_t *cls_construct_array; - /**< array of parent class constructors */ - opal_destruct_t *cls_destruct_array; - /**< array of parent class destructors */ - size_t cls_sizeof; /**< size of an object instance */ -}; - -/** - * Base object. - * - * This is special and does not follow the pattern for other classes. - */ -struct opal_object_t { -#if OPAL_ENABLE_DEBUG - /** Magic ID -- want this to be the very first item in the - struct's memory */ - uint64_t obj_magic_id; -#endif - opal_class_t *obj_class; /**< class descriptor */ - volatile int32_t obj_reference_count; /**< reference count */ -#if OPAL_ENABLE_DEBUG - const char* cls_init_file_name; /**< In debug mode store the file where the object get contructed */ - int cls_init_lineno; /**< In debug mode store the line number where the object get contructed */ -#endif /* OPAL_ENABLE_DEBUG */ -}; - - - -struct ddt_elem_id_description { - uint16_t flags; /**< flags for the record */ - uint16_t type; /**< the basic data type id */ -}; -typedef struct ddt_elem_id_description ddt_elem_id_description; - -/* the basic element. A data description is composed - * by a set of basic elements. 
- */ -struct ddt_elem_desc { - ddt_elem_id_description common; /**< basic data description and flags */ - uint32_t count; /**< number of blocks */ - uint32_t blocklen; /**< number of elements on each block */ - OPAL_PTRDIFF_TYPE extent; /**< extent of each block (in bytes) */ - OPAL_PTRDIFF_TYPE disp; /**< displacement of the first block */ -}; -typedef struct ddt_elem_desc ddt_elem_desc_t; - -struct ddt_loop_desc { - ddt_elem_id_description common; /**< basic data description and flags */ - uint32_t loops; /**< number of elements */ - uint32_t items; /**< number of items in the loop */ - size_t unused; /**< not used right now */ - OPAL_PTRDIFF_TYPE extent; /**< extent of the whole loop */ -}; -typedef struct ddt_loop_desc ddt_loop_desc_t; - -struct ddt_endloop_desc { - ddt_elem_id_description common; /**< basic data description and flags */ - uint32_t items; /**< number of elements */ - uint32_t unused; /**< not used right now */ - size_t size; /**< real size of the data in the loop */ - OPAL_PTRDIFF_TYPE first_elem_disp; /**< the displacement of the first block in the loop */ -}; -typedef struct ddt_endloop_desc ddt_endloop_desc_t; - -union dt_elem_desc { - ddt_elem_desc_t elem; - ddt_loop_desc_t loop; - ddt_endloop_desc_t end_loop; -}; -typedef union dt_elem_desc dt_elem_desc_t; - -/* dt_type_description */ -typedef uint32_t opal_datatype_count_t; - -struct dt_type_desc_t { - opal_datatype_count_t length; /**< the maximum number of elements in the description array */ - opal_datatype_count_t used; /**< the number of used elements in the description array */ - dt_elem_desc_t* desc; -}; -typedef struct dt_type_desc_t dt_type_desc_t; - -/* - * The datatype description. - */ -#define OPAL_DATATYPE_MAX_PREDEFINED 25 -#define OPAL_DATATYPE_MAX_SUPPORTED 47 -#define OPAL_MAX_OBJECT_NAME 64 - -struct opal_datatype_t { - opal_object_t super; /**< basic superclass */ - uint16_t flags; /**< the flags */ - uint16_t id; /**< data id, normally the index in the data array. 
*/ - uint32_t bdt_used; /**< bitset of which basic datatypes are used in the data description */ - size_t size; /**< total size in bytes of the memory used by the data if - the data is put on a contiguous buffer */ - OPAL_PTRDIFF_TYPE true_lb; /**< the true lb of the data without user defined lb and ub */ - OPAL_PTRDIFF_TYPE true_ub; /**< the true ub of the data without user defined lb and ub */ - OPAL_PTRDIFF_TYPE lb; /**< lower bound in memory */ - OPAL_PTRDIFF_TYPE ub; /**< upper bound in memory */ - /* --- cacheline 1 boundary (64 bytes) --- */ - size_t nbElems; /**< total number of elements inside the datatype */ - uint32_t align; /**< data should be aligned to */ - - /* Attribute fields */ - char name[OPAL_MAX_OBJECT_NAME]; /**< name of the datatype */ - /* --- cacheline 2 boundary (128 bytes) was 8-12 bytes ago --- */ - dt_type_desc_t desc; /**< the data description */ - dt_type_desc_t opt_desc; /**< short description of the data used when conversion is useless - or in the send case (without conversion) */ +#define TIMER_DATA_TYPE struct timeval +#define GET_TIME(TV) gettimeofday( &(TV), NULL ) +#define ELAPSED_TIME(TSTART, TEND) (((TEND).tv_sec - (TSTART).tv_sec) * 1000000 + ((TEND).tv_usec - (TSTART).tv_usec)) - uint32_t btypes[OPAL_DATATYPE_MAX_SUPPORTED]; - /**< basic elements count used to compute the size of the - datatype for remote nodes. The length of the array is dependent on - the maximum number of datatypes of all top layers. - Reason being is that Fortran is not at the OPAL layer. 
*/ - /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */ - /* size: 352, cachelines: 6, members: 15 */ - /* last cacheline: 28-32 bytes */ -}; -typedef struct opal_datatype_t opal_datatype_t; - -/* convertor and stack */ -typedef struct opal_convertor_t opal_convertor_t; - -typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); -typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata ); -typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n, opal_convertor_t* pConvertor ); - -/* The master convertor struct (defined in convertor_internal.h) */ -struct opal_convertor_master_t; - -struct dt_stack_t { - int32_t index; /**< index in the element description */ - int16_t type; /**< the type used for the last pack/unpack (original or OPAL_DATATYPE_UINT1) */ - size_t count; /**< number of times we still have to do it */ - OPAL_PTRDIFF_TYPE disp; /**< actual displacement depending on the count field */ -}; -typedef struct dt_stack_t dt_stack_t; - -typedef int32_t (*conversion_fct_t)( opal_convertor_t* pConvertor, uint32_t count, - const void* from, size_t from_len, OPAL_PTRDIFF_TYPE from_extent, - void* to, size_t to_length, OPAL_PTRDIFF_TYPE to_extent, - OPAL_PTRDIFF_TYPE *advance ); - -typedef struct opal_convertor_master_t { - struct opal_convertor_master_t* next; - uint32_t remote_arch; - uint32_t flags; - uint32_t hetero_mask; - const size_t remote_sizes[OPAL_DATATYPE_MAX_PREDEFINED]; - conversion_fct_t* pFunctions; /**< the convertor functions pointer */ -} opal_convertor_master_t; - -struct opal_convertor_t { - opal_object_t super; /**< basic superclass */ - uint32_t remoteArch; /**< the remote architecture */ - uint32_t flags; /**< the properties of this convertor */ - size_t local_size; /**< overall length data on local machine, compared to bConverted */ - size_t remote_size; /**< overall length data on remote machine, compared to bConverted */ - 
const opal_datatype_t* pDesc; /**< the datatype description associated with the convertor */ - const dt_type_desc_t* use_desc; /**< the version used by the convertor (normal or optimized) */ - opal_datatype_count_t count; /**< the total number of full datatype elements */ - uint32_t stack_size; /**< size of the allocated stack */ - /* --- cacheline 1 boundary (64 bytes) --- */ - unsigned char* pBaseBuf; /**< initial buffer as supplied by the user */ - dt_stack_t* pStack; /**< the local stack for the actual conversion */ - convertor_advance_fct_t fAdvance; /**< pointer to the pack/unpack functions */ - struct opal_convertor_master_t* master; /**< the master convertor */ - - /* All others fields get modified for every call to pack/unpack functions */ - uint32_t stack_pos; /**< the actual position on the stack */ - uint32_t partial_length; /**< amount of data left over from the last unpack */ - size_t bConverted; /**< # of bytes already converted */ - uint32_t checksum; /**< checksum computed by pack/unpack operation */ - uint32_t csum_ui1; /**< partial checksum computed by pack/unpack operation */ - size_t csum_ui2; /**< partial checksum computed by pack/unpack operation */ - /* --- cacheline 2 boundary (128 bytes) --- */ - dt_stack_t static_stack[DT_STATIC_STACK_SIZE]; /**< local stack for small datatypes */ - /* --- cacheline 3 boundary (192 bytes) was 56 bytes ago --- */ - -#if OPAL_CUDA_SUPPORT - memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ - void * stream; /**< CUstream for async copy */ -#endif - /* size: 248, cachelines: 4, members: 20 */ - /* last cacheline: 56 bytes */ -}; - -struct iovec { - void *iov_base; /* Starting address */ - size_t iov_len; /* Length in bytes */ -}; +typedef struct { + uint32_t description_index[200]; /* index of y direction */ + uint32_t description_local_index[200]; /* index of x direction */ + uint32_t dst_offset[200]; + uint32_t description_used; +} ddt_cuda_description_dist_t; typedef struct { dt_stack_t 
pStack[DT_STATIC_STACK_SIZE]; @@ -319,6 +61,7 @@ typedef struct { size_t max_data; uint32_t description_count; uint32_t description_max_count; + ddt_cuda_description_dist_t *description_dist; } ddt_cuda_desc_t; typedef struct { @@ -326,34 +69,30 @@ typedef struct { uint32_t current_stream_id; } ddt_cuda_stream_t; +typedef struct { + unsigned char* src[CUDA_IOV_MAX_TASK_PER_BLOCK]; + unsigned char* dst[CUDA_IOV_MAX_TASK_PER_BLOCK]; + uint32_t nb_elements[CUDA_IOV_MAX_TASK_PER_BLOCK]; + uint8_t element_alignment[CUDA_IOV_MAX_TASK_PER_BLOCK]; + uint32_t nb_tasks; +} ddt_cuda_iov_dist_t; + extern ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; extern unsigned char* pBaseBuf_GPU; +extern unsigned char *ddt_cuda_pack_buffer, *ddt_cuda_unpack_buffer; +extern size_t ddt_cuda_buffer_space; extern ddt_cuda_stream_t* cuda_streams; +extern struct iovec cuda_iov[CUDA_NB_IOV]; +extern uint32_t cuda_iov_count; +extern ddt_cuda_description_dist_t description_dist_h[CUDA_MAX_NB_BLOCKS]; +extern ddt_cuda_description_dist_t* description_dist_d; +extern ddt_cuda_iov_dist_t cuda_iov_dist_h[NB_STREAMS][CUDA_MAX_NB_BLOCKS]; +extern ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; +extern dt_elem_desc_t* description_d; +extern uint8_t opal_datatype_cuda_debug; -#define SAVE_STACK( PSTACK, INDEX, TYPE, COUNT, DISP) \ -do { \ - (PSTACK)->index = (INDEX); \ - (PSTACK)->type = (TYPE); \ - (PSTACK)->count = (COUNT); \ - (PSTACK)->disp = (DISP); \ -} while(0) +//extern uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; -#define PUSH_STACK( PSTACK, STACK_POS, INDEX, TYPE, COUNT, DISP) \ -do { \ - dt_stack_t* pTempStack = (PSTACK) + 1; \ - if (threadIdx.x == 0) { \ - SAVE_STACK( pTempStack, (INDEX), (TYPE), (COUNT), (DISP) ); \ - } \ - __syncthreads(); \ - (STACK_POS)++; \ - (PSTACK) = pTempStack; \ -} while(0) - -#define UPDATE_INTERNAL_COUNTERS( DESCRIPTION, POSITION, ELEMENT, COUNTER ) \ - do { \ - (ELEMENT) = &((DESCRIPTION)[(POSITION)]); \ - (COUNTER) = (ELEMENT)->elem.count; \ - } 
while (0) #if defined (OPAL_DATATYPE_CUDA_DEBUG) #define DBGPRINT(fmt, ...) printf(fmt, __VA_ARGS__) @@ -375,6 +114,8 @@ __device__ void unpack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc); +__global__ void opal_generic_simple_pack_cuda_kernel_v2(ddt_cuda_desc_t* cuda_desc); + __global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_desc); __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, @@ -388,10 +129,28 @@ __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, OPAL_PTRDIFF_TYPE extent, unsigned char* source, unsigned char* destination ); + +// __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_description_dist_t* desc_dist_d, dt_elem_desc_t* desc_d, uint32_t required_blocks, struct iovec* iov, unsigned char* pBaseBuf); + +__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist); + +__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist); + +void opal_cuda_output(int output_id, const char *format, ...); + +#if defined (OPAL_DATATYPE_CUDA_DEBUG) +#define DT_CUDA_DEBUG( INST ) if (opal_datatype_cuda_debug) { INST } +#else +#define DT_CUDA_DEBUG( INST ) +#endif extern "C" { int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor, size_t* position ); + +int32_t opal_convertor_raw( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* iov_count, + size_t* length ); } #endif /* OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_orig_internal.h b/opal/datatype/cuda/opal_datatype_orig_internal.h new file mode 100644 index 00000000000..fc30fc87741 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_orig_internal.h @@ -0,0 +1,646 @@ +#ifndef OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED +#define OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED + 
+#include + +#include "opal_config.h" + +/* original OMPI */ +#define OPAL_DECLSPEC + +#define OPAL_PTRDIFF_TYPE ptrdiff_t +#define DT_STATIC_STACK_SIZE 5 /**< This should be sufficient for most applications */ + +#if OPAL_ENABLE_DEBUG +/* Any kind of unique ID should do the job */ +#define OPAL_OBJ_MAGIC_ID ((0xdeafbeedULL << 32) + 0xdeafbeedULL) +#endif + +/* keep the last 16 bits free for data flags */ +#define CONVERTOR_DATATYPE_MASK 0x0000FFFF +#define CONVERTOR_SEND_CONVERSION 0x00010000 +#define CONVERTOR_RECV 0x00020000 +#define CONVERTOR_SEND 0x00040000 +#define CONVERTOR_HOMOGENEOUS 0x00080000 +#define CONVERTOR_NO_OP 0x00100000 +#define CONVERTOR_WITH_CHECKSUM 0x00200000 +#define CONVERTOR_CUDA 0x00400000 +#define CONVERTOR_CUDA_ASYNC 0x00800000 +#define CONVERTOR_TYPE_MASK 0x00FF0000 +#define CONVERTOR_STATE_START 0x01000000 +#define CONVERTOR_STATE_COMPLETE 0x02000000 +#define CONVERTOR_STATE_ALLOC 0x04000000 +#define CONVERTOR_COMPLETED 0x08000000 + +#define OPAL_DATATYPE_LOOP 0 +#define OPAL_DATATYPE_END_LOOP 1 +#define OPAL_DATATYPE_LB 2 +#define OPAL_DATATYPE_UB 3 +#define OPAL_DATATYPE_FIRST_TYPE 4 /* Number of first real type */ +#define OPAL_DATATYPE_INT1 4 +#define OPAL_DATATYPE_INT2 5 +#define OPAL_DATATYPE_INT4 6 +#define OPAL_DATATYPE_INT8 7 +#define OPAL_DATATYPE_INT16 8 +#define OPAL_DATATYPE_UINT1 9 +#define OPAL_DATATYPE_UINT2 10 +#define OPAL_DATATYPE_UINT4 11 +#define OPAL_DATATYPE_UINT8 12 +#define OPAL_DATATYPE_UINT16 13 +#define OPAL_DATATYPE_FLOAT2 14 +#define OPAL_DATATYPE_FLOAT4 15 +#define OPAL_DATATYPE_FLOAT8 16 +#define OPAL_DATATYPE_FLOAT12 17 +#define OPAL_DATATYPE_FLOAT16 18 +#define OPAL_DATATYPE_FLOAT_COMPLEX 19 +#define OPAL_DATATYPE_DOUBLE_COMPLEX 20 +#define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX 21 +#define OPAL_DATATYPE_BOOL 22 +#define OPAL_DATATYPE_WCHAR 23 +#define OPAL_DATATYPE_UNAVAILABLE 24 + +/* flags for the datatypes. 
*/ +#define OPAL_DATATYPE_FLAG_UNAVAILABLE 0x0001 /**< datatypes unavailable on the build (OS or compiler dependant) */ +#define OPAL_DATATYPE_FLAG_PREDEFINED 0x0002 /**< cannot be removed: initial and predefined datatypes */ +#define OPAL_DATATYPE_FLAG_COMMITED 0x0004 /**< ready to be used for a send/recv operation */ +#define OPAL_DATATYPE_FLAG_OVERLAP 0x0008 /**< datatype is unpropper for a recv operation */ +#define OPAL_DATATYPE_FLAG_CONTIGUOUS 0x0010 /**< contiguous datatype */ +#define OPAL_DATATYPE_FLAG_NO_GAPS 0x0020 /**< no gaps around the datatype, aka OPAL_DATATYPE_FLAG_CONTIGUOUS and extent == size */ +#define OPAL_DATATYPE_FLAG_USER_LB 0x0040 /**< has a user defined LB */ +#define OPAL_DATATYPE_FLAG_USER_UB 0x0080 /**< has a user defined UB */ +#define OPAL_DATATYPE_FLAG_DATA 0x0100 /**< data or control structure */ +/* + * We should make the difference here between the predefined contiguous and non contiguous + * datatypes. The OPAL_DATATYPE_FLAG_BASIC is held by all predefined contiguous datatypes. + */ +#define OPAL_DATATYPE_FLAG_BASIC (OPAL_DATATYPE_FLAG_PREDEFINED | \ + OPAL_DATATYPE_FLAG_CONTIGUOUS | \ + OPAL_DATATYPE_FLAG_NO_GAPS | \ + OPAL_DATATYPE_FLAG_DATA | \ + OPAL_DATATYPE_FLAG_COMMITED) + +/* typedefs ***********************************************************/ + +typedef struct opal_object_t opal_object_t; +typedef struct opal_class_t opal_class_t; +typedef void (*opal_construct_t) (opal_object_t *); +typedef void (*opal_destruct_t) (opal_object_t *); + + +/* types **************************************************************/ + +/** +* Class descriptor. +* +* There should be a single instance of this descriptor for each class +* definition. 
+*/ +struct opal_class_t { + const char *cls_name; /**< symbolic name for class */ + opal_class_t *cls_parent; /**< parent class descriptor */ + opal_construct_t cls_construct; /**< class constructor */ + opal_destruct_t cls_destruct; /**< class destructor */ + int cls_initialized; /**< is class initialized */ + int cls_depth; /**< depth of class hierarchy tree */ + opal_construct_t *cls_construct_array; + /**< array of parent class constructors */ + opal_destruct_t *cls_destruct_array; + /**< array of parent class destructors */ + size_t cls_sizeof; /**< size of an object instance */ +}; + +/** + * Base object. + * + * This is special and does not follow the pattern for other classes. + */ +struct opal_object_t { +#if OPAL_ENABLE_DEBUG + /** Magic ID -- want this to be the very first item in the + struct's memory */ + uint64_t obj_magic_id; +#endif + opal_class_t *obj_class; /**< class descriptor */ + volatile int32_t obj_reference_count; /**< reference count */ +#if OPAL_ENABLE_DEBUG + const char* cls_init_file_name; /**< In debug mode store the file where the object get contructed */ + int cls_init_lineno; /**< In debug mode store the line number where the object get contructed */ +#endif /* OPAL_ENABLE_DEBUG */ +}; + +/** + * Declaration for class descriptor + * + * @param NAME Name of class + * + * Put this in NAME.h + */ +#define OBJ_CLASS_DECLARATION(NAME) \ + extern opal_class_t NAME ## _class + +/** + * Return a pointer to the class descriptor associated with a + * class type. + * + * @param NAME Name of class + * @return Pointer to class descriptor + */ +#define OBJ_CLASS(NAME) (&(NAME ## _class)) + +/** + * For static initializations of OBJects. 
+ * + * @param NAME Name of the class to initialize + */ +#if OPAL_ENABLE_DEBUG +#define OPAL_OBJ_STATIC_INIT(BASE_CLASS) { OPAL_OBJ_MAGIC_ID, OBJ_CLASS(BASE_CLASS), 1, __FILE__, __LINE__ } +#else +#define OPAL_OBJ_STATIC_INIT(BASE_CLASS) { OBJ_CLASS(BASE_CLASS), 1 } +#endif + + + +struct ddt_elem_id_description { + uint16_t flags; /**< flags for the record */ + uint16_t type; /**< the basic data type id */ +}; +typedef struct ddt_elem_id_description ddt_elem_id_description; + +/* the basic element. A data description is composed + * by a set of basic elements. + */ +struct ddt_elem_desc { + ddt_elem_id_description common; /**< basic data description and flags */ + uint32_t count; /**< number of blocks */ + uint32_t blocklen; /**< number of elements on each block */ + OPAL_PTRDIFF_TYPE extent; /**< extent of each block (in bytes) */ + OPAL_PTRDIFF_TYPE disp; /**< displacement of the first block */ +}; +typedef struct ddt_elem_desc ddt_elem_desc_t; + +struct ddt_loop_desc { + ddt_elem_id_description common; /**< basic data description and flags */ + uint32_t loops; /**< number of elements */ + uint32_t items; /**< number of items in the loop */ + size_t unused; /**< not used right now */ + OPAL_PTRDIFF_TYPE extent; /**< extent of the whole loop */ +}; +typedef struct ddt_loop_desc ddt_loop_desc_t; + +struct ddt_endloop_desc { + ddt_elem_id_description common; /**< basic data description and flags */ + uint32_t items; /**< number of elements */ + uint32_t unused; /**< not used right now */ + size_t size; /**< real size of the data in the loop */ + OPAL_PTRDIFF_TYPE first_elem_disp; /**< the displacement of the first block in the loop */ +}; +typedef struct ddt_endloop_desc ddt_endloop_desc_t; + +union dt_elem_desc { + ddt_elem_desc_t elem; + ddt_loop_desc_t loop; + ddt_endloop_desc_t end_loop; +}; +typedef union dt_elem_desc dt_elem_desc_t; + +/* dt_type_description */ +typedef uint32_t opal_datatype_count_t; + +struct dt_type_desc_t { + opal_datatype_count_t length; 
/**< the maximum number of elements in the description array */ + opal_datatype_count_t used; /**< the number of used elements in the description array */ + dt_elem_desc_t* desc; +}; +typedef struct dt_type_desc_t dt_type_desc_t; + +/* + * The datatype description. + */ +#define OPAL_DATATYPE_MAX_PREDEFINED 25 +#define OPAL_DATATYPE_MAX_SUPPORTED 47 +#define OPAL_MAX_OBJECT_NAME 64 + +struct opal_datatype_t { + opal_object_t super; /**< basic superclass */ + uint16_t flags; /**< the flags */ + uint16_t id; /**< data id, normally the index in the data array. */ + uint32_t bdt_used; /**< bitset of which basic datatypes are used in the data description */ + size_t size; /**< total size in bytes of the memory used by the data if + the data is put on a contiguous buffer */ + OPAL_PTRDIFF_TYPE true_lb; /**< the true lb of the data without user defined lb and ub */ + OPAL_PTRDIFF_TYPE true_ub; /**< the true ub of the data without user defined lb and ub */ + OPAL_PTRDIFF_TYPE lb; /**< lower bound in memory */ + OPAL_PTRDIFF_TYPE ub; /**< upper bound in memory */ + /* --- cacheline 1 boundary (64 bytes) --- */ + size_t nbElems; /**< total number of elements inside the datatype */ + uint32_t align; /**< data should be aligned to */ + + /* Attribute fields */ + char name[OPAL_MAX_OBJECT_NAME]; /**< name of the datatype */ + /* --- cacheline 2 boundary (128 bytes) was 8-12 bytes ago --- */ + dt_type_desc_t desc; /**< the data description */ + dt_type_desc_t opt_desc; /**< short description of the data used when conversion is useless + or in the send case (without conversion) */ + + uint32_t btypes[OPAL_DATATYPE_MAX_SUPPORTED]; + /**< basic elements count used to compute the size of the + datatype for remote nodes. The length of the array is dependent on + the maximum number of datatypes of all top layers. + Reason being is that Fortran is not at the OPAL layer. 
*/ + /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */ + + /* size: 352, cachelines: 6, members: 15 */ + /* last cacheline: 28-32 bytes */ +}; + +typedef struct opal_datatype_t opal_datatype_t; + +OPAL_DECLSPEC OBJ_CLASS_DECLARATION( opal_datatype_t ); + +/* convertor and stack */ +typedef struct opal_convertor_t opal_convertor_t; + +typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); +typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata ); +typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n, opal_convertor_t* pConvertor ); + +/* The master convertor struct (defined in convertor_internal.h) */ +struct opal_convertor_master_t; + +struct dt_stack_t { + int32_t index; /**< index in the element description */ + int16_t type; /**< the type used for the last pack/unpack (original or OPAL_DATATYPE_UINT1) */ + size_t count; /**< number of times we still have to do it */ + OPAL_PTRDIFF_TYPE disp; /**< actual displacement depending on the count field */ +}; +typedef struct dt_stack_t dt_stack_t; + +typedef int32_t (*conversion_fct_t)( opal_convertor_t* pConvertor, uint32_t count, + const void* from, size_t from_len, OPAL_PTRDIFF_TYPE from_extent, + void* to, size_t to_length, OPAL_PTRDIFF_TYPE to_extent, + OPAL_PTRDIFF_TYPE *advance ); + +typedef struct opal_convertor_master_t { + struct opal_convertor_master_t* next; + uint32_t remote_arch; + uint32_t flags; + uint32_t hetero_mask; + const size_t remote_sizes[OPAL_DATATYPE_MAX_PREDEFINED]; + conversion_fct_t* pFunctions; /**< the convertor functions pointer */ +} opal_convertor_master_t; + +struct opal_convertor_t { + opal_object_t super; /**< basic superclass */ + uint32_t remoteArch; /**< the remote architecture */ + uint32_t flags; /**< the properties of this convertor */ + size_t local_size; /**< overall length data on local machine, compared to bConverted */ + size_t remote_size; /**< 
overall length data on remote machine, compared to bConverted */ + const opal_datatype_t* pDesc; /**< the datatype description associated with the convertor */ + const dt_type_desc_t* use_desc; /**< the version used by the convertor (normal or optimized) */ + opal_datatype_count_t count; /**< the total number of full datatype elements */ + uint32_t stack_size; /**< size of the allocated stack */ + /* --- cacheline 1 boundary (64 bytes) --- */ + unsigned char* pBaseBuf; /**< initial buffer as supplied by the user */ + dt_stack_t* pStack; /**< the local stack for the actual conversion */ + convertor_advance_fct_t fAdvance; /**< pointer to the pack/unpack functions */ + struct opal_convertor_master_t* master; /**< the master convertor */ + + /* All others fields get modified for every call to pack/unpack functions */ + uint32_t stack_pos; /**< the actual position on the stack */ + uint32_t partial_length; /**< amount of data left over from the last unpack */ + size_t bConverted; /**< # of bytes already converted */ + uint32_t checksum; /**< checksum computed by pack/unpack operation */ + uint32_t csum_ui1; /**< partial checksum computed by pack/unpack operation */ + size_t csum_ui2; /**< partial checksum computed by pack/unpack operation */ + /* --- cacheline 2 boundary (128 bytes) --- */ + dt_stack_t static_stack[DT_STATIC_STACK_SIZE]; /**< local stack for small datatypes */ + /* --- cacheline 3 boundary (192 bytes) was 56 bytes ago --- */ + +#if OPAL_CUDA_SUPPORT + memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ + void * stream; /**< CUstream for async copy */ +#endif + /* size: 248, cachelines: 4, members: 20 */ + /* last cacheline: 56 bytes */ +}; + +struct iovec { + void *iov_base; /* Starting address */ + size_t iov_len; /* Length in bytes */ +}; + + +OPAL_DECLSPEC extern union dt_elem_desc opal_datatype_predefined_elem_desc[2 * OPAL_DATATYPE_MAX_PREDEFINED]; + +#define OPAL_DATATYPE_INIT_BTYPES_ARRAY_UNAVAILABLE { 0 } +#define 
OPAL_DATATYPE_INIT_BTYPES_ARRAY(NAME) { [OPAL_DATATYPE_ ## NAME] = 1 } + +#define OPAL_DATATYPE_INIT_NAME(NAME) "OPAL_" #NAME + +/* + * Macro to initialize the main description for basic types, setting the pointer + * into the array opal_datatype_predefined_type_desc, which is initialized at + * runtime in opal_datatype_init(). Each basic type has two desc-elements.... + */ +#define OPAL_DATATYPE_INIT_DESC_PREDEFINED(NAME) \ + { \ + .length = 1, .used = 1, \ + .desc = &(opal_datatype_predefined_elem_desc[2 * OPAL_DATATYPE_ ## NAME]) \ + } +#define OPAL_DATATYPE_INIT_DESC_NULL {.length = 0, .used = 0, .desc = NULL} + +#define OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( NAME, FLAGS ) \ + { \ + .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ + .flags = OPAL_DATATYPE_FLAG_UNAVAILABLE | OPAL_DATATYPE_FLAG_PREDEFINED | (FLAGS), \ + .id = OPAL_DATATYPE_ ## NAME, \ + .bdt_used = 0, \ + .size = 0, \ + .true_lb = 0, .true_ub = 0, .lb = 0, .ub = 0, \ + .align = 0, \ + .nbElems = 1, \ + .name = OPAL_DATATYPE_INIT_NAME(NAME), \ + .desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(UNAVAILABLE), \ + .opt_desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(UNAVAILABLE), \ + .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY_UNAVAILABLE \ + } + +#define OPAL_DATATYPE_INITIALIZER_EMPTY( FLAGS ) \ + { \ + .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ + .flags = OPAL_DATATYPE_FLAG_PREDEFINED | (FLAGS), \ + .id = 0, \ + .bdt_used = 0, \ + .size = 0, \ + .true_lb = 0, .true_ub = 0, .lb = 0, .ub = 0, \ + .align = 0, \ + .nbElems = 1, \ + .name = OPAL_DATATYPE_INIT_NAME(EMPTY), \ + .desc = OPAL_DATATYPE_INIT_DESC_NULL, \ + .opt_desc = OPAL_DATATYPE_INIT_DESC_NULL, \ + .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY_UNAVAILABLE \ + } + +#define OPAL_DATATYPE_INIT_BASIC_TYPE( TYPE, NAME, FLAGS ) \ + { \ + .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ + .flags = OPAL_DATATYPE_FLAG_PREDEFINED | (FLAGS), \ + .id = TYPE, \ + .bdt_used = (((uint32_t)1)<<(TYPE)), \ + .size = 0, \ + .true_lb = 0, .true_ub = 0, .lb = 0, 
.ub = 0, \ + .align = 0, \ + .nbElems = 1, \ + .name = OPAL_DATATYPE_INIT_NAME(NAME), \ + .desc = OPAL_DATATYPE_INIT_DESC_NULL, \ + .opt_desc = OPAL_DATATYPE_INIT_DESC_NULL, \ + .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY(NAME) \ + } + +#define OPAL_DATATYPE_INIT_BASIC_DATATYPE( TYPE, ALIGN, NAME, FLAGS ) \ + { \ + .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ + .flags = OPAL_DATATYPE_FLAG_BASIC | (FLAGS), \ + .id = OPAL_DATATYPE_ ## NAME, \ + .bdt_used = (((uint32_t)1)<<(OPAL_DATATYPE_ ## NAME)), \ + .size = sizeof(TYPE), \ + .true_lb = 0, .true_ub = sizeof(TYPE), .lb = 0, .ub = sizeof(TYPE), \ + .align = (ALIGN), \ + .nbElems = 1, \ + .name = OPAL_DATATYPE_INIT_NAME(NAME), \ + .desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(NAME), \ + .opt_desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(NAME), \ + .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY(NAME) \ + } + +#define OPAL_DATATYPE_INITIALIZER_LOOP(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LOOP, LOOP, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_END_LOOP(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_END_LOOP, END_LOOP, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_LB(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LB, LB, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_UB(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_UB, UB, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_INT1(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int8_t, OPAL_ALIGNMENT_INT8, INT1, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_INT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int16_t, OPAL_ALIGNMENT_INT16, INT2, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_INT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int32_t, OPAL_ALIGNMENT_INT32, INT4, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_INT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int64_t, OPAL_ALIGNMENT_INT64, INT8, FLAGS ) +#ifdef HAVE_INT128_T +#define OPAL_DATATYPE_INITIALIZER_INT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int128_t, OPAL_ALIGNMENT_INT128, INT16, FLAGS ) +#else +#define 
OPAL_DATATYPE_INITIALIZER_INT16(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( INT16, FLAGS ) +#endif +#define OPAL_DATATYPE_INITIALIZER_UINT1(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint8_t, OPAL_ALIGNMENT_INT8, UINT1, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_UINT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint16_t, OPAL_ALIGNMENT_INT16, UINT2, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_UINT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint32_t, OPAL_ALIGNMENT_INT32, UINT4, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_UINT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint64_t, OPAL_ALIGNMENT_INT64, UINT8, FLAGS ) +#ifdef HAVE_UINT128_T +#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint128_t, OPAL_ALIGNMENT_INT128, UINT16, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( INT16, FLAGS ) +#endif + +#if SIZEOF_FLOAT == 2 +#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT2, FLAGS ) +#elif SIZEOF_DOUBLE == 2 +#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT2, FLAGS ) +#elif SIZEOF_LONG_DOUBLE == 2 +#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT2, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT2, FLAGS ) +#endif + +#if SIZEOF_FLOAT == 4 +#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT4, FLAGS ) +#elif SIZEOF_DOUBLE == 4 +#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT4, FLAGS ) +#elif SIZEOF_LONG_DOUBLE == 4 +#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT4, FLAGS ) 
+#else +#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT4, FLAGS ) +#endif + +#if SIZEOF_FLOAT == 8 +#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT8, FLAGS ) +#elif SIZEOF_DOUBLE == 8 +#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT8, FLAGS ) +#elif SIZEOF_LONG_DOUBLE == 8 +#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT8, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT8, FLAGS ) +#endif + +#if SIZEOF_FLOAT == 12 +#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT12, FLAGS ) +#elif SIZEOF_DOUBLE == 12 +#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT12, FLAGS ) +#elif SIZEOF_LONG_DOUBLE == 12 +#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT12, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT12, FLAGS ) +#endif + +#if SIZEOF_FLOAT == 16 +#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT16, FLAGS ) +#elif SIZEOF_DOUBLE == 16 +#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT16, FLAGS ) +#elif SIZEOF_LONG_DOUBLE == 16 +#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT16, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT16, FLAGS ) +#endif + +#if HAVE_FLOAT__COMPLEX 
+#define OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float _Complex, OPAL_ALIGNMENT_FLOAT_COMPLEX, FLOAT_COMPLEX, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT_COMPLEX, FLAGS) +#endif + +#if HAVE_DOUBLE__COMPLEX +#define OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double _Complex, OPAL_ALIGNMENT_DOUBLE_COMPLEX, DOUBLE_COMPLEX, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( DOUBLE_COMPLEX, FLAGS) +#endif + +#if HAVE_LONG_DOUBLE__COMPLEX +#define OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double _Complex, OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX, LONG_DOUBLE_COMPLEX, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( LONG_DOUBLE_COMPLEX, FLAGS) +#endif + +#define OPAL_DATATYPE_INITIALIZER_BOOL(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( _Bool, OPAL_ALIGNMENT_BOOL, BOOL, FLAGS ) + +#if OPAL_ALIGNMENT_WCHAR != 0 +#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( wchar_t, OPAL_ALIGNMENT_WCHAR, WCHAR, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( WCHAR, FLAGS ) +#endif + +#define SAVE_STACK( PSTACK, INDEX, TYPE, COUNT, DISP) \ +do { \ + (PSTACK)->index = (INDEX); \ + (PSTACK)->type = (TYPE); \ + (PSTACK)->count = (COUNT); \ + (PSTACK)->disp = (DISP); \ +} while(0) + +#define PUSH_STACK( PSTACK, STACK_POS, INDEX, TYPE, COUNT, DISP) \ +do { \ + dt_stack_t* pTempStack = (PSTACK) + 1; \ + if (threadIdx.x == 0) { \ + SAVE_STACK( pTempStack, (INDEX), (TYPE), (COUNT), (DISP) ); \ + } \ + __syncthreads(); \ + (STACK_POS)++; \ + (PSTACK) = pTempStack; \ +} while(0) + +#define UPDATE_INTERNAL_COUNTERS( DESCRIPTION, POSITION, ELEMENT, COUNTER ) \ + do { \ + 
(ELEMENT) = &((DESCRIPTION)[(POSITION)]); \ + (COUNTER) = (ELEMENT)->elem.count; \ + } while (0) + +OPAL_DECLSPEC extern const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PREDEFINED]; + +#define OPAL_DATATYPE_LOOP_SIZE 0 +#define OPAL_DATATYPE_END_LOOP_SIZE 0 +#define OPAL_DATATYPE_LB_SIZE 0 +#define OPAL_DATATYPE_UB_SIZE 0 +#define OPAL_DATATYPE_INT1_SIZE sizeof(int8_t) +#define OPAL_DATATYPE_INT2_SIZE sizeof(int16_t) +#define OPAL_DATATYPE_INT4_SIZE sizeof(int32_t) +#define OPAL_DATATYPE_INT8_SIZE sizeof(int64_t) +#ifdef HAVE_INT128_T +# define OPAL_DATATYPE_INT16_SIZE sizeof(int128_t) /* Yes, double-machine word integers are available */ +#else +# define OPAL_DATATYPE_INT16_SIZE 0 +#endif + +#define OPAL_DATATYPE_UINT1_SIZE sizeof(uint8_t) +#define OPAL_DATATYPE_UINT2_SIZE sizeof(uint16_t) +#define OPAL_DATATYPE_UINT4_SIZE sizeof(uint32_t) +#define OPAL_DATATYPE_UINT8_SIZE sizeof(uint64_t) +#ifdef HAVE_UINT128_T +# define OPAL_DATATYPE_UINT16_SIZE sizeof(uint128_t) /* Yes, double-machine word integers are available */ +#else +# define OPAL_DATATYPE_UINT16_SIZE 0 +#endif + +#if SIZEOF_FLOAT == 2 +# define OPAL_DATATYPE_FLOAT2_SIZE sizeof(float) +#elif SIZEOF_DOUBLE == 2 +# define OPAL_DATATYPE_FLOAT2_SIZE sizeof(double) +#elif SIZEOF_LONG_DOUBLE == 2 +# define OPAL_DATATYPE_FLOAT2_SIZE sizeof(long double) +#else +# define OPAL_DATATYPE_FLOAT2_SIZE 0 +#endif + +#if SIZEOF_FLOAT == 4 +# define OPAL_DATATYPE_FLOAT4_SIZE sizeof(float) +#elif SIZEOF_DOUBLE == 4 +# define OPAL_DATATYPE_FLOAT4_SIZE sizeof(double) +#elif SIZEOF_LONG_DOUBLE == 4 +# define OPAL_DATATYPE_FLOAT4_SIZE sizeof(long double) +#else +# define OPAL_DATATYPE_FLOAT4_SIZE 0 +#endif + +#if SIZEOF_FLOAT == 8 +# define OPAL_DATATYPE_FLOAT8_SIZE sizeof(float) +#elif SIZEOF_DOUBLE == 8 +# define OPAL_DATATYPE_FLOAT8_SIZE sizeof(double) +#elif SIZEOF_LONG_DOUBLE == 8 +# define OPAL_DATATYPE_FLOAT8_SIZE sizeof(long double) +#else +# define OPAL_DATATYPE_FLOAT8_SIZE 0 +#endif + +#if SIZEOF_FLOAT 
== 12 +# define OPAL_DATATYPE_FLOAT12_SIZE sizeof(float) +#elif SIZEOF_DOUBLE == 12 +# define OPAL_DATATYPE_FLOAT12_SIZE sizeof(double) +#elif SIZEOF_LONG_DOUBLE == 12 +# define OPAL_DATATYPE_FLOAT12_SIZE sizeof(long double) +#else +# define OPAL_DATATYPE_FLOAT12_SIZE 0 +#endif + +#if SIZEOF_FLOAT == 16 +# define OPAL_DATATYPE_FLOAT16_SIZE sizeof(float) +#elif SIZEOF_DOUBLE == 16 +# define OPAL_DATATYPE_FLOAT16_SIZE sizeof(double) +#elif SIZEOF_LONG_DOUBLE == 16 +# define OPAL_DATATYPE_FLOAT16_SIZE sizeof(long double) +#else +# define OPAL_DATATYPE_FLOAT16_SIZE 0 +#endif + +#if HAVE_FLOAT__COMPLEX +# define OPAL_DATATYPE_FLOAT_COMPLEX_SIZE sizeof(float _Complex) +#else +# define OPAL_DATATYPE_FLOAT_COMPLEX_SIZE 0 +#endif + +#if HAVE_DOUBLE__COMPLEX +# define OPAL_DATATYPE_DOUBLE_COMPLEX_SIZE sizeof(float _Complex) +#else +# define OPAL_DATATYPE_DOUBLE_COMPLEX_SIZE 0 +#endif + +#if HAVE_LONG_DOUBLE__COMPLEX +# define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX_SIZE sizeof(float _Complex) +#else +# define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX_SIZE 0 +#endif + +#define OPAL_DATATYPE_BOOL_SIZE sizeof(_Bool) +#if OPAL_ALIGNMENT_WCHAR != 0 +# define OPAL_DATATYPE_WCHAR_SIZE sizeof(wchar_t) +#else +# define OPAL_DATATYPE_WCHAR_SIZE 0 +#endif + +#define OPAL_DATATYPE_UNAVAILABLE_SIZE 0 + +#endif /* OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index d56ebfe6954..98208dc0f39 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -1,4 +1,4 @@ -#include "opal_datatype_cuda_internal.cuh" + #include "opal_datatype_cuda_internal.cuh" #include #include @@ -87,7 +87,6 @@ __device__ void pack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, *(SPACE) -= _copy_loops * _end_loop->size; *(COUNT) -= _copy_loops; - __syncthreads(); } __device__ void 
pack_predefined_data_cuda_kernel( dt_elem_desc_t* ELEM, @@ -118,7 +117,6 @@ __device__ void pack_predefined_data_cuda_kernel( dt_elem_desc_t* ELEM, nb_elements = _copy_blength / 8; _src_disp_tmp = (double*)_src_disp; _destination_tmp = (double*)_destination; - _source_tmp = _src_disp_tmp + tid; _destination_tmp += tid; __syncthreads(); @@ -127,8 +125,8 @@ __device__ void pack_predefined_data_cuda_kernel( dt_elem_desc_t* ELEM, _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; #if defined (OPAL_DATATYPE_CUDA_DEBUG) if (_i == 0 ) { - DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", - tid, _destination_tmp, _source_tmp, (unsigned long)_copy_blength*_copy_count, (unsigned long)(*(SPACE) - _i/nb_elements * _copy_blength), _i/nb_elements, _i ); + DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, count %d\n", + tid, _destination_tmp, _source_tmp, (unsigned long)_copy_blength*_copy_count, (unsigned long)(*(SPACE) - _i/nb_elements * _copy_blength), _i/nb_elements, _copy_count ); } // if (_i / nb_elements ==1 && tid == 0 ) { // DBGPRINT("tid %d, pack 3. 
memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", @@ -148,12 +146,52 @@ __device__ void pack_predefined_data_cuda_kernel( dt_elem_desc_t* ELEM, *(SPACE) -= _copy_blength; *(COUNT) -= _copy_count; - __syncthreads(); +} + +__device__ void pack_predefined_data_cuda_kernel_v2( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char* SOURCE, + unsigned char* DESTINATION, + size_t* SPACE, + uint32_t local_index, + uint32_t dst_offset ) +{ + uint32_t _copy_count = *(COUNT); + size_t _copy_blength; + ddt_elem_desc_t* _elem = &((ELEM)->elem); + unsigned char* _src_disp = (SOURCE) + _elem->disp; + uint32_t local_tid; + unsigned char* _destination = DESTINATION; + double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; + + _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; + // if( (_copy_count * _copy_blength) > *(SPACE) ) { + // _copy_count = (uint32_t)(*(SPACE) / _copy_blength); + // if( 0 == _copy_count ) return; /* nothing to do */ + // } + + local_tid = threadIdx.x + local_index * blockDim.x; + _src_disp_tmp = (double*)_src_disp; + _destination_tmp = (double*)_destination + dst_offset; + + if (local_tid < _copy_count) { + _source_tmp = _src_disp_tmp + local_tid; + _destination_tmp += local_tid; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (local_tid == 0 ) { + DBGPRINT("tid %d, local_index %d, pack 1. memcpy( %p, %p, %lu ) => space %lu, blockIdx %d, count %d, destination %p, offset %d\n", + local_tid, local_index, _destination_tmp, _source_tmp, (unsigned long)_copy_blength*_copy_count, (unsigned long)(*(SPACE) - local_tid * _copy_blength), blockIdx.x, _copy_count, _destination, dst_offset ); + } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + *_destination_tmp = *_source_tmp; +#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ + } } __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) { - dt_stack_t *pStack, *pStack_head; /* pointer to the position on the stack */ + dt_stack_t *pStack; /* pointer to the position on the stack */ uint32_t pos_desc; /* actual position in the description of the derived datatype */ uint32_t count_desc; /* the number of items already done in the actual pos_desc */ size_t total_packed = 0; /* total amount packed this time */ @@ -165,30 +203,26 @@ __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) uint32_t stack_pos; struct iovec* iov; - OPAL_PTRDIFF_TYPE lb; - OPAL_PTRDIFF_TYPE ub; + OPAL_PTRDIFF_TYPE extent; uint32_t out_size; - uint32_t tid; - - tid = threadIdx.x + blockIdx.x * blockDim.x; - __shared__ ddt_cuda_desc_t cuda_desc_b; + // __shared__ ddt_cuda_desc_t cuda_desc_b; + __shared__ dt_stack_t shared_pStack[DT_STATIC_STACK_SIZE]; - if (threadIdx.x == 0) { - memcpy(&cuda_desc_b, cuda_desc, sizeof(ddt_cuda_desc_t)); + if (threadIdx.x < DT_STATIC_STACK_SIZE) { + shared_pStack[threadIdx.x] = cuda_desc->pStack[threadIdx.x]; } __syncthreads(); + // load cuda descriptor from constant memory - iov = cuda_desc_b.iov; - pStack_head = cuda_desc_b.pStack; - pStack = pStack_head; - description = cuda_desc_b.description; - stack_pos = cuda_desc_b.stack_pos; - pBaseBuf = cuda_desc_b.pBaseBuf; - lb = cuda_desc_b.lb; - ub = cuda_desc_b.ub; - out_size = cuda_desc_b.out_size; + iov = cuda_desc->iov; + pStack = shared_pStack; + description = cuda_desc->description; + stack_pos = cuda_desc->stack_pos; + pBaseBuf = cuda_desc->pBaseBuf; + extent = cuda_desc->ub - cuda_desc->lb; + out_size = cuda_desc->out_size; pStack = pStack + stack_pos; pos_desc = pStack->index; @@ -209,7 +243,7 @@ __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ // 
PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - // conv_ptr, iov_ptr, iov_len_local ); + // conv_ptr, iov_ptr, iov_len_local ); pack_predefined_data_cuda_kernel(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); if( 0 == count_desc ) { /* completed */ conv_ptr = pBaseBuf + pStack->disp; @@ -244,7 +278,7 @@ __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) pos_desc = pStack->index + 1; if (threadIdx.x == 0) { if( pStack->index == -1 ) { - pStack->disp += (ub - lb); + pStack->disp += extent; } else { // assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); pStack->disp += description[pStack->index].loop.extent; @@ -290,178 +324,207 @@ __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) total_packed += iov[iov_count].iov_len; } - if (tid == 0) { - cuda_desc->max_data = total_packed; - cuda_desc->out_size = iov_count; - // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ - // if( cuda_desc->bConverted == cuda_desc->local_size ) { - // cuda_desc->stack_pos = stack_pos; - // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); - // return; - // } - // /* Save the global position for the next round */ - // PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, - // conv_ptr - pBaseBuf ); - // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); - // cuda_desc->stack_pos = stack_pos; + // if (tid == 0) { + // cuda_desc->max_data = total_packed; + // cuda_desc->out_size = iov_count; + // // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ + // // if( cuda_desc->bConverted == cuda_desc->local_size ) { + // // cuda_desc->stack_pos = stack_pos; + // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); + // // return; + // // } + // // /* Save the global position for the next round */ + // // PUSH_STACK( pStack, 
stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, + // // conv_ptr - pBaseBuf ); + // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); + // // cuda_desc->stack_pos = stack_pos; + // } + + return; +} + +__global__ void opal_generic_simple_pack_cuda_kernel_v2(ddt_cuda_desc_t* cuda_desc) +{ + dt_stack_t *pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_packed = 0; /* total amount packed this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; + size_t iov_len_local; + uint32_t iov_count; + uint32_t stack_pos; + struct iovec* iov; + ddt_cuda_description_dist_t* description_dist_d; + uint32_t ct = 0, local_index, dst_offset; + + OPAL_PTRDIFF_TYPE extent; + uint32_t out_size; + + // __shared__ ddt_cuda_desc_t cuda_desc_b; + __shared__ dt_stack_t shared_pStack[DT_STATIC_STACK_SIZE]; + + if (threadIdx.x < DT_STATIC_STACK_SIZE) { + shared_pStack[threadIdx.x] = cuda_desc->pStack[threadIdx.x]; } __syncthreads(); + + // load cuda descriptor from constant memory + iov = cuda_desc->iov; + pStack = shared_pStack; + description = cuda_desc->description; + stack_pos = cuda_desc->stack_pos; + pBaseBuf = cuda_desc->pBaseBuf; + extent = cuda_desc->ub - cuda_desc->lb; + out_size = cuda_desc->out_size; + description_dist_d = cuda_desc->description_dist; + + pStack = pStack + stack_pos; + pos_desc = description_dist_d[blockIdx.x].description_index[ct]; + local_index = description_dist_d[blockIdx.x].description_local_index[ct]; + dst_offset = description_dist_d[blockIdx.x].dst_offset[ct]; + pElem = &(description[pos_desc]); + count_desc = pElem->elem.count; + conv_ptr = pBaseBuf + pStack->disp; + pStack--; + stack_pos--; + +// printf("pack start pos_desc %d count_desc %d disp %ld, stack_pos %d pos_desc %d 
count_desc %d disp %ld\n", +// pos_desc, count_desc, (long)(conv_ptr - pBaseBuf), stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp); + + for( iov_count = 0; iov_count < out_size; iov_count++ ) { + iov_ptr = (unsigned char *) iov[iov_count].iov_base; + iov_len_local = iov[iov_count].iov_len; +// DBGPRINT("iov_len_local %lu, flags %d, types %d, count %d\n", iov_len_local, description->elem.common.flags, description->elem.common.type, description->elem.count); + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + // conv_ptr, iov_ptr, iov_len_local ); + pack_predefined_data_cuda_kernel_v2(pElem, &count_desc, conv_ptr, iov_ptr, &iov_len_local, local_index, dst_offset); + count_desc = 0; + if( 0 == count_desc ) { /* completed */ + conv_ptr = pBaseBuf + pStack->disp; + ct ++; + if (ct >= description_dist_d[blockIdx.x].description_used) { + pos_desc = cuda_desc->description_count-1; + } else { + pos_desc = description_dist_d[blockIdx.x].description_index[ct]; /* advance to the next data */ + local_index = description_dist_d[blockIdx.x].description_local_index[ct]; + dst_offset = description_dist_d[blockIdx.x].dst_offset[ct]; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (pos_desc > (cuda_desc->description_count - 1)) { + printf("EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEERROR, block %d, thread %d, pos_desc %d\n", blockIdx.x, threadIdx.x, pos_desc); + } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (pos_desc < (cuda_desc->description_count - 1) && !(pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA)) { + printf("I get a error block %d, thread %d, pos_desc %d\n", blockIdx.x, threadIdx.x, pos_desc); + } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + continue; + } + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == 
pElem->elem.common.type ) { /* end of the current loop */ + // DO_DEBUG( opal_output( 0, "pack end_loop count %d stack_pos %d" + // " pos_desc %d disp %ld space %lu\n", + // (int)pStack->count, pConvertor->stack_pos, + // pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + if (threadIdx.x == 0) { + (pStack->count)--; + } + __syncthreads(); + + if( (pStack->count) == 0 ) { /* end of loop */ + if( 0 == stack_pos ) { + /* we lie about the size of the next element in order to + * make sure we exit the main loop. + */ + out_size = iov_count; + goto complete_loop; /* completed */ + } + stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if (threadIdx.x == 0) { + if( pStack->index == -1 ) { + pStack->disp += extent; + } else { + // assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + __syncthreads(); + } + conv_ptr = pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + // DO_DEBUG( opal_output( 0, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + // (int)pStack->count, pConvertor->stack_pos, pos_desc, + // count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + pack_contiguous_loop_cuda_kernel( pElem, &count_desc, + &conv_ptr, &iov_ptr, &iov_len_local ); + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. 
*/ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + + PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + + pos_desc++; + update_loop_description: /* update the current state */ + conv_ptr = pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); + continue; + } + } + complete_loop: + if (threadIdx.x == 0) { + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + } + __syncthreads(); + total_packed += iov[iov_count].iov_len; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (ct != description_dist_d[blockIdx.x].description_used) { + printf("I am at the end, but error,ct %d\n", ct); + } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + + // if (tid == 0) { + // cuda_desc->max_data = total_packed; + // cuda_desc->out_size = iov_count; + // // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ + // // if( cuda_desc->bConverted == cuda_desc->local_size ) { + // // cuda_desc->stack_pos = stack_pos; + // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); + // // return; + // // } + // // /* Save the global position for the next round */ + // // PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, + // // conv_ptr - pBaseBuf ); + // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); + // // cuda_desc->stack_pos = stack_pos; + // } + return; } -// __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) -// { -// dt_stack_t *pStack, *pStack_head; /* pointer to the position on the stack */ -// uint32_t pos_desc; /* actual position in the description of the derived datatype */ -// uint32_t count_desc; /* the number of items already done in the actual pos_desc */ -// size_t total_packed = 0; /* total amount packed this time */ 
-// dt_elem_desc_t* description; -// dt_elem_desc_t* pElem; -// unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; -// size_t iov_len_local; -// uint32_t iov_count; -// uint32_t stack_pos; -// struct iovec* iov; -// -// OPAL_PTRDIFF_TYPE lb; -// OPAL_PTRDIFF_TYPE ub; -// uint32_t out_size; -// uint32_t tid; -// -// tid = threadIdx.x + blockIdx.x * blockDim.x; -// -// __shared__ ddt_cuda_desc_t cuda_desc_b; -// -// if (threadIdx.x == 0) { -// memcpy(&cuda_desc_b, cuda_desc, sizeof(ddt_cuda_desc_t)); -// } -// __syncthreads(); -// -// -// // load cuda descriptor from constant memory -// iov = cuda_desc_b.iov; -// pStack_head = cuda_desc_b.pStack; -// pStack = pStack_head; -// description = cuda_desc_b.description; -// stack_pos = cuda_desc_b.stack_pos; -// pBaseBuf = cuda_desc_b.pBaseBuf; -// lb = cuda_desc_b.lb; -// ub = cuda_desc_b.ub; -// out_size = cuda_desc_b.out_size; -// -// pStack = pStack + stack_pos; -// pos_desc = pStack->index; -// conv_ptr = pBaseBuf + pStack->disp; -// count_desc = (uint32_t)pStack->count; -// pStack--; -// stack_pos--; -// pElem = &(description[pos_desc]); -// -// // printf("pack start pos_desc %d count_desc %d disp %ld, stack_pos %d pos_desc %d count_desc %d disp %ld\n", -// // pos_desc, count_desc, (long)(conv_ptr - pBaseBuf), stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp); -// -// if (threadIdx.x == 0) { -// for( iov_count = 0; iov_count < out_size; iov_count++ ) { -// iov_ptr = (unsigned char *) iov[iov_count].iov_base; -// iov_len_local = iov[iov_count].iov_len; -// DBGPRINT("iov_len_local %lu, flags %d, types %d, count %d\n", iov_len_local, description->elem.common.flags, description->elem.common.type, description->elem.count); -// while( 1 ) { -// while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { -// /* now here we have a basic datatype */ -// // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, -// // conv_ptr, iov_ptr, iov_len_local ); -// if( 0 == count_desc ) { /* completed */ -// conv_ptr = 
pBaseBuf + pStack->disp; -// pos_desc++; /* advance to the next data */ -// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -// continue; -// } -// goto complete_loop; -// } -// if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ -// // DO_DEBUG( opal_output( 0, "pack end_loop count %d stack_pos %d" -// // " pos_desc %d disp %ld space %lu\n", -// // (int)pStack->count, pConvertor->stack_pos, -// // pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); -// -// if( --(pStack->count) == 0 ) { /* end of loop */ -// if( 0 == stack_pos ) { -// /* we lie about the size of the next element in order to -// * make sure we exit the main loop. -// */ -// out_size = iov_count; -// goto complete_loop; /* completed */ -// } -// stack_pos--; -// pStack--; -// pos_desc++; -// } else { -// pos_desc = pStack->index + 1; -// if( pStack->index == -1 ) { -// pStack->disp += (ub - lb); -// } else { -// // assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); -// pStack->disp += description[pStack->index].loop.extent; -// } -// -// } -// conv_ptr = pBaseBuf + pStack->disp; -// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -// // DO_DEBUG( opal_output( 0, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", -// // (int)pStack->count, pConvertor->stack_pos, pos_desc, -// // count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); -// } -// if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { -// OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; -// if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { -// // pack_contiguous_loop_cuda_kernel( pElem, &count_desc, -// // &conv_ptr, &iov_ptr, &iov_len_local ); -// count_desc = 0; -// if( 0 == count_desc ) { /* completed */ -// pos_desc += pElem->loop.items + 1; -// goto update_loop_description; -// } -// /* Save the stack with the correct last_count value. 
*/ -// } -// local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; -// -// PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, -// pStack->disp + local_disp); -// -// pos_desc++; -// update_loop_description: /* update the current state */ -// conv_ptr = pBaseBuf + pStack->disp; -// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -// // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); -// continue; -// } -// } -// complete_loop: -// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ -// total_packed += iov[iov_count].iov_len; -// } -// -// } -// __syncthreads(); -// if (tid == 0) { -// cuda_desc->max_data = total_packed; -// cuda_desc->out_size = iov_count; -// // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ -// // if( cuda_desc->bConverted == cuda_desc->local_size ) { -// // cuda_desc->stack_pos = stack_pos; -// // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); -// // return; -// // } -// // /* Save the global position for the next round */ -// // PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, -// // conv_ptr - pBaseBuf ); -// // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); -// // cuda_desc->stack_pos = stack_pos; -// } -// return; -// } - __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, @@ -479,7 +542,6 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, nb_elements = size / 8; _src_disp_tmp = (double*)source; _destination_tmp = (double*)destination; - _source_tmp = _src_disp_tmp + tid; _destination_tmp += tid; for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { @@ -499,4 +561,72 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, #endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ _destination_tmp += num_threads; } -} \ No newline at end of file +} + +// __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_description_dist_t* desc_dist_d, +// dt_elem_desc_t* desc_d, +// uint32_t required_blocks, struct iovec* iov, unsigned char* pBaseBuf) +// { +// uint32_t i; +// dt_elem_desc_t* pElem; +// unsigned char *conv_ptr, *iov_ptr; +// uint32_t local_index, dst_offset, pos_desc, count_desc; +// size_t iov_len_local; +// +// iov_ptr = (unsigned char *) iov[0].iov_base; +// iov_len_local = iov[0].iov_len; +// conv_ptr = pBaseBuf; +// for (i = 0; i < desc_dist_d[blockIdx.x].description_used; i++) { +// pos_desc = desc_dist_d[blockIdx.x].description_index[i]; +// local_index = desc_dist_d[blockIdx.x].description_local_index[i]; +// dst_offset = desc_dist_d[blockIdx.x].dst_offset[i]; +// pElem = &(desc_d[pos_desc]); +// count_desc = pElem->elem.count; +// +// // if ( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { +// pack_predefined_data_cuda_kernel_v2(pElem, &count_desc, conv_ptr, iov_ptr, &iov_len_local, local_index, dst_offset); +// // } +// } +// +// } + +__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist) +{ + uint32_t i, _copy_count; + unsigned char *src, *dst; + uint8_t alignment; + unsigned char *_source_tmp, *_destination_tmp; + + __shared__ uint32_t nb_tasks; + + if (threadIdx.x == 0) { + //printf("iov pack kernel \n"); + nb_tasks = cuda_iov_dist[blockIdx.x].nb_tasks; + } + __syncthreads(); + + for (i = 0; i < nb_tasks; i++) { + src = cuda_iov_dist[blockIdx.x].src[i]; + dst = cuda_iov_dist[blockIdx.x].dst[i]; + _copy_count = cuda_iov_dist[blockIdx.x].nb_elements[i]; + alignment = cuda_iov_dist[blockIdx.x].element_alignment[i]; + + // if (threadIdx.x == 0) { + // printf("block %d, ali %d, nb_element %d\n", blockIdx.x, cuda_iov_dist[blockIdx.x].element_alignment[i], _copy_count); + // } + + if (threadIdx.x < _copy_count) { + _source_tmp = src + 
threadIdx.x * alignment; + _destination_tmp = dst + threadIdx.x * alignment; +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + if (alignment == ALIGNMENT_DOUBLE) { + *((double *)_destination_tmp) = *((double *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((float *)_destination_tmp) = *((float *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } +#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + } + } +} diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 3b04bf025e8..f13610fc1bf 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -2,6 +2,7 @@ #include "opal_datatype_cuda.cuh" #include +#include int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, struct iovec* iov, @@ -10,10 +11,13 @@ int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, { uint32_t i; dt_elem_desc_t* description; + dt_elem_desc_t* pElem; const opal_datatype_t *pData = pConvertor->pDesc; - uint32_t tasks_per_block, num_blocks; + uint32_t tasks_per_block, num_blocks, thread_per_block; dt_stack_t* pStack; + //return -99; + description = pConvertor->use_desc->desc; cuda_desc_h->stack_pos = pConvertor->stack_pos; @@ -49,7 +53,8 @@ int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; cuda_desc_h->description_count = pConvertor->use_desc->used+1; } - cudaMemcpy(cuda_desc_h->description, description, sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1), cudaMemcpyHostToDevice); + cudaMemcpy(cuda_desc_h->description, description, sizeof(dt_elem_desc_t)*(cuda_desc_h->description_count), cudaMemcpyHostToDevice); + printf("description ct %d\n", cuda_desc_h->description_count); // for (i = 0; i < pConvertor->use_desc->used+1; i++) { // cuda_desc_h->description[i] = description[i]; @@ -66,19 +71,73 
@@ int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, cuda_desc_h->iov[i].iov_len = iov[i].iov_len; } - cudaMemcpy(cuda_desc_d, cuda_desc_h, sizeof(ddt_cuda_desc_t), cudaMemcpyHostToDevice); - pStack = pConvertor->pStack + pConvertor->stack_pos; - tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + thread_per_block = CUDA_WARP_SIZE * 5; + tasks_per_block = thread_per_block * TASK_PER_THREAD; num_blocks = ((uint32_t)pStack->count + tasks_per_block - 1) / tasks_per_block; - printf("launch kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*2*THREAD_PER_BLOCK); - opal_generic_simple_pack_cuda_kernel<<<192,4*THREAD_PER_BLOCK>>>(cuda_desc_d); + num_blocks = 512; + + /***/ + uint32_t pos_desc, count_desc, current_block, task_iteration, nb_blocks_per_description, j, dst_offset; + pos_desc = pStack->index; + pElem = &(description[pos_desc]); + count_desc = (uint32_t)pStack->count; + current_block = 0; + task_iteration = 0; + dst_offset = 0; + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + for (i = 0; i < nb_blocks_per_description; i++) { + description_dist_h[current_block].description_index[task_iteration] = pos_desc; + description_dist_h[current_block].description_local_index[task_iteration] = i; + description_dist_h[current_block].dst_offset[task_iteration] = dst_offset; + description_dist_h[current_block].description_used = task_iteration + 1; + if ( (i+1) * thread_per_block <= count_desc) { + dst_offset += thread_per_block; + } else { + dst_offset += thread_per_block - ((i+1)*thread_per_block - count_desc); + } + current_block += 1; + if (current_block >= num_blocks) { + current_block = 0; + task_iteration ++; + } + } + pos_desc ++; + pElem = &(description[pos_desc]); + count_desc = pElem->elem.count; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { + break; 
+ } + } + + // for (i = 0; i < num_blocks; i++) { + // printf("block %d\t, used %d\n", i, description_dist_h[i].description_used); + // for (j = 0; j < description_dist_h[i].description_used; j++) { + // pos_desc = description_dist_h[i].description_index[j]; + // pElem = &(description[pos_desc]); + // printf("i %d\t, descp_pos %d\t, local_index %d\t, count %d\t, dst offset %d\n", j, description_dist_h[i].description_index[j], description_dist_h[i].description_local_index[j], pElem->elem.count, description_dist_h[i].dst_offset[j]); + // } + // } + + cudaMemcpy(cuda_desc_h->description_dist, description_dist_h, sizeof(ddt_cuda_description_dist_t)*(num_blocks), cudaMemcpyHostToDevice); + /***/ + + cudaMemcpy(cuda_desc_d, cuda_desc_h, sizeof(ddt_cuda_desc_t), cudaMemcpyHostToDevice); + + printf("launch pack kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*thread_per_block); + opal_generic_simple_pack_cuda_kernel_v2<<>>(cuda_desc_d); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) size_t position = pConvertor->pDesc->size; - opal_convertor_set_position_nocheck(pConvertor, &position); +// opal_convertor_set_position_nocheck(pConvertor, &position); #endif cudaDeviceSynchronize(); + return 1; + + #if defined(OPAL_DATATYPE_CUDA_DRY_RUN) return -99; #else @@ -147,6 +206,346 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, } +// int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, +// struct iovec* iov, +// uint32_t* out_size, +// size_t* max_data ) +// { +// uint32_t i; +// uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, j, dst_offset; +// uint32_t nb_blocks, thread_per_block; +// dt_elem_desc_t* description; +// size_t length; +// +// // return -99; +// +// cuda_iov_count = 4000; +// opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); +// printf("iov count %d, length %d\n", cuda_iov_count, length); +// +// description = 
pConvertor->use_desc->desc; +// current_block = 0; +// task_iteration = 0; +// dst_offset = 0; +// thread_per_block = CUDA_WARP_SIZE * 4; +// nb_blocks = 512; +// for (i = 0; i < cuda_iov_count; i++) { +// count_desc = cuda_iov[i].iov_len / sizeof(double); +// // printf("i = %d\t, iov_base %p\t, iov_len %ld\t, count %d\n", i, cuda_iov[i].iov_base, cuda_iov[i].iov_len, count_desc); +// nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; +// for (j = 0; j < nb_blocks_per_description; j++) { +// description_dist_h[current_block].description_index[task_iteration] = i; +// description_dist_h[current_block].description_local_index[task_iteration] = j; +// description_dist_h[current_block].dst_offset[task_iteration] = dst_offset; +// description_dist_h[current_block].description_used = task_iteration + 1; +// if ( (j+1) * thread_per_block <= count_desc) { +// dst_offset += thread_per_block; +// } else { +// dst_offset += thread_per_block - ((j+1)*thread_per_block - count_desc); +// } +// current_block += 1; +// if (current_block >= nb_blocks) { +// current_block = 0; +// task_iteration ++; +// } +// } +// } +// +// uint32_t pos_desc; +// dt_elem_desc_t* pElem; +// // for (i = 0; i < nb_blocks; i++) { +// // printf("block %d\t, used %d\n", i, description_dist_h[i].description_used); +// // for (j = 0; j < description_dist_h[i].description_used; j++) { +// // pos_desc = description_dist_h[i].description_index[j]; +// // pElem = &(description[pos_desc]); +// // printf("i %d\t, descp_pos %d\t, local_index %d\t, count %d\t, dst offset %d\n", j, description_dist_h[i].description_index[j], description_dist_h[i].description_local_index[j], pElem->elem.count, description_dist_h[i].dst_offset[j]); +// // } +// // } +// +// cudaMemcpy(description_dist_d, description_dist_h, sizeof(ddt_cuda_description_dist_t)*(nb_blocks), cudaMemcpyHostToDevice); +// +// if (cuda_desc_h->description_max_count != 0) { +// if (cuda_desc_h->description_max_count >= 
(pConvertor->use_desc->used+1)) { +// cuda_desc_h->description_count = pConvertor->use_desc->used+1; +// } else { +// cudaFree(cuda_desc_h->description); +// cuda_desc_h->description = NULL; +// cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); +// description_d = cuda_desc_h->description; +// cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; +// cuda_desc_h->description_count = pConvertor->use_desc->used+1; +// } +// +// } else { +// cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); +// description_d = cuda_desc_h->description; +// cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; +// cuda_desc_h->description_count = pConvertor->use_desc->used+1; +// } +// cudaMemcpy(description_d, description, sizeof(dt_elem_desc_t)*(cuda_desc_h->description_count), cudaMemcpyHostToDevice); +// +// unsigned char* pBaseBuf; +// #if defined(OPAL_DATATYPE_CUDA_DRY_RUN) +// pBaseBuf = pConvertor->pBaseBuf; +// #else +// pBaseBuf = pBaseBuf_GPU; +// #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ +// +// for (i = 0; i < *out_size; i++) { +// #if defined (OPAL_DATATYPE_CUDA_DRY_RUN) +// cuda_desc_h->iov[i].iov_base = iov[i].iov_base; +// #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ +// cuda_desc_h->iov[i].iov_len = iov[i].iov_len; +// } +// +// opal_generic_simple_pack_cuda_iov_kernel<<>>(description_dist_d, description_d, current_block, cuda_desc_h->iov, pBaseBuf); +// cudaDeviceSynchronize(); +// +// return 1; +// } + +int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i, j; + uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, residue_desc; + uint32_t nb_blocks, thread_per_block; + size_t length, buffer_size, length_per_iovec, dst_offset; + unsigned char *destination; + size_t total_packed, total_converted; + int32_t 
complete_flag = 0; + uint8_t buffer_isfull = 0; + uint32_t convertor_flags; + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + dt_stack_t* pStack; + uint8_t alignment, orig_alignment; + + ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + + DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype packing using iovec\n"); ); + +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + destination = (unsigned char*)iov[0].iov_base; +#else +// pConvertor->pBaseBuf = pBaseBuf_GPU; + // printf("Pack GPU base %p, iov_buffer %p\n", pConvertor->pBaseBuf, iov[0].iov_base); + destination = ddt_cuda_pack_buffer; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pElem = &(description[pStack->index]); + printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypesSize[pElem->elem.common.type]); + + printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + buffer_size = iov[0].iov_len; + cuda_iov_count = 1000; + total_packed = 0; + total_converted = pConvertor->bConverted; + cuda_streams->current_stream_id = 0; + convertor_flags = pConvertor->flags; + complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); + DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); +#endif + + dst_offset = 0; + thread_per_block = CUDA_WARP_SIZE * 4; + 
nb_blocks = 256; + + while (cuda_iov_count > 0) { + // void* temp_addr; + // size_t temp_size; + // for (i = 1; i < cuda_iov_count/2; i+=2) { + // temp_addr = cuda_iov[i].iov_base; + // temp_size = cuda_iov[i].iov_len; + // cuda_iov[i].iov_base = cuda_iov[cuda_iov_count-i].iov_base; + // cuda_iov[i].iov_len = cuda_iov[cuda_iov_count-i].iov_len; + // cuda_iov[cuda_iov_count-i].iov_base = temp_addr; + // cuda_iov[cuda_iov_count-i].iov_len = temp_size; + // // printf("swap %d, %d, len %d %d\n", i, cuda_iov_count-i, cuda_iov[i].iov_len, cuda_iov[cuda_iov_count-i].iov_len); + // } + + current_block = 0; + task_iteration = 0; + cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; + cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + for (i = 0; i < nb_blocks; i++) { + cuda_iov_dist_h_current[i].nb_tasks = 0; + } + + for (i = 0; i < cuda_iov_count; i++) { + pElem = &(description[pStack->index+i]); + if (buffer_size >= cuda_iov[i].iov_len) { + length_per_iovec = cuda_iov[i].iov_len; + } else { + orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + } + buffer_size -= length_per_iovec; + total_packed += length_per_iovec; + + /* check alignment */ + if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)destination % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_FLOAT == 0 && (uintptr_t)destination % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + + // alignment = ALIGNMENT_CHAR; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(2, "description 
%d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[current_block].src[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; + cuda_iov_dist_h_current[current_block].dst[task_iteration] = destination; + cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = alignment; + cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = thread_per_block;// * sizeof(double); + } else { + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = count_desc - j*thread_per_block; //(thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); + } + destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; + DT_CUDA_DEBUG( opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + current_block += 1; + if (current_block >= nb_blocks) { + current_block = 0; + task_iteration ++; + assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + + /* handle residue */ + if (residue_desc != 0) { + orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + cuda_iov_dist_h_current[current_block].src[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[current_block].dst[task_iteration] = destination; + cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = 
orig_alignment; + cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; + DT_CUDA_DEBUG( opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + current_block += 1; + if (current_block >= nb_blocks) { + current_block = 0; + task_iteration ++; + assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + + if (buffer_isfull) { + break; + } + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: iov is prepared in %ld microsec, cudaMemcpy will be submit to CUDA stream %d\n", total_time, cuda_streams->current_stream_id); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); + + for (i = 0; i < *out_size; i++) { +#if defined (OPAL_DATATYPE_CUDA_DRY_RUN) + cuda_desc_h->iov[i].iov_base = iov[i].iov_base; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + cuda_desc_h->iov[i].iov_len = iov[i].iov_len; + } + + opal_generic_simple_pack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current); + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + + /* buffer is full */ + if (buffer_isfull) { + pConvertor->flags = convertor_flags; + total_converted += total_packed; + 
opal_convertor_set_position_nocheck(pConvertor, &total_converted); + break; + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + convertor_flags = pConvertor->flags; + complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); + DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); +#endif + } + + + cudaDeviceSynchronize(); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + cudaMemcpy(iov[0].iov_base, ddt_cuda_pack_buffer, total_packed, cudaMemcpyDeviceToHost); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: DtoH memcpy in %ld microsec\n", total_time ); +#endif + // float *vtmp = (float *)iov[0].iov_base; + // DT_CUDA_DEBUG ( opal_cuda_output(0, "packed iov buffer, total packed %d\n", total_packed); ); + // for (uint32_t i = 0; i < total_packed/sizeof(float); i++) { + // printf(" %1.f ", *vtmp); + // vtmp ++; + // } + // printf("\n"); + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + DT_CUDA_DEBUG ( opal_cuda_output(0, "total packed %d\n", total_packed); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + printf( "[Timing]: total packing in %ld microsec\n", total_time ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + return 1; + } + return 0; +} + + void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, @@ -157,7 +556,7 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, size_t _copy_blength; ddt_elem_desc_t* _elem = 
&((ELEM)->elem); unsigned char* _source = (*SOURCE) + _elem->disp; - uint32_t num_blocks, tasks_per_block; + uint32_t nb_blocks, tasks_per_block, thread_per_block; unsigned char* _destination = *(DESTINATION); _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; @@ -167,17 +566,26 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, } #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) - _source = pBaseBuf_GPU; + _source = pBaseBuf_GPU + _elem->disp; _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; #endif - tasks_per_block = THREAD_PER_BLOCK*4; - num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE) { + thread_per_block = CUDA_WARP_SIZE; + } else if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE * 2) { + thread_per_block = CUDA_WARP_SIZE * 2; + } else if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE * 3) { + thread_per_block = CUDA_WARP_SIZE * 3; + } else { + thread_per_block = CUDA_WARP_SIZE * 4; + } + tasks_per_block = thread_per_block * TASK_PER_THREAD; + nb_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - DBGPRINT("num_blocks %d, thread %d\n", num_blocks, tasks_per_block); + DBGPRINT("num_blocks %d, thread %d\n", nb_blocks, tasks_per_block); DBGPRINT( "GPU pack 1. 
memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); - pack_contiguous_loop_cuda_kernel_global<<<1, THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); + pack_contiguous_loop_cuda_kernel_global<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); cuda_streams->current_stream_id ++; cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; @@ -189,7 +597,6 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_count; #endif - pBaseBuf_GPU += _elem->extent*_copy_count; cuda_desc_h->iov[0].iov_base = (unsigned char*)cuda_desc_h->iov[0].iov_base + _copy_blength; // cudaDeviceSynchronize(); } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index f59b2bb0e00..0ae85e22eef 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -61,7 +61,7 @@ __device__ void unpack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, __global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) { - dt_stack_t* pStack, *pStack_head; /* pointer to the position on the stack */ + dt_stack_t* pStack; /* pointer to the position on the stack */ uint32_t pos_desc; /* actual position in the description of the derived datatype */ uint32_t count_desc; /* the number of items already done in the actual pos_desc */ size_t total_unpacked = 0; /* total size unpacked this time */ @@ -80,23 +80,23 @@ __global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_des tid = threadIdx.x + blockIdx.x * blockDim.x; - __shared__ ddt_cuda_desc_t cuda_desc_b; - - if (threadIdx.x == 0) { - memcpy(&cuda_desc_b, cuda_desc, sizeof(ddt_cuda_desc_t)); + // 
__shared__ ddt_cuda_desc_t cuda_desc_b; + __shared__ dt_stack_t shared_pStack[DT_STATIC_STACK_SIZE]; + + if (threadIdx.x < DT_STATIC_STACK_SIZE) { + shared_pStack[threadIdx.x] = cuda_desc->pStack[threadIdx.x]; } __syncthreads(); // load cuda descriptor from constant memory - iov = cuda_desc_b.iov; - pStack_head = cuda_desc_b.pStack; - pStack = pStack_head; - description = cuda_desc_b.description; - stack_pos = cuda_desc_b.stack_pos; - pBaseBuf = cuda_desc_b.pBaseBuf; - lb = cuda_desc_b.lb; - ub = cuda_desc_b.ub; - out_size = cuda_desc_b.out_size; + iov = cuda_desc->iov; + pStack = shared_pStack; + description = cuda_desc->description; + stack_pos = cuda_desc->stack_pos; + pBaseBuf = cuda_desc->pBaseBuf; + lb = cuda_desc->lb; + ub = cuda_desc->ub; + out_size = cuda_desc->out_size; /* For the first step we have to add both displacement to the source. After in the * main while loop we will set back the source_base to the correct value. This is @@ -248,6 +248,43 @@ __global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_des } } + +__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist) +{ + uint32_t i, _copy_count; + unsigned char *src, *dst; + uint8_t alignment; + unsigned char *_source_tmp, *_destination_tmp; + + __shared__ uint32_t nb_tasks; + + if (threadIdx.x == 0) { + nb_tasks = cuda_iov_dist[blockIdx.x].nb_tasks; + } + __syncthreads(); + + for (i = 0; i < nb_tasks; i++) { + src = cuda_iov_dist[blockIdx.x].src[i]; + dst = cuda_iov_dist[blockIdx.x].dst[i]; + _copy_count = cuda_iov_dist[blockIdx.x].nb_elements[i]; + alignment = cuda_iov_dist[blockIdx.x].element_alignment[i]; + + if (threadIdx.x < _copy_count) { + _source_tmp = src + threadIdx.x * alignment; + _destination_tmp = dst + threadIdx.x * alignment; +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + if (alignment == ALIGNMENT_DOUBLE) { + *((double *)_destination_tmp) = *((double *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((float 
*)_destination_tmp) = *((float *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } + // printf("src %p, %1.f | dst %p, %1.f\n", _source_tmp, *_source_tmp, _destination_tmp, *_destination_tmp); +#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + } + } +} __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, @@ -285,4 +322,4 @@ __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, #endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ _source_tmp += num_threads; } -} \ No newline at end of file +} diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 7181f3cd362..88a66de5f02 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -2,6 +2,7 @@ #include "opal_datatype_cuda.cuh" #include +#include int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, struct iovec* iov, @@ -11,9 +12,10 @@ int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, uint32_t i; dt_elem_desc_t* description; const opal_datatype_t *pData = pConvertor->pDesc; - uint32_t tasks_per_block, num_blocks; + uint32_t tasks_per_block, num_blocks, thread_per_block; dt_stack_t* pStack; + return -99; description = pConvertor->use_desc->desc; cuda_desc_h->stack_pos = pConvertor->stack_pos; @@ -33,9 +35,23 @@ int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, for (i = 0; i < pConvertor->stack_size; i++) { cuda_desc_h->pStack[i] = pConvertor->pStack[i]; } - for (i = 0; i < pConvertor->use_desc->used+1; i++) { - cuda_desc_h->description[i] = description[i]; + if (cuda_desc_h->description_max_count != 0) { + if (cuda_desc_h->description_max_count >= (pConvertor->use_desc->used+1)) { + cuda_desc_h->description_count = pConvertor->use_desc->used+1; + } else { + cudaFree(cuda_desc_h->description); + 
cuda_desc_h->description = NULL; + cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); + cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; + cuda_desc_h->description_count = pConvertor->use_desc->used+1; + } + + } else { + cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); + cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; + cuda_desc_h->description_count = pConvertor->use_desc->used+1; } + cudaMemcpy(cuda_desc_h->description, description, sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1), cudaMemcpyHostToDevice); DBGPRINT("stack_size %d\n", pConvertor->stack_size); @@ -51,10 +67,11 @@ int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, cudaMemcpy(cuda_desc_d, cuda_desc_h, sizeof(ddt_cuda_desc_t), cudaMemcpyHostToDevice); pStack = pConvertor->pStack + pConvertor->stack_pos; - tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + thread_per_block = CUDA_WARP_SIZE * 3; + tasks_per_block = thread_per_block * TASK_PER_THREAD; num_blocks = ((uint32_t)pStack->count + tasks_per_block - 1) / tasks_per_block; - printf("launch kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*4*THREAD_PER_BLOCK); - opal_generic_simple_unpack_cuda_kernel<<<2*num_blocks,2*THREAD_PER_BLOCK>>>(cuda_desc_d); + printf("launch unpack kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*thread_per_block); + opal_generic_simple_unpack_cuda_kernel<<<192, thread_per_block>>>(cuda_desc_d); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) size_t position = pConvertor->pDesc->size; opal_convertor_set_position_nocheck(pConvertor, &position); @@ -90,6 +107,227 @@ int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ } +int32_t opal_generic_simple_unpack_function_cuda_iov( 
opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i, j; + uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, dst_offset, residue_desc; + uint32_t nb_blocks, thread_per_block; + size_t length, buffer_size, length_per_iovec; + unsigned char *source; + size_t total_unpacked, total_converted; + int32_t complete_flag = 0; + uint8_t buffer_isfull = 0; + uint32_t convertor_flags; + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + dt_stack_t* pStack; + uint8_t alignment, orig_alignment; + + ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + + description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pElem = &(description[pStack->index]); + printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypesSize[pElem->elem.common.type]); + + DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype UNpacking using iovec\n"); ); + +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + source = (unsigned char*)iov[0].iov_base; +#else +// pConvertor->pBaseBuf = pBaseBuf_GPU; + // printf("Unpack GPU base %p, iov buffer %p\n", pConvertor->pBaseBuf, iov[0].iov_base); + source = ddt_cuda_unpack_buffer; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + + // double *vtmp = (double *)iov[0].iov_base; + printf("recevied unpacked iov buffer, len %d\n", iov[0].iov_len); + // for (uint32_t i = 0; i < iov[0].iov_len/sizeof(double); i++) { + // printf(" %1.f ", *vtmp); + // vtmp ++; + // } + // printf("\n"); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = 
ELAPSED_TIME( start, end ); + printf( "[Timing]: HtoD memcpy in %ld microsec\n", total_time ); +#endif + + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + buffer_size = iov[0].iov_len; + cuda_iov_count = 1000; + total_unpacked = 0; + total_converted = pConvertor->bConverted; + cuda_streams->current_stream_id = 0; + convertor_flags = pConvertor->flags; + complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); + DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); +#endif + + dst_offset = 0; + thread_per_block = CUDA_WARP_SIZE * 4; + nb_blocks = 256; + + while (cuda_iov_count > 0) { + + current_block = 0; + task_iteration = 0; + cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; + cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + for (i = 0; i < nb_blocks; i++) { + cuda_iov_dist_h_current[i].nb_tasks = 0; + } + + for (i = 0; i < cuda_iov_count; i++) { + if (buffer_size >= cuda_iov[i].iov_len) { + length_per_iovec = cuda_iov[i].iov_len; + } else { + orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + } + buffer_size -= length_per_iovec; + total_unpacked += length_per_iovec; + + /* check alignment */ + if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)source % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_FLOAT == 0 && (uintptr_t)source % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; 
+ } else { + alignment = ALIGNMENT_CHAR; + } + + // alignment = ALIGNMENT_CHAR; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(2, "description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[current_block].dst[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; + cuda_iov_dist_h_current[current_block].src[task_iteration] = source; + cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = alignment; + cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = thread_per_block;// * sizeof(double); + } else { + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); + } + source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; + DT_CUDA_DEBUG( opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + current_block += 1; + if (current_block >= nb_blocks) { + current_block = 0; + task_iteration ++; + assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + + /* handle residue */ + if (residue_desc != 0) { + orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + cuda_iov_dist_h_current[current_block].dst[task_iteration] = (unsigned char 
*)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[current_block].src[task_iteration] = source; + cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; + cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; + DT_CUDA_DEBUG( opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + current_block += 1; + if (current_block >= nb_blocks) { + current_block = 0; + task_iteration ++; + assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + + if (buffer_isfull) { + break; + } + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: iov is prepared in %ld microsec, cudaMemcpy will be submit to CUDA stream %d\n", total_time, cuda_streams->current_stream_id); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); + opal_generic_simple_unpack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current); + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + + /* buffer is full */ + if (buffer_isfull) { + pConvertor->flags = convertor_flags; + total_converted += total_unpacked; + 
opal_convertor_set_position_nocheck(pConvertor, &total_converted); + break; + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + convertor_flags = pConvertor->flags; + complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); + DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); +#endif + + } + cudaDeviceSynchronize(); + + iov[0].iov_len = total_unpacked; + *max_data = total_unpacked; + *out_size = 1; + + DT_CUDA_DEBUG ( opal_cuda_output(0, "total unpacked %d\n", total_unpacked); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + printf( "[Timing]: total unpacking in %ld microsec\n", total_time ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + return 1; + } + return 0; +} + void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, @@ -120,4 +358,4 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; cudaDeviceSynchronize(); -} \ No newline at end of file +} diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 1c10efd1aa8..f0d6dbb10e3 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -39,6 +39,7 @@ #include "opal/datatype/opal_convertor_internal.h" #if OPAL_CUDA_SUPPORT #include "opal/datatype/opal_datatype_cuda.h" +#include "opal/datatype/opal_datatype_gpu.h" #define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \ CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) #endif @@ -558,6 +559,11 @@ int32_t opal_convertor_prepare_for_recv( 
opal_convertor_t* convertor, convertor->flags |= CONVERTOR_RECV; #if OPAL_CUDA_SUPPORT mca_cuda_convertor_init(convertor, pUserBuf); +#if defined (OPAL_DATATYPE_CUDA) + if (opal_datatype_gpu_init() != OPAL_SUCCESS) { + opal_datatype_gpu_fini(); + } +#endif /* defined OPAL_DATATYPE_CUDA */ #endif OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -582,7 +588,11 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { convertor->fAdvance = opal_unpack_homogeneous_contig; } else { - convertor->fAdvance = opal_generic_simple_unpack; + if (convertor->flags & CONVERTOR_CUDA ) { + convertor->fAdvance = opal_generic_simple_unpack_cuda; + } else { + convertor->fAdvance = opal_generic_simple_unpack; + } } } return OPAL_SUCCESS; @@ -597,6 +607,11 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, convertor->flags |= CONVERTOR_SEND; #if OPAL_CUDA_SUPPORT mca_cuda_convertor_init(convertor, pUserBuf); +#if defined (OPAL_DATATYPE_CUDA) + if (opal_datatype_gpu_init() != OPAL_SUCCESS) { + opal_datatype_gpu_fini(); + } +#endif /* defined OPAL_DATATYPE_CUDA */ #endif OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -619,7 +634,11 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, else convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps; } else { - convertor->fAdvance = opal_generic_simple_pack; + if (convertor->flags & CONVERTOR_CUDA ) { + convertor->fAdvance = opal_generic_simple_pack_cuda; + } else { + convertor->fAdvance = opal_generic_simple_pack; + } } } return OPAL_SUCCESS; diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 71b60e60801..caaab68208d 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -180,6 +180,7 @@ static void opal_cuda_support_init(void) } initialized = true; + } /** diff --git a/opal/datatype/opal_datatype_gpu.c 
b/opal/datatype/opal_datatype_gpu.c index e77a4f77325..787e86e4f4c 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -52,6 +52,16 @@ int32_t (*opal_generic_simple_unpack_function_cuda_p)( opal_convertor_t* pConver struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; + +int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) = NULL; + +int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) = NULL; void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -114,6 +124,20 @@ int32_t opal_datatype_gpu_init(void) return OPAL_ERROR; } + *(void **)(&opal_generic_simple_pack_function_cuda_iov_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_pack_function_cuda_iov"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_generic_simple_pack_function_cuda_iov error: %s\n", error); + opal_generic_simple_pack_function_cuda_iov_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_generic_simple_unpack_function_cuda_iov_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_unpack_function_cuda_iov"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_generic_simple_unpack_function_cuda_iov error: %s\n", error); + opal_generic_simple_unpack_function_cuda_iov_p = NULL; + return OPAL_ERROR; + } + *(void **)(&pack_contiguous_loop_cuda_p) = dlsym(opal_datatype_cuda_handle, "pack_contiguous_loop_cuda"); if ((error = dlerror()) != NULL) { fprintf(stderr, "pack_contiguous_loop_cuda error: %s\n", error); @@ -157,6 +181,8 @@ int32_t opal_datatype_gpu_fini(void) opal_datatype_cuda_fini_p = NULL; opal_generic_simple_pack_function_cuda_p = NULL; opal_generic_simple_unpack_function_cuda_p = NULL; + opal_generic_simple_pack_function_cuda_iov_p = NULL; + opal_generic_simple_unpack_function_cuda_iov_p 
= NULL; pack_contiguous_loop_cuda_p = NULL; unpack_contiguous_loop_cuda_p = NULL; pack_predefined_data_cuda_p = NULL; diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h index 385d7cdb73c..b8dc828a0df 100644 --- a/opal/datatype/opal_datatype_gpu.h +++ b/opal/datatype/opal_datatype_gpu.h @@ -1,6 +1,8 @@ #ifndef OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED #define OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED +#define OPAL_DATATYPE_CUDA_IOV + int32_t opal_datatype_gpu_init(void); int32_t opal_datatype_gpu_fini(void); @@ -18,6 +20,16 @@ extern int32_t (*opal_generic_simple_unpack_function_cuda_p)( opal_convertor_t* uint32_t* out_size, size_t* max_data ); +extern int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +extern int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + extern void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, @@ -25,10 +37,10 @@ extern void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, size_t* SPACE ); extern void (*unpack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ); + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); extern void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 520105d8de9..307eb001085 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -226,12 +226,6 @@ int32_t opal_datatype_init( void ) datatype->desc.desc[1].end_loop.first_elem_disp = datatype->desc.desc[0].elem.disp; datatype->desc.desc[1].end_loop.size = datatype->size; } - -#if defined (OPAL_DATATYPE_CUDA) 
- if (opal_datatype_gpu_init() != OPAL_SUCCESS) { - opal_datatype_gpu_fini(); - } -#endif /* defined OPAL_DATATYPE_CUDA */ return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 9dc0666eb4e..dbfc1cec12d 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -43,10 +43,12 @@ #define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig_checksum #define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps_checksum #define opal_generic_simple_pack_function opal_generic_simple_pack_checksum +#define opal_generic_simple_pack_cuda_function opal_generic_simple_pack_cuda_checksum #else #define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig #define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps #define opal_generic_simple_pack_function opal_generic_simple_pack +#define opal_generic_simple_pack_cuda_function opal_generic_simple_pack_cuda #endif /* defined(CHECKSUM) */ @@ -288,13 +290,7 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, (void*)pConvertor, (void*)pConvertor->pBaseBuf, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); - if (opal_generic_simple_pack_function_cuda_p != NULL) { - int32_t rvalue = (*opal_generic_simple_pack_function_cuda_p)( pConvertor, iov, out_size, max_data); - if (rvalue != -99) { /* -99 is DRY RUN, to verify the result with CPU packing*/ - return rvalue; - } - } - + printf("I am in simple pack, max_data %lu, iov_len %lu\n", *max_data, iov[0].iov_len); description = pConvertor->use_desc->desc; /* For the first step we have to add both displacement to the source. 
After in the @@ -320,9 +316,9 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ - (*pack_predefined_data_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); - // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - // conv_ptr, iov_ptr, iov_len_local ); +// (*pack_predefined_data_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + conv_ptr, iov_ptr, iov_len_local ); if( 0 == count_desc ) { /* completed */ conv_ptr = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ @@ -365,9 +361,9 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - (*pack_contiguous_loop_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); - //PACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, - // conv_ptr, iov_ptr, iov_len_local ); + //(*pack_contiguous_loop_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + PACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, + conv_ptr, iov_ptr, iov_len_local ); if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -389,12 +385,18 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_packed += iov[iov_count].iov_len; } - (*opal_cuda_sync_device_p)(); *max_data = total_packed; pConvertor->bConverted += total_packed; /* update the already converted bytes */ *out_size = iov_count; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; + printf("total packed %lu\n", pConvertor->bConverted); + 
// double *vtmp = (double *)iov[0].iov_base; + // for (uint32_t i = 0; i < total_packed/8; i++) { + // printf(" %1.f ", *vtmp); + // vtmp ++; + // } + // printf("\n"); return 1; } /* Save the global position for the next round */ @@ -404,3 +406,17 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } + +int32_t +opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ) +{ +#if defined (OPAL_DATATYPE_CUDA_IOV) + if (opal_generic_simple_pack_function_cuda_iov_p != NULL) { + return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); + + } +#endif + return 0; +} diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h index b011f434472..c02ecf86ec5 100644 --- a/opal/datatype/opal_datatype_pack.h +++ b/opal/datatype/opal_datatype_pack.h @@ -51,8 +51,6 @@ static inline void pack_predefined_data( opal_convertor_t* CONVERTOR, (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu\n", *(DESTINATION), _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); ); - printf("pack 1. 
memcpy( %p, %p, %lu ) => space %lu\n", - *(DESTINATION), _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); MEMCPY_CSUM( *(DESTINATION), _source, _copy_blength, (CONVERTOR) ); _source += _copy_blength; *(DESTINATION) += _copy_blength; diff --git a/opal/datatype/opal_datatype_prototypes.h b/opal/datatype/opal_datatype_prototypes.h index bcfb59b9b31..0f9099f552f 100644 --- a/opal/datatype/opal_datatype_prototypes.h +++ b/opal/datatype/opal_datatype_prototypes.h @@ -60,6 +60,14 @@ opal_generic_simple_pack_checksum( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t +opal_generic_simple_pack_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); +int32_t +opal_generic_simple_pack_cuda_checksum( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); +int32_t opal_unpack_homogeneous_contig( opal_convertor_t* pConv, struct iovec* iov, uint32_t* out_size, size_t* max_data ); @@ -75,6 +83,14 @@ int32_t opal_generic_simple_unpack_checksum( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +int32_t +opal_generic_simple_unpack_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); +int32_t +opal_generic_simple_unpack_cuda_checksum( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); END_C_DECLS diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index f2c57593bcc..b569b40cd81 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -45,10 +45,12 @@ #define opal_unpack_general_function opal_unpack_general_checksum #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_checksum #define opal_generic_simple_unpack_function opal_generic_simple_unpack_checksum +#define opal_generic_simple_unpack_cuda_function 
opal_generic_simple_unpack_cuda_checksum #else #define opal_unpack_general_function opal_unpack_general #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig #define opal_generic_simple_unpack_function opal_generic_simple_unpack +#define opal_generic_simple_unpack_cuda_function opal_generic_simple_unpack_cuda #endif /* defined(CHECKSUM) */ @@ -273,15 +275,9 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, size_t iov_len_local; uint32_t iov_count; + printf("i am in simple unpack, max_data %lu, iov len %lu\n", *max_data, iov[0].iov_len); DO_DEBUG( opal_output( 0, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u )\n", - (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); - -// if (opal_generic_simple_unpack_function_cuda_p != NULL) { -// int32_t rvalue = (*opal_generic_simple_unpack_function_cuda_p)( pConvertor, iov, out_size, max_data); -// if (rvalue != -99) { /* -99 is DRY RUN, to verify the result with CPU packing*/ -// return rvalue; -// } -// } + (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ) description = pConvertor->use_desc->desc; @@ -387,9 +383,9 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - // UNPACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, - // iov_ptr, conv_ptr, iov_len_local ); - (*unpack_contiguous_loop_cuda_p)(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + UNPACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, + iov_ptr, conv_ptr, iov_len_local ); + // (*unpack_contiguous_loop_cuda_p)(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -417,6 +413,13 @@ opal_generic_simple_unpack_function( 
opal_convertor_t* pConvertor, *out_size = iov_count; if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; + printf("total unpacked %lu\n", pConvertor->bConverted); + // double *vtmp = (double *)iov[0].iov_base; + // for (uint32_t i = 0; i < total_unpacked/8; i++) { + // printf(" %1.f ", *vtmp); + // vtmp ++; + // } + // printf("\n"); return 1; } /* Save the global position for the next round */ @@ -590,3 +593,17 @@ opal_unpack_general_function( opal_convertor_t* pConvertor, pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } + +int32_t +opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ) +{ +#if defined (OPAL_DATATYPE_CUDA_IOV) + if (opal_generic_simple_unpack_function_cuda_iov_p != NULL) { + return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); + + } +#endif + return 0; +} diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index 37b89c96be7..9b070ed1357 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -14,7 +14,7 @@ # if PROJECT_OMPI - MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw unpack_ooo ddt_pack + MPI_TESTS = checksum position position_noncontig ddt_test ddt_test_old ddt_raw unpack_ooo ddt_pack MPI_CHECKS = to_self endif TESTS = opal_datatype_test $(MPI_TESTS) @@ -28,10 +28,13 @@ unpack_ooo_LDADD = \ $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la ddt_test_SOURCES = ddt_test.c ddt_lib.c ddt_lib.h -ddt_test_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) -ddt_test_LDADD = \ - $(top_builddir)/ompi/libmpi.la \ - $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la +ddt_test_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) +ddt_test_CFLAGS = -I/mnt/scratch/cuda-6.5.14/include -g +ddt_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la 
-L/mnt/scratch/cuda-6.5.14/lib64 -lcudart + +ddt_test_old_SOURCES = ddt_test_old.c ddt_lib.c ddt_lib.h +ddt_test_old_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) +ddt_test_old_LDADD = $(top_builddir)/ompi/libmpi.la ddt_raw_SOURCES = ddt_raw.c ddt_lib.c ddt_lib.h ddt_raw_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) diff --git a/test/datatype/ddt_lib.c b/test/datatype/ddt_lib.c index 9170da0914a..321a5c4be88 100644 --- a/test/datatype/ddt_lib.c +++ b/test/datatype/ddt_lib.c @@ -358,14 +358,20 @@ ompi_datatype_t* upper_matrix( unsigned int mat_size ) disp = (int*)malloc( sizeof(int) * mat_size ); blocklen = (int*)malloc( sizeof(int) * mat_size ); - + for( i = 0; i < mat_size; i++ ) { disp[i] = i * mat_size + i; blocklen[i] = mat_size - i; } - +#if defined (TEST_DOUBLE) ompi_datatype_create_indexed( mat_size, blocklen, disp, &ompi_mpi_double.dt, &upper ); +#elif defined (TEST_FLOAT) + ompi_datatype_create_indexed( mat_size, blocklen, disp, &ompi_mpi_float.dt, &upper ); +#elif defined (TEST_CHAR) + ompi_datatype_create_indexed( mat_size, blocklen, disp, &ompi_mpi_char.dt, &upper ); +#else +#endif ompi_datatype_commit( &upper ); if( outputFlags & DUMP_DATA_AFTER_COMMIT ) { ompi_datatype_dump( upper ); @@ -686,3 +692,26 @@ ompi_datatype_t* create_vector_type( const ompi_datatype_t* data, int count, int return vector; } +ompi_datatype_t* create_struct_type(int count) +{ + ompi_datatype_t* dt_struct; + ompi_datatype_t* dt_struct_vector; + ompi_datatype_t* oldtypes[2]; + MPI_Aint offsets[2], extent, lb; + int blockcounts[2]; + + offsets[0] = 0; + oldtypes[0] = MPI_FLOAT; + blockcounts[0] = 4; + + ompi_datatype_get_extent(MPI_FLOAT, &lb, &extent); + offsets[1] = 4 * extent; + oldtypes[1] = MPI_DOUBLE; + blockcounts[1] = 2; + + ompi_datatype_create_struct( 2, blockcounts, offsets, oldtypes, &dt_struct ); + dt_struct_vector = create_vector_type( dt_struct, 10, 2, 4 ); + ompi_datatype_commit( &dt_struct_vector ); + return dt_struct_vector; +} + diff --git a/test/datatype/ddt_lib.h 
b/test/datatype/ddt_lib.h index d94690047a7..539434f9525 100644 --- a/test/datatype/ddt_lib.h +++ b/test/datatype/ddt_lib.h @@ -34,6 +34,11 @@ #define DUMP_DATA_AFTER_COMMIT 0x00000001 #define CHECK_PACK_UNPACK 0x00000002 +#define TEST_DOUBLE +//#define TEST_FLOAT +//#define TEST_CHAR + + extern uint32_t outputFlags; /** @@ -91,5 +96,5 @@ extern ompi_datatype_t* create_strange_dt( void ); extern ompi_datatype_t* create_contiguous_type( const ompi_datatype_t* data, int count ); extern ompi_datatype_t* create_vector_type( const ompi_datatype_t* data, int count, int length, int stride ); -extern ompi_datatype_t* create_struct_constant_gap_resized_ddt( ompi_datatype_t* type ); +extern ompi_datatype_t* create_struct_type(int count); diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index 12b4b31fc15..e5f58a5b348 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -30,6 +30,14 @@ #include #include +#define DDT_TEST_CUDA + +#if defined (DDT_TEST_CUDA) +#include +#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/runtime/opal_params.h" +#endif + /* Compile with: mpicc -DHAVE_CONFIG_H -I. -I../../include -I../../../ompi-trunk/include -I../.. 
-I../../include -I../../../ompi-trunk/opal -I../../../ompi-trunk/orte -I../../../ompi-trunk/ompi -g ddt_test.c -o ddt_test */ @@ -171,12 +179,64 @@ static int local_copy_ddt_count( ompi_datatype_t* pdt, int count ) return OMPI_SUCCESS; } +static void fill_vectors(double* vp, int itera, int contig, int gap) +{ + int i, j; + for (i = 0; i < itera-1; i++ ){ + for (j = i*gap; j < (i+1)*gap; j++) { + if (j >= i*gap && j < i*gap+contig) { + vp[j] = 1.0; + } else { + vp[j] = 0.0; + } + } + } + for (i = (itera-1)*gap; i < (itera-1)*gap+contig; i++) { + vp[i] = 1.0; + } + + // printf("vector generated:\n"); + // for (i = 0; i < (itera-1)*gap+contig; i++) { + // printf("%1.f ", vp[i]); + // } + // printf("\n"); +} + +static void verify_vectors(double *vp, int itera, int contig, int gap) +{ + int i, j; + int error = 0; + for (i = 0; i < itera-1; i++) { + for (j = i*gap; j < (i+1)*gap; j++) { + if (j >= i*gap && j < i*gap+contig) { + if (vp[j] != 1.0) { + error ++; + } + } + } + } + for (i = (itera-1)*gap; i < (itera-1)*gap+contig; i++) { + if (vp[i] != 1.0) { + error ++; + } + } + // printf("vector received:\n"); + // for (i = 0; i < (itera-1)*gap+contig; i++) { + // printf("%1.f ", vp[i]); + // } + if (error != 0) { + printf("%d error is found\n", error); + } else { + printf("no error is found\n"); + } +} + static int local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count, ompi_datatype_t* recv_type, int recv_count, - int chunk ) + int chunk, int itera, int contig, int gap ) { - void *pdst = NULL, *psrc = NULL, *ptemp = NULL; + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; struct iovec iov; uint32_t iov_count; @@ -188,6 +248,40 @@ local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count rlength = compute_buffer_length(recv_type, recv_count); slength = compute_buffer_length(send_type, send_count); + +#if defined (DDT_TEST_CUDA) + cudaError_t 
error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, slength); + printf("cudamallochost phost %p\n", phost); +#else pdst = malloc( rlength ); psrc = malloc( slength ); ptemp = malloc( chunk ); @@ -196,6 +290,18 @@ local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count for( size_t i = 0; i < slength; i++ ) ((char*)psrc)[i] = i % 128 + 32; memset(pdst, 0, rlength); +#endif + +#if defined (DDT_TEST_CUDA) + if (itera > 0) { + fill_vectors((double *)phost, itera, contig, gap); + } + cudaMemcpy(psrc, phost, slength, cudaMemcpyHostToDevice); +#else + if (itera > 0) { + fill_vectors(psrc, itera, contig, gap); + } +#endif send_convertor = opal_convertor_create( remote_arch, 0 ); if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { @@ -242,6 +348,18 @@ local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count printf( "copying different data-types using convertors in %ld microsec\n", total_time ); printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, slength); + cudaMemcpy(phost, pdst, 
rlength, cudaMemcpyDeviceToHost); + if (itera > 0) { + verify_vectors((double *)phost, itera, contig, gap); + } +#else + if (itera > 0) { + verify_vectors((double *)pdst, itera, contig, gap); + } +#endif clean_and_return: if( send_convertor != NULL ) { OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); @@ -249,15 +367,25 @@ local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count if( recv_convertor != NULL ) { OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); } +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else if( NULL != pdst ) free( pdst ); if( NULL != psrc ) free( psrc ); if( NULL != ptemp ) free( ptemp ); +#endif return OMPI_SUCCESS; } -static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk ) +static int +local_copy_with_convertor_2datatypes_struct( ompi_datatype_t* send_type, int send_count, + ompi_datatype_t* recv_type, int recv_count, + int chunk, int count) { - void *pdst = NULL, *psrc = NULL, *ptemp = NULL; + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; struct iovec iov; uint32_t iov_count; @@ -265,15 +393,295 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk int32_t length = 0, done1 = 0, done2 = 0; TIMER_DATA_TYPE start, end, unpack_start, unpack_end; long total_time, unpack_time = 0; + size_t slength, rlength; - max_data = compute_buffer_length(pdt, count); + rlength = compute_buffer_length(recv_type, recv_count); + slength = compute_buffer_length(send_type, send_count); + +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + 
printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, slength); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc( rlength ); + psrc = malloc( slength ); + ptemp = malloc( chunk ); + + /* initialize the buffers to prevent valgrind from complaining */ + for( size_t i = 0; i < slength; i++ ) + ((char*)psrc)[i] = i % 128 + 32; + memset(pdst, 0, rlength); +#endif + +#if defined (DDT_TEST_CUDA) + + cudaMemcpy(psrc, phost, slength, cudaMemcpyHostToDevice); +#else + +#endif + + send_convertor = opal_convertor_create( remote_arch, 0 ); + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + recv_convertor = opal_convertor_create( remote_arch, 0 ); + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + cache_trash(); /* make sure the cache is useless */ + + GET_TIME( start ); + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor_2datatypes\n", + (done1 ? 
"finish" : "not finish"), + (done2 ? "finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "copying different data-types using convertors in %ld microsec\n", total_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, + total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, slength); + cudaMemcpy(phost, pdst, rlength, cudaMemcpyDeviceToHost); + +#else + +#endif + clean_and_return: + if( send_convertor != NULL ) { + OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); + } + if( recv_convertor != NULL ) { + OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); + } +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else + if( NULL != pdst ) free( pdst ); + if( NULL != psrc ) free( psrc ); + if( NULL != ptemp ) free( ptemp ); +#endif + return OMPI_SUCCESS; +} - pdst = malloc(max_data); - psrc = malloc(max_data); - ptemp = malloc(chunk); +static void fill_upper_matrix(void *matt, int msize) +{ + int i, j, start, end; + int *blklens, *displs; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + blklens = (int *)malloc(sizeof(int)*msize); + displs = (int *)malloc(sizeof(int)*msize); + for (i = 0; i < msize; i++) { + 
blklens[i] = msize - i; + displs[i] = i*msize + i; + } + for (i = 0; i < msize; i++) { + start = displs[i]; + end = start + blklens[i]; + for (j = start; j < end; j++) { +#if defined (TEST_CHAR) + mat[j] = 'a'; +#else + mat[j] = 0.0 + i; +#endif + } + } + free(blklens); + free(displs); + + // printf("matrix generate\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } +} + +static void verify_mat_result(void *matt, int msize) +{ + int *blklens, *displs; + int i, j, error = 0; + int start, end; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + blklens = (int *)malloc(sizeof(int)*msize); + displs = (int *)malloc(sizeof(int)*msize); + for (i = 0; i < msize; i++) { + blklens[i] = msize - i; + displs[i] = i*msize + i; + } + for (i = 0; i < msize; i++) { + start = displs[i]; + end = start + blklens[i]; + for (j = start; j < end; j++) { +#if defined (TEST_CHAR) + if (mat[j] != 'a') { +#else + if (mat[j] != (0.0+i)) { +#endif + error ++; + } + } + } + free(blklens); + free(displs); + + // printf("matrix received\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } + + if (error != 0) { + printf("error is found %d\n", error); + } else { + printf("no error is found\n"); + } +} + +static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk, int msize ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data, dt_length; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0; + + dt_length = 
compute_buffer_length(pdt, count); + printf("length %lu\n", dt_length); + +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, dt_length); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, dt_length); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, dt_length); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc(dt_length); + psrc = malloc(dt_length); + ptemp = malloc(chunk); + for( int i = 0; i < length; ((char*)psrc)[i] = i % 128 + 32, i++ ); memset( pdst, 0, length ); +#endif + +#if defined (DDT_TEST_CUDA) + if (msize > 0) { + fill_upper_matrix(phost, msize); + } + cudaMemcpy(psrc, phost, dt_length, cudaMemcpyHostToDevice); +#else + if (msize > 0) { + fill_upper_matrix(psrc, msize); + } +#endif send_convertor = opal_convertor_create( remote_arch, 0 ); if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(pdt->super), count, psrc ) ) { @@ -321,13 +729,32 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk printf( "copying same data-type using convertors in %ld microsec\n", total_time ); printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, total_time - unpack_time ); - clean_and_return: + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, dt_length); + 
cudaMemcpy(phost, pdst, dt_length, cudaMemcpyDeviceToHost); + if (msize > 0) { + verify_mat_result(phost, msize); + } +#else + if (msize > 0) { + verify_mat_result(pdst, msize); + } +#endif +clean_and_return: if( NULL != send_convertor ) OBJ_RELEASE( send_convertor ); if( NULL != recv_convertor ) OBJ_RELEASE( recv_convertor ); +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else if( NULL != pdst ) free( pdst ); if( NULL != psrc ) free( psrc ); if( NULL != ptemp ) free( ptemp ); +#endif return OMPI_SUCCESS; } @@ -343,7 +770,13 @@ int main( int argc, char* argv[] ) ompi_datatype_t *pdt, *pdt1, *pdt2, *pdt3; int rc, length = 500, i; +#if defined (DDT_TEST_CUDA) + opal_cuda_support = 1; +#endif opal_init_util(&argc, &argv); +#if defined (DDT_TEST_CUDA) + mca_common_cuda_stage_one_init(); +#endif ompi_datatype_init(); /** @@ -365,12 +798,20 @@ int main( int argc, char* argv[] ) } OBJ_RELEASE( pdt ); assert( pdt == NULL ); */ + printf("\n TEST STRUCT \n"); + pdt = create_struct_type(5); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 1; i <= 1; i++) { + // local_copy_with_convertor_2datatypes_struct(pdt, 1, pdt, 1, 1024*1024*100, 5); + } + } + printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); - pdt = upper_matrix(4000); + pdt = upper_matrix(1000); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 1; i <= 4; i++) { + for (i = 1; i <= 3; i++) { // local_copy_ddt_count(pdt, 1); - // local_copy_with_convertor(pdt, 1, 1024*1024*200); + local_copy_with_convertor(pdt, 1, 1024*1024*200, 1000); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -403,7 +844,6 @@ int main( int argc, char* argv[] ) ompi_datatype_create_contiguous(0, &ompi_mpi_datatype_null.dt, &pdt1); ompi_datatype_create_contiguous(0, &ompi_mpi_datatype_null.dt, &pdt2); ompi_datatype_create_contiguous(0, &ompi_mpi_datatype_null.dt, &pdt3); - 
ompi_datatype_add( pdt3, &ompi_mpi_int.dt, 10, 0, -1 ); ompi_datatype_add( pdt3, &ompi_mpi_float.dt, 5, 10 * sizeof(int), -1 ); @@ -429,7 +869,6 @@ int main( int argc, char* argv[] ) OBJ_RELEASE( pdt1 ); assert( pdt1 == NULL ); OBJ_RELEASE( pdt2 ); assert( pdt2 == NULL ); OBJ_RELEASE( pdt3 ); assert( pdt3 == NULL ); - printf( ">>--------------------------------------------<<\n" ); printf( " Contiguous data-type (MPI_DOUBLE)\n" ); pdt = MPI_DOUBLE; @@ -494,7 +933,7 @@ int main( int argc, char* argv[] ) // ompi_datatype_commit(&pdt1); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 10; i++) { - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*30 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*30 ); } } printf( ">>--------------------------------------------<<\n" ); @@ -504,7 +943,7 @@ int main( int argc, char* argv[] ) ompi_datatype_create_contiguous( 1, pdt, &pdt1 ); // ompi_datatype_dump( pdt ); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 0; i < 10; i++) { + for (i = 0; i < 1; i++) { // local_copy_ddt_count(pdt, 1); // local_copy_with_convertor( pdt, 1, 12 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); @@ -513,7 +952,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 , 4000, 384, 512); } } printf( ">>--------------------------------------------<<\n" ); @@ -551,7 +990,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*5 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*5 
); } } printf( ">>--------------------------------------------<<\n" ); @@ -595,7 +1034,6 @@ int main( int argc, char* argv[] ) } printf( ">>--------------------------------------------<<\n" ); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( ">>--------------------------------------------<<\n" ); pdt = test_create_blacs_type(); if( outputFlags & CHECK_PACK_UNPACK ) { @@ -611,7 +1049,6 @@ int main( int argc, char* argv[] ) } printf( ">>--------------------------------------------<<\n" ); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( ">>--------------------------------------------<<\n" ); pdt1 = test_create_blacs_type1( &ompi_mpi_int.dt ); pdt2 = test_create_blacs_type2( &ompi_mpi_int.dt ); From cf44223f7d921452bf8cf70c2ee339031f3285b6 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 9 Apr 2015 03:23:21 -0400 Subject: [PATCH 003/190] RDMA send is now working. Conflicts: test/datatype/Makefile.am --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 74 +++++++- ompi/mca/pml/ob1/pml_ob1_recvreq.c | 7 +- ompi/mca/pml/ob1/pml_ob1_sendreq.c | 18 +- opal/datatype/cuda/Makefile | 2 +- opal/datatype/cuda/opal_config.h | 171 +++++++++++++----- opal/datatype/cuda/opal_datatype_cuda.cu | 34 ++++ opal/datatype/cuda/opal_datatype_cuda.cuh | 4 + .../cuda/opal_datatype_cuda_internal.cuh | 1 - .../cuda/opal_datatype_orig_internal.h | 8 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 40 +++- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 11 +- opal/datatype/opal_convertor.c | 12 +- opal/datatype/opal_convertor.h | 6 + opal/datatype/opal_datatype_gpu.c | 27 ++- opal/datatype/opal_datatype_gpu.h | 5 +- opal/datatype/opal_datatype_module.c | 4 +- opal/datatype/opal_datatype_pack.c | 2 - opal/datatype/opal_datatype_unpack.c | 2 - opal/include/opal_config_top.h | 2 - opal/mca/btl/smcuda/btl_smcuda.c | 52 +++++- opal/mca/common/cuda/common_cuda.c | 64 +++++++ opal/mca/common/cuda/common_cuda.h | 9 + test/datatype/Makefile.am | 14 +- test/datatype/ddt_test.c | 13 +- 24 files changed, 480 
insertions(+), 102 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index a44a8b377c8..c2b2708bebf 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -37,11 +37,21 @@ #include "ompi/mca/bml/base/base.h" #include "ompi/memchecker.h" +#include "opal/datatype/opal_datatype_gpu.h" +#include "opal/mca/common/cuda/common_cuda.h" + +#define CUDA_DDT_WITH_RDMA 1 + size_t mca_pml_ob1_rdma_cuda_btls( mca_bml_base_endpoint_t* bml_endpoint, unsigned char* base, size_t size, mca_pml_ob1_com_btl_t* rdma_btls); + +int mca_pml_ob1_rdma_cuda_btl_register_events( + mca_pml_ob1_com_btl_t* rdma_btls, + uint32_t num_btls_used, + struct opal_convertor_t* convertor); int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -93,7 +103,45 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, /* Do not send anything with first rendezvous message as copying GPU * memory into RNDV message is expensive. 
*/ sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; - rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); + mca_bml_base_btl_t* bml_endpoint_btl = mca_bml_base_btl_array_get_index(&(sendreq->req_endpoint->btl_send), 0); + if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && CUDA_DDT_WITH_RDMA) { + printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + unsigned char *base; + struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); + base = opal_datatype_get_gpu_buffer(); + sendreq->req_send.req_bytes_packed = convertor->local_size; + printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); + if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( + sendreq->req_endpoint, + base, + sendreq->req_send.req_bytes_packed, + sendreq->req_rdma))) { + + mca_pml_ob1_rdma_cuda_btl_register_events(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor); + struct iovec iov; + int rc_dt = 0; + uint32_t iov_count = 1; + iov.iov_base = NULL; + iov.iov_len = 0; + size_t max_data = 0; + rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + // mca_common_cuda_record_event(&convertor->pipeline_event[0]); + // uint64_t event, *ep; + // ep = &event; + // mca_common_cuda_create_event((uint64_t**)ep); + // // mca_common_cuda_record_event(ep); + // printf("success record event %d\n", event); + rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, + sendreq->req_send.req_bytes_packed); + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_pml_ob1_free_rdma_resources(sendreq); + } + } else { + rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); + } + } else { + rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); + } } #else /* Just do the rendezvous but set initial data to be sent to zero */ @@ -157,6 +205,30 @@ size_t mca_pml_ob1_rdma_cuda_btls( return num_btls_used; } +int 
mca_pml_ob1_rdma_cuda_btl_register_events( + mca_pml_ob1_com_btl_t* rdma_btls, + uint32_t num_btls_used, + struct opal_convertor_t* convertor) +{ + // uint32_t i, j; + // for (i = 0; i < num_btls_used; i++) { + // mca_btl_base_registration_handle_t *handle = rdma_btls[i].btl_reg; + // mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t *) + // ((intptr_t) handle - offsetof (mca_mpool_common_cuda_reg_t, data)); + // printf("base %p\n", cuda_reg->base.base); + // for (j = 0; j < MAX_IPC_EVENT_HANDLE; j++) { + // uint64_t *event = &convertor->pipeline_event[j]; + // convertor->pipeline_event[j] = 0; + // mca_common_cuda_geteventhandle(&event, j, (mca_mpool_base_registration_t *)cuda_reg); + // convertor->pipeline_event[j] = *event; + // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); + // } + // cuda_reg->data.pipeline_size = 1000; + // + // } + return 0; +} + int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl) { diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index b7646890d03..15cfe8560ba 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -649,8 +649,11 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq if (mca_pml_ob1_cuda_need_buffers(recvreq, btl)) #endif /* OPAL_CUDA_SUPPORT */ { - mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0); - return; + /* need more careful check here */ + if (! 
(recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA)) { + mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0); + return; + } } } diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index f1f2744b2e3..50b11d36dff 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -675,10 +675,26 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq, MCA_PML_OB1_HDR_FLAGS_PIN); } +#if OPAL_CUDA_SUPPORT + if ( (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA)) { + sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; + if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == true) { + data_ptr = sendreq->req_send.req_base.req_convertor.gpu_buffer_ptr_source; + printf("START RMDA data_ptr %p\n", data_ptr); + } else { + opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr); + } + /* Set flag back */ + sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; + } else { + opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr); + } +#else /* at this time ob1 does not support non-contiguous gets. 
the convertor represents a * contiguous block of memory */ opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr); - +#endif + local_handle = sendreq->req_rdma[0].btl_reg; /* allocate an rdma fragment to keep track of the request size for use in the fin message */ diff --git a/opal/datatype/cuda/Makefile b/opal/datatype/cuda/Makefile index 6be10afd0fd..e76f160fb88 100644 --- a/opal/datatype/cuda/Makefile +++ b/opal/datatype/cuda/Makefile @@ -6,7 +6,7 @@ RANLIB = ranlib STLIB ?= opal_datatype_cuda.a DYLIB ?= opal_datatype_cuda.so CFLAGS = -g -G -O0 -EXTLIB = -L/home/wwu12/ompi/ompi-cuda/opal/datatype/.libs -ldatatype +EXTLIB = -L/home/wwu12/ompi/ompi-gpu/opal/datatype/.libs -ldatatype -L/usr/lib64 -lcuda INC = SRC := \ diff --git a/opal/datatype/cuda/opal_config.h b/opal/datatype/cuda/opal_config.h index 19fa55f52ed..d23f071a86a 100644 --- a/opal/datatype/cuda/opal_config.h +++ b/opal/datatype/cuda/opal_config.h @@ -24,6 +24,10 @@ #ifndef OPAL_CONFIG_H #define OPAL_CONFIG_H +//#include "opal_config_top.h" + + + /* Define if building universal (internal helper macro) */ /* #undef AC_APPLE_UNIVERSAL_BUILD */ @@ -51,6 +55,9 @@ /* Define to 1 if you have the header file. */ #define HAVE_AIO_H 1 +/* Define to 1 if the linker supports alias attribute. */ +/* #undef HAVE_ALIAS_ATTRIBUTE */ + /* Define to 1 if you have the header file. */ #define HAVE_ALLOCA_H 1 @@ -63,6 +70,9 @@ /* Define to 1 if you have the `asprintf' function. */ #define HAVE_ASPRINTF 1 +/* Set to use c11 atomic functions */ +/* #undef HAVE_ATOMICS */ + /* Define to 1 if the system has the type `CACHE_DESCRIPTOR'. */ /* #undef HAVE_CACHE_DESCRIPTOR */ @@ -93,6 +103,9 @@ /* Define to 1 if you have the header file. */ /* #undef HAVE_CRT_EXTERNS_H */ +/* Define to 1 if you have the header file. */ +#define HAVE_CTYPE_H 1 + /* Define to 1 if we have -lcuda */ /* #undef HAVE_CUDA */ @@ -153,18 +166,14 @@ don't. 
*/ /* #undef HAVE_DECL_IBV_ACCESS_SO */ +/* Define to 1 if you have the declaration of `IBV_ATOMIC_HCA', and to 0 if + you don't. */ +/* #undef HAVE_DECL_IBV_ATOMIC_HCA */ + /* Define to 1 if you have the declaration of `IBV_EVENT_CLIENT_REREGISTER', and to 0 if you don't. */ /* #undef HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER */ -/* Define to 1 if you have the declaration of `IBV_EVENT_GID_CHANGE', and to 0 - if you don't. */ -/* #undef HAVE_DECL_IBV_EVENT_GID_CHANGE */ - -/* Define to 1 if you have the declaration of `ibv_event_type_str', and to 0 - if you don't. */ -/* #undef HAVE_DECL_IBV_EVENT_TYPE_STR */ - /* Define to 1 if you have the declaration of `IBV_EXP_ACCESS_ALLOCATE_MR', and to 0 if you don't. */ /* #undef HAVE_DECL_IBV_EXP_ACCESS_ALLOCATE_MR */ @@ -177,17 +186,9 @@ to 0 if you don't. */ /* #undef HAVE_DECL_IBV_LINK_LAYER_ETHERNET */ -/* Define to 1 if you have the declaration of `IBV_NODE_USNIC', and to 0 if - you don't. */ -/* #undef HAVE_DECL_IBV_NODE_USNIC */ - -/* Define to 1 if you have the declaration of `IBV_TRANSPORT_USNIC', and to 0 - if you don't. */ -/* #undef HAVE_DECL_IBV_TRANSPORT_USNIC */ - -/* Define to 1 if you have the declaration of `IBV_TRANSPORT_USNIC_UDP', and - to 0 if you don't. */ -/* #undef HAVE_DECL_IBV_TRANSPORT_USNIC_UDP */ +/* Define to 1 if you have the declaration of `IBV_SRQT_XRC', and to 0 if you + don't. */ +/* #undef HAVE_DECL_IBV_SRQT_XRC */ /* Define to 1 if you have the declaration of `nvmlDeviceGetMaxPcieLinkGeneration', and to 0 if you don't. */ @@ -359,6 +360,9 @@ /* Define to 1 if you have the header file. */ /* #undef HAVE_HWLOC_H */ +/* Define to 1 if you have the `ibv_cmd_open_xrcd' function. */ +/* #undef HAVE_IBV_CMD_OPEN_XRCD */ + /* Define to 1 if you have the `ibv_create_xrc_rcv_qp' function. 
*/ /* #undef HAVE_IBV_CREATE_XRC_RCV_QP */ @@ -437,9 +441,21 @@ /* Define to 1 if we have -llgrp */ /* #undef HAVE_LIBLGRP */ +/* set to 1 if should use libnl v3, set to 0 for libnl v11 */ +#define HAVE_LIBNL3 0 + /* Define to 1 if you have the `pci' library (-lpci). */ /* #undef HAVE_LIBPCI */ +/* Define to 1 if you have the `psm_infinipath' library (-lpsm_infinipath). */ +/* #undef HAVE_LIBPSM_INFINIPATH */ + +/* Define to 1 if you have the `pthread' library (-lpthread). */ +#define HAVE_LIBPTHREAD 1 + +/* Define to 1 if you have the `rt' library (-lrt). */ +#define HAVE_LIBRT 1 + /* Define to 1 if you have the header file. */ /* #undef HAVE_LIBUTIL_H */ @@ -494,12 +510,18 @@ /* Define to 1 if you have the `mmap' function. */ #define HAVE_MMAP 1 +/* Define to 1 if you have the header file. */ +#define HAVE_MNTENT_H 1 + /* Define to 1 if the system has the type `mode_t'. */ #define HAVE_MODE_T 1 /* Define to 1 if you have the header file. */ /* #undef HAVE_MTCP_H */ +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MUNGE_H */ + /* Define to 1 if you have the header file. */ /* #undef HAVE_MXM_API_MXM_API_H */ @@ -515,9 +537,6 @@ /* Define to 1 if you have the header file. */ #define HAVE_NETINET_TCP_H 1 -/* Define to 1 if you have the header file. */ -/* #undef HAVE_NETLINK_NETLINK_H */ - /* Define to 1 if you have the header file. */ #define HAVE_NET_IF_H 1 @@ -545,6 +564,9 @@ /* Define to 1 if you have the `openpty' function. */ #define HAVE_OPENPTY 1 +/* Define to 1 if you have the header file. */ +#define HAVE_PATHS_H 1 + /* Define to 1 if you have the header file. */ /* #undef HAVE_PCI_PCI_H */ @@ -591,6 +613,12 @@ */ /* #undef HAVE_PSAPI_WORKING_SET_EX_INFORMATION */ +/* libfabric: whether to build the PSM provider or not */ +/* #undef HAVE_PSM */ + +/* libfabric: do not build PSM provider as a DL */ +/* #undef HAVE_PSM_DL */ + /* Define to 1 if you have the header file. 
*/ /* #undef HAVE_PSM_H */ @@ -624,6 +652,9 @@ /* Define to 1 if you have the header file. */ #define HAVE_PWD_H 1 +/* Define to 1 if you have the header file. */ +/* #undef HAVE_RDMA_FABRIC_H */ + /* Define to 1 if you have the header file. */ /* #undef HAVE_RDMA_RDMA_CMA_H */ @@ -678,12 +709,15 @@ /* Define to 1 if you have the `snprintf' function. */ #define HAVE_SNPRINTF 1 -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SN_XPMEM_H */ - /* Define to 1 if you have the `socketpair' function. */ #define HAVE_SOCKETPAIR 1 +/* libfabric: do not build sockets provider */ +/* #undef HAVE_SOCKETS */ + +/* libfabric: do not build sockets provider */ +/* #undef HAVE_SOCKETS_DL */ + /* Define to 1 if the system has the type `socklen_t'. */ #define HAVE_SOCKLEN_T 1 @@ -902,6 +936,9 @@ /* Define to 1 if you have the header file. */ /* #undef HAVE_TM_H */ +/* Define to 1 if you have the header file. */ +/* #undef HAVE_TM_TREE_H */ + /* Define to 1 if you have the header file. */ #define HAVE_UCONTEXT_H 1 @@ -939,6 +976,12 @@ /* Define to 1 if you have the `usleep' function. */ #define HAVE_USLEEP 1 +/* libfabric: whether to build the usnic provider or not */ +/* #undef HAVE_USNIC */ + +/* libfabric: do not build usnic provider as a DL */ +/* #undef HAVE_USNIC_DL */ + /* Define to 1 if you have the header file. */ /* #undef HAVE_UTIL_H */ @@ -951,6 +994,12 @@ /* Define to 1 if you have the `vasprintf' function. */ #define HAVE_VASPRINTF 1 +/* libfabric: do not build verbs provider */ +/* #undef HAVE_VERBS */ + +/* libfabric: do not build verbs provider */ +/* #undef HAVE_VERBS_DL */ + /* Define to 1 if you have the `vsnprintf' function. */ #define HAVE_VSNPRINTF 1 @@ -978,6 +1027,9 @@ /* Define to 1 if the system has the type `__float128'. */ #define HAVE___FLOAT128 1 +/* Define to 1 if the system has the type `__int128'. */ +/* #undef HAVE___INT128 */ + /* Define to 1 if you have the `__mmap' function. 
*/ /* #undef HAVE___MMAP */ @@ -1188,7 +1240,7 @@ /* #undef HWLOC_HPUX_SYS */ /* Version of hwloc */ -#define HWLOC_HWLOC191_HWLOC_VERSION "internal v1.9.1" +#define HWLOC_HWLOC191_HWLOC_VERSION "internal v1.9.2" /* Define to 1 on Irix */ /* #undef HWLOC_IRIX_SYS */ @@ -1237,7 +1289,7 @@ #define LT_OBJDIR ".libs/" /* Header to include for event implementation */ -#define MCA_event_IMPLEMENTATION_HEADER "opal/mca/event/libevent2021/libevent2021.h" +#define MCA_event_IMPLEMENTATION_HEADER "opal/mca/event/libevent2022/libevent2022.h" /* Header to include for hwloc implementation */ #define MCA_hwloc_IMPLEMENTATION_HEADER "opal/mca/hwloc/hwloc191/hwloc191.h" @@ -1249,7 +1301,7 @@ /* #undef MCA_hwloc_external_openfabrics_header */ /* Complete set of command line arguments given to ROMIOs configure script */ -#define MCA_io_romio_COMPLETE_CONFIGURE_FLAGS " FROM_OMPI=yes CC='gcc -std=gnu99' CFLAGS='-g -Wall -Wundef -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wstrict-prototypes -Wcomment -pedantic -Werror-implicit-function-declaration -finline-functions -fno-strict-aliasing -pthread' CPPFLAGS=' -I/home/wwu12/ompi/ompi-cuda/opal/mca/hwloc/hwloc191/hwloc/include -I/home/wwu12/ompi/ompi-cuda/opal/mca/event/libevent2021/libevent -I/home/wwu12/ompi/ompi-cuda/opal/mca/event/libevent2021/libevent/include' FFLAGS='' LDFLAGS=' ' --enable-shared --disable-static --prefix=/home/wwu12/ompi/build-cuda --disable-aio" +#define MCA_io_romio_COMPLETE_CONFIGURE_FLAGS " FROM_OMPI=yes CC='gcc -std=gnu99' CFLAGS='-g -Wall -Wundef -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wstrict-prototypes -Wcomment -pedantic -Werror-implicit-function-declaration -finline-functions -fno-strict-aliasing -pthread -D__EXTENSIONS__' CPPFLAGS=' -I/home/wwu12/ompi/ompi-gpu/opal/mca/hwloc/hwloc191/hwloc/include -I/home/wwu12/ompi/ompi-gpu/opal/mca/event/libevent2022/libevent -I/home/wwu12/ompi/ompi-gpu/opal/mca/event/libevent2022/libevent/include' FFLAGS='' LDFLAGS=' ' --enable-shared 
--disable-static --prefix=/home/wwu12/ompi/build-gpu --disable-aio --disable-weak-symbols --enable-strict" /* Set of user-defined configure flags given to ROMIOs configure script via --with-io-romio-flags */ @@ -1436,9 +1488,6 @@ /* Enable contributed software package libompitrace */ #define OMPI_ENABLE_CONTRIB_libompitrace 1 -/* Enable contributed software package vt */ -#define OMPI_ENABLE_CONTRIB_vt 1 - /* Whether we want MPI profiling or not */ #define OMPI_ENABLE_MPI_PROFILING 1 @@ -1490,6 +1539,10 @@ not */ #define OMPI_FORTRAN_HAVE_BIND_C_TYPE_NAME 0 +/* For ompi/mpi/fortran/use-mpi-f08/blah.F90 and blah.h and ompi_info: whether + the compiler supports c_funloc or not */ +#define OMPI_FORTRAN_HAVE_C_FUNLOC 0 + /* For ompi_info: Whether the Fortran compiler supports the Fortran 2008 "assumed rank" syntax or not */ #define OMPI_FORTRAN_HAVE_F08_ASSUMED_RANK 0 @@ -1717,7 +1770,7 @@ #define OMPI_MPI_AINT_TYPE ptrdiff_t /* Contributed software packages built with Open MPI */ -#define OMPI_MPI_CONTRIBS "vt, libompitrace" +#define OMPI_MPI_CONTRIBS "libompitrace" /* Size of the MPI_Count datatype */ #define OMPI_MPI_COUNT_SIZE 8 @@ -1769,7 +1822,7 @@ #define OMPI_RELEASE_VERSION 0 /* The repository version Open MPI */ -#define OMPI_REPO_REV "dev-267-g51b4521" +#define OMPI_REPO_REV "dev-1510-g40fe521" /* Defined to 1 if the OMPI runtime component is ORTE */ #define OMPI_RTE_ORTE 1 @@ -1977,6 +2030,9 @@ /* Format of assembly file */ #define OPAL_ASSEMBLY_FORMAT "default-.text-.globl-:--.L-@-1-0-1-1-1" +/* Whether we have support for RDTSCP instruction */ +#define OPAL_ASSEMBLY_SUPPORTS_RDTSCP 0 + /* Enable flow control for Portals4 BTL */ #define OPAL_BTL_PORTALS4_FLOW_CONTROL 0 @@ -1986,6 +2042,9 @@ /* If knem support can be enabled */ #define OPAL_BTL_SM_HAVE_KNEM 0 +/* Path by which to include fi_ext_usnic.h */ +/* #undef OPAL_BTL_USNIC_FI_EXT_USNIC_H */ + /* define to 1 if usnic BTL unit tests are enabled, 0 otherwise */ #define OPAL_BTL_USNIC_UNIT_TESTS 0 @@ 
-2032,7 +2091,7 @@ #define OPAL_CUDA_GDR_SUPPORT 1 /* Whether we have CUDA cuPointerGetAttributes function available */ -#define OPAL_CUDA_GET_ATTRIBUTES 0 +#define OPAL_CUDA_GET_ATTRIBUTES 1 /* Whether we want cuda device pointer support */ #define OPAL_CUDA_SUPPORT 1 @@ -2079,6 +2138,9 @@ /* Whether C compiler supports XLC style inline assembly */ #define OPAL_C_XLC_INLINE_ASSEMBLY 0 +/* Whether we have lt_dladvise or not */ +#define OPAL_DL_LIBLTDL_HAVE_LT_DLADVISE 0 + /* Whether we want checkpoint/restart enabled debugging functionality or not */ #define OPAL_ENABLE_CRDEBUG 0 @@ -2218,15 +2280,27 @@ /* whether ceil is found and available */ #define OPAL_HAVE_CEIL 1 +/* whether clock_gettime is found and available */ +#define OPAL_HAVE_CLOCK_GETTIME 1 + +/* Whether the processor supports the cmpxchg16b instruction */ +#define OPAL_HAVE_CMPXCHG16B 1 + /* Enable features required for ConnectX XRC support */ #define OPAL_HAVE_CONNECTX_XRC 0 +/* Enable features required for XRC domains support */ +#define OPAL_HAVE_CONNECTX_XRC_DOMAINS 0 + /* whether crs_blcr is found and available */ /* #undef OPAL_HAVE_CRS_BLCR */ /* whether dirname is found and available */ #define OPAL_HAVE_DIRNAME 1 +/* Whether the OPAL DL framework is functional or not */ +#define OPAL_HAVE_DL_SUPPORT 1 + /* whether fbtl_posix is found and available */ #define OPAL_HAVE_FBTL_POSIX 1 @@ -2243,15 +2317,9 @@ long'. 
*/ #define OPAL_HAVE_LONG_LONG 1 -/* Whether libltdl appears to have the lt_dladvise interface */ -#define OPAL_HAVE_LTDL_ADVISE 0 - /* whether openpty is found and available */ #define OPAL_HAVE_OPENPTY 1 -/* Do we have POSIX threads */ -#define OPAL_HAVE_POSIX_THREADS 1 - /* If PTHREADS implementation supports PTHREAD_MUTEX_ERRORCHECK */ #define OPAL_HAVE_PTHREAD_MUTEX_ERRORCHECK 1 @@ -2279,6 +2347,10 @@ /* Whether or not we have solaris */ #define OPAL_HAVE_SOLARIS 0 +/* Whether the __sync builtin atomic compare and swap supports 128-bit values + */ +/* #undef OPAL_HAVE_SYNC_BUILTIN_CSWAP_INT128 */ + /* Do not use outside of mpi.h. Define to 1 if you have the header file. */ /* #undef OPAL_HAVE_SYS_SYNCH_H */ @@ -2316,9 +2388,6 @@ /* ident string for Open MPI */ #define OPAL_IDENT_STRING "1.9.0a1" -/* Whether we are using the internal libltdl or not */ -#define OPAL_LIBLTDL_INTERNAL 1 - /* Major release number of Open Portable Access Layer */ #define OPAL_MAJOR_VERSION 1 @@ -2386,7 +2455,7 @@ #define OPAL_RELEASE_VERSION 0 /* The repository version Open Portable Access Layer */ -#define OPAL_REPO_REV "dev-267-g51b4521" +#define OPAL_REPO_REV "dev-1510-g40fe521" /* Whether we have shared memory support for mmap or not */ #define OPAL_SHMEM_MMAP 1 @@ -2413,9 +2482,6 @@ /* Enable per-user config files */ #define OPAL_WANT_HOME_CONFIG_FILES 1 -/* Whether to include support for libltdl or not */ -#define OPAL_WANT_LIBLTDL 1 - /* if the memory and buffer checking should be enabled */ #define OPAL_WANT_MEMCHECKER 0 @@ -2448,7 +2514,7 @@ #define ORTE_RELEASE_VERSION 0 /* The repository version Open MPI Run-Time Environment */ -#define ORTE_REPO_REV "dev-267-g51b4521" +#define ORTE_REPO_REV "dev-1510-g40fe521" /* Tarball filename version string of Open MPI Run-Time Environment */ #define ORTE_TARBALL_VERSION "gitclone" @@ -2481,7 +2547,7 @@ #define OSHMEM_RELEASE_VERSION 0 /* The repository version Open SHMEM */ -#define OSHMEM_REPO_REV "dev-267-g51b4521" +#define 
OSHMEM_REPO_REV "dev-1510-g40fe521" /* Whether user wants OSHMEM in compatibility mode or not */ #define OSHMEM_SPEC_COMPAT 1 @@ -2522,6 +2588,9 @@ /* Define to the version of this package. */ #define PACKAGE_VERSION "gitclone" +/* Define PT_LOCK_SPIN to 1 if available. */ +/* #undef PT_LOCK_SPIN */ + /* The size of `bool', as computed by sizeof. */ #define SIZEOF_BOOL 1 @@ -2656,7 +2725,7 @@ #define WRAPPER_EXTRA_LDFLAGS " -Wl,-rpath -Wl,@{libdir} -Wl,--enable-new-dtags" /* Additional LIBS to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_LIBS "-lm -ldl -lutil " +#define WRAPPER_EXTRA_LIBS "-lm -ldl -lutil -lrt " /* Whether the wrapper compilers add rpath flags by default */ #define WRAPPER_RPATH_SUPPORT "runpath" @@ -2788,5 +2857,7 @@ # define __restrict__ #endif + +//#include "opal_config_bottom.h" #endif /* OPAL_CONFIG_H */ diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 105ba2bfeba..1debbd221a5 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -1,6 +1,7 @@ #include "opal_datatype_cuda_internal.cuh" #include "opal_datatype_cuda.cuh" #include +#include #include #include @@ -163,6 +164,39 @@ void opal_cuda_sync_device(void) cuda_desc_h->iov[0].iov_base = (void*)gpu_dest_const; } +int32_t opal_cuda_is_gpu_buffer(const void *ptr) +{ + int res; + CUmemorytype memType; + CUdeviceptr dbuf = (CUdeviceptr)ptr; + res = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf); + if (res != CUDA_SUCCESS) { + /* If we cannot determine it is device pointer, + * just assume it is not. 
*/ + printf("!!!!!!!is gpu buffer error\n"); + return 0; + } + if (memType == CU_MEMORYTYPE_DEVICE) { + return 1; + } else if (memType == CU_MEMORYTYPE_HOST){ + return 0; + } else if (memType == 0) { + return 0; + } else { + return 0; + } +} + +unsigned char* opal_cuda_get_gpu_pack_buffer() +{ + if (ddt_cuda_pack_buffer != NULL) { + return ddt_cuda_pack_buffer; + } else { + return NULL; + } +} + +/* from internal.h*/ void opal_cuda_output(int output_id, const char *format, ...) { if (output_id >= 0 && output_id <= OPAL_DATATYPE_CUDA_DEBUG_LEVEL) { diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index ebaad5a06fc..5797ceb55d8 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -47,6 +47,10 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, size_t* SPACE ); void opal_cuda_sync_device(void); + +int32_t opal_cuda_is_gpu_buffer(const void *ptr); + +unsigned char* opal_cuda_get_gpu_pack_buffer(); } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index b510a2f5808..be264484153 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -13,7 +13,6 @@ #define OPAL_DATATYPE_CUDA_DEBUG //#define OPAL_DATATYPE_CUDA_KERNEL_TIME #define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 0 -#define OPAL_DATATYPE_CUDA_IOV #define OPAL_DATATYPE_CUDA_TIMING diff --git a/opal/datatype/cuda/opal_datatype_orig_internal.h b/opal/datatype/cuda/opal_datatype_orig_internal.h index fc30fc87741..37b1d1be51b 100644 --- a/opal/datatype/cuda/opal_datatype_orig_internal.h +++ b/opal/datatype/cuda/opal_datatype_orig_internal.h @@ -292,6 +292,8 @@ typedef struct opal_convertor_master_t { conversion_fct_t* pFunctions; /**< the convertor functions pointer */ } opal_convertor_master_t; +#define 
MAX_IPC_EVENT_HANDLE 10 + struct opal_convertor_t { opal_object_t super; /**< basic superclass */ uint32_t remoteArch; /**< the remote architecture */ @@ -322,6 +324,10 @@ struct opal_convertor_t { #if OPAL_CUDA_SUPPORT memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ void * stream; /**< CUstream for async copy */ + + unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ + unsigned char * gpu_buffer_ptr_source; /**< source address of GPU buffer start to pack, update in packing function */ + uint64_t * pipeline_event[MAX_IPC_EVENT_HANDLE]; /**< cuda event for pipeline */ #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ @@ -643,4 +649,4 @@ OPAL_DECLSPEC extern const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE #define OPAL_DATATYPE_UNAVAILABLE_SIZE 0 -#endif /* OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED */ \ No newline at end of file +#endif /* OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index f13610fc1bf..14fdcfca346 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -316,7 +316,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor unsigned char *destination; size_t total_packed, total_converted; int32_t complete_flag = 0; - uint8_t buffer_isfull = 0; + uint8_t buffer_isfull = 0, transfer_required; uint32_t convertor_flags; dt_elem_desc_t* description; dt_elem_desc_t* pElem; @@ -341,7 +341,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor destination = (unsigned char*)iov[0].iov_base; #else // pConvertor->pBaseBuf = pBaseBuf_GPU; - // printf("Pack GPU base %p, iov_buffer %p\n", pConvertor->pBaseBuf, iov[0].iov_base); + printf("Pack GPU base %p, gpu_buffer %p\n", pConvertor->pBaseBuf, ddt_cuda_pack_buffer); destination = 
ddt_cuda_pack_buffer; #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ @@ -353,9 +353,35 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypesSize[pElem->elem.common.type]); + + assert(opal_datatype_basicDatatypesSize[pElem->elem.common.type] != 0); printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); - buffer_size = iov[0].iov_len; + if ((iov[0].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (iov[0].iov_len == 0) { + buffer_size = DT_CUDA_BUFFER_SIZE; + } else { + buffer_size = iov[0].iov_len; + } + pConvertor->gpu_buffer_ptr = ddt_cuda_pack_buffer; + pConvertor->gpu_buffer_ptr_source = pConvertor->gpu_buffer_ptr + pConvertor->bConverted; + + if (iov[0].iov_base == NULL) { + iov[0].iov_base = ddt_cuda_pack_buffer; + destination = ddt_cuda_pack_buffer; + } else { + destination = (unsigned char *)iov[0].iov_base; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + pConvertor->gpu_buffer_ptr = NULL; + pConvertor->gpu_buffer_ptr_source = NULL; + transfer_required = 1; + } + + printf("start packing from %p\n", destination); + cuda_iov_count = 1000; total_packed = 0; total_converted = pConvertor->bConverted; @@ -371,7 +397,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #endif dst_offset = 0; - thread_per_block = CUDA_WARP_SIZE * 4; + thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; while (cuda_iov_count > 0) { @@ -400,7 +426,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } for (i = 0; i < cuda_iov_count; i++) { - pElem = &(description[pStack->index+i]); + // pElem = &(description[pStack->index+i]); if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { @@ -514,7 +540,9 @@ int32_t 
opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - cudaMemcpy(iov[0].iov_base, ddt_cuda_pack_buffer, total_packed, cudaMemcpyDeviceToHost); + if (transfer_required) { + cudaMemcpy(iov[0].iov_base, ddt_cuda_pack_buffer, total_packed, cudaMemcpyDeviceToHost); + } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 88a66de5f02..dccf9f23e82 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -162,8 +162,12 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert // printf("\n"); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); -#endif - cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); +#endif + if (opal_cuda_is_gpu_buffer(iov[0].iov_base)) { + source = (unsigned char*)iov[0].iov_base; + } else { + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -190,7 +194,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif dst_offset = 0; - thread_per_block = CUDA_WARP_SIZE * 4; + thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; while (cuda_iov_count > 0) { @@ -312,7 +316,6 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert iov[0].iov_len = total_unpacked; *max_data = total_unpacked; *out_size = 1; - DT_CUDA_DEBUG ( opal_cuda_output(0, "total unpacked %d\n", total_unpacked); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index f0d6dbb10e3..4ed3773495f 100644 --- a/opal/datatype/opal_convertor.c +++ 
b/opal/datatype/opal_convertor.c @@ -559,11 +559,11 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, convertor->flags |= CONVERTOR_RECV; #if OPAL_CUDA_SUPPORT mca_cuda_convertor_init(convertor, pUserBuf); -#if defined (OPAL_DATATYPE_CUDA) +#if OPAL_DATATYPE_CUDA_KERNEL if (opal_datatype_gpu_init() != OPAL_SUCCESS) { opal_datatype_gpu_fini(); } -#endif /* defined OPAL_DATATYPE_CUDA */ +#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ #endif OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -588,7 +588,7 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { convertor->fAdvance = opal_unpack_homogeneous_contig; } else { - if (convertor->flags & CONVERTOR_CUDA ) { + if ((convertor->flags & CONVERTOR_CUDA) && OPAL_DATATYPE_CUDA_KERNEL) { convertor->fAdvance = opal_generic_simple_unpack_cuda; } else { convertor->fAdvance = opal_generic_simple_unpack; @@ -607,11 +607,11 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, convertor->flags |= CONVERTOR_SEND; #if OPAL_CUDA_SUPPORT mca_cuda_convertor_init(convertor, pUserBuf); -#if defined (OPAL_DATATYPE_CUDA) +#if OPAL_DATATYPE_CUDA_KERNEL if (opal_datatype_gpu_init() != OPAL_SUCCESS) { opal_datatype_gpu_fini(); } -#endif /* defined OPAL_DATATYPE_CUDA */ +#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ #endif OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -634,7 +634,7 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, else convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps; } else { - if (convertor->flags & CONVERTOR_CUDA ) { + if ((convertor->flags & CONVERTOR_CUDA) && OPAL_DATATYPE_CUDA_KERNEL ) { convertor->fAdvance = opal_generic_simple_pack_cuda; } else { convertor->fAdvance = opal_generic_simple_pack; diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 5b26b7e7d63..6ed9e311d84 100644 --- 
a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -79,6 +79,8 @@ typedef struct dt_stack_t dt_stack_t; */ #define DT_STATIC_STACK_SIZE 5 /**< This should be sufficient for most applications */ +#define MAX_IPC_EVENT_HANDLE 10 + struct opal_convertor_t { opal_object_t super; /**< basic superclass */ uint32_t remoteArch; /**< the remote architecture */ @@ -109,6 +111,10 @@ struct opal_convertor_t { #if OPAL_CUDA_SUPPORT memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ void * stream; /**< CUstream for async copy */ + + unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ + unsigned char * gpu_buffer_ptr_source; /**< source address of GPU buffer start to pack, update in packing function */ + uint64_t * pipeline_event[MAX_IPC_EVENT_HANDLE]; /**< cuda event for pipeline */ #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c index 787e86e4f4c..f8c4785994d 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -83,10 +83,12 @@ void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, void (*opal_cuda_sync_device_p)(void) = NULL; +unsigned char* (*opal_cuda_get_gpu_pack_buffer_p)(void) = NULL; + int32_t opal_datatype_gpu_init(void) { char *error; - char *lib = "/home/wwu12/ompi/ompi-cuda/opal/datatype/cuda/opal_datatype_cuda.so"; + char *lib = "/home/wwu12/ompi/ompi-gpu/opal/datatype/cuda/opal_datatype_cuda.so"; if (opal_datatype_cuda_handle == NULL) { opal_datatype_cuda_handle = dlopen(lib, RTLD_LAZY); @@ -166,11 +168,19 @@ int32_t opal_datatype_gpu_init(void) return OPAL_ERROR; } + *(void **)(&opal_cuda_get_gpu_pack_buffer_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_get_gpu_pack_buffer"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_cuda_get_gpu_pack_buffer error: %s\n", error); + opal_cuda_get_gpu_pack_buffer_p = NULL; + return OPAL_ERROR; + } + 
(*opal_datatype_cuda_init_p)(); printf("cuda init done\n"); } return OPAL_SUCCESS; } + int32_t opal_datatype_gpu_fini(void) { if (opal_datatype_cuda_handle != NULL) { @@ -187,7 +197,22 @@ int32_t opal_datatype_gpu_fini(void) unpack_contiguous_loop_cuda_p = NULL; pack_predefined_data_cuda_p = NULL; opal_cuda_sync_device_p = NULL; + opal_cuda_get_gpu_pack_buffer_p = NULL; printf("cuda fini done\n"); } return OPAL_SUCCESS; } + +unsigned char* opal_datatype_get_gpu_buffer(void) +{ +#if OPAL_DATATYPE_CUDA_KERNEL + if (opal_datatype_gpu_init() != OPAL_SUCCESS) { + opal_datatype_gpu_fini(); + return NULL; + } + return (*opal_cuda_get_gpu_pack_buffer_p)(); +#else + return NULL; +#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ + +} \ No newline at end of file diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h index b8dc828a0df..49060bde8d1 100644 --- a/opal/datatype/opal_datatype_gpu.h +++ b/opal/datatype/opal_datatype_gpu.h @@ -1,10 +1,11 @@ #ifndef OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED #define OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED -#define OPAL_DATATYPE_CUDA_IOV +#define OPAL_DATATYPE_CUDA_KERNEL 1 int32_t opal_datatype_gpu_init(void); int32_t opal_datatype_gpu_fini(void); +unsigned char* opal_datatype_get_gpu_buffer(void); extern void (*opal_datatype_cuda_init_p)(void); @@ -49,4 +50,6 @@ extern void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, size_t* SPACE ); extern void (*opal_cuda_sync_device_p)(void); + +extern unsigned char* (*opal_cuda_get_gpu_pack_buffer_p)(void); #endif /* OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 307eb001085..09940374ab3 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -249,9 +249,9 @@ int32_t opal_datatype_finalize( void ) /* clear all master convertors */ opal_convertor_destroy_masters(); -#if defined (OPAL_DATATYPE_CUDA) +#if 
OPAL_DATATYPE_CUDA_KERNEL opal_datatype_gpu_fini(); -#endif /* defined OPAL_DATATYPE_CUDA */ +#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index dbfc1cec12d..a9aaa6541d7 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -412,11 +412,9 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { -#if defined (OPAL_DATATYPE_CUDA_IOV) if (opal_generic_simple_pack_function_cuda_iov_p != NULL) { return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); } -#endif return 0; } diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index b569b40cd81..cad655000d6 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -599,11 +599,9 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { -#if defined (OPAL_DATATYPE_CUDA_IOV) if (opal_generic_simple_unpack_function_cuda_iov_p != NULL) { return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); } -#endif return 0; } diff --git a/opal/include/opal_config_top.h b/opal/include/opal_config_top.h index 2f5ad1adec2..1ce5267c389 100644 --- a/opal/include/opal_config_top.h +++ b/opal/include/opal_config_top.h @@ -19,8 +19,6 @@ #error "opal_config_top.h should only be included from opal_config.h" #endif -#define OPAL_DATATYPE_CUDA - /* The only purpose of this file is to undef the PACKAGE_ macros that are put in by autoconf/automake projects. 
Specifically, if you include a .h file from another project that defines these diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index bf470f4fb72..2e42d4babc8 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -71,6 +71,9 @@ #include "btl_smcuda_frag.h" #include "btl_smcuda_fifo.h" +#include "ompi/mca/pml/ob1/pml_ob1_recvreq.h" +#include "ompi/mca/pml/ob1/pml_ob1_rdmafrag.h" + #if OPAL_CUDA_SUPPORT static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base, @@ -1107,6 +1110,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, offset = (size_t) ((intptr_t) remote_address - (intptr_t) reg_ptr->base.base); remote_memory_address = (unsigned char *)reg_ptr->base.alloc_base + offset; if (0 != offset) { + printf("!!!!!!offset %d, ra %p, base %p\n", offset, (void*)remote_address, (void*)reg_ptr->base.base); opal_output(-1, "OFFSET=%d", (int)offset); } @@ -1116,18 +1120,48 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, * on the IPC event that we received. Note that we pull it from * rget_reg, not reg_ptr, as we do not cache the event. */ mca_common_wait_stream_synchronize(&rget_reg); - - rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, - "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag, - &done); - if (OPAL_SUCCESS != rc) { - /* Out of resources can be handled by upper layers. 
*/ - if (OPAL_ERR_OUT_OF_RESOURCE != rc) { - opal_output(0, "Failed to cuMemcpy GPU memory, rc=%d", rc); + + /* datatype RDMA */ + mca_pml_ob1_rdma_frag_t *frag_ob1 = cbdata; + mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag_ob1->rdma_req; + mca_bml_base_btl_t *bml_btl = frag_ob1->rdma_bml; + + if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) && + (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) { + recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; + if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { + recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; + printf("RGET NOT IMPLEMENT YET!!!!!!!!!!!!!!\n"); + struct iovec iov; + uint32_t iov_count = 1; + iov.iov_base = remote_memory_address; + iov.iov_len = size; + int rc; + size_t max_data = size; + struct opal_convertor_t *convertor = &(recvreq->req_recv.req_base.req_convertor); + // uint64_t *event = &convertor->pipeline_event[0]; + // mca_common_cuda_openeventhandle(&event, 0, (mca_mpool_common_cuda_reg_data_t*)remote_handle); + // if (mca_common_cuda_query_event(event) == OPAL_SUCCESS){ + // printf("get event\n"); + rc = opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + done = 1; + // } + } else { + recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; + rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, + "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag, + &done); + if (OPAL_SUCCESS != rc) { + /* Out of resources can be handled by upper layers. 
*/ + if (OPAL_ERR_OUT_OF_RESOURCE != rc) { + opal_output(0, "Failed to cuMemcpy GPU memory, rc=%d", rc); + } + return rc; + } } - return rc; } + if (OPAL_UNLIKELY(1 == done)) { cbfunc (btl, ep, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index bf966747a50..e0a80ef4ac2 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -1057,6 +1057,7 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne "CUDA: cuMemGetAddressRange passed: addr=%p, size=%d, pbase=%p, psize=%d ", base, (int)size, (void *)pbase, (int)psize); } + printf("sizeof memhandle %lu, CUipcMemHandle %lu, cuEvent %lu, char %lu\n", sizeof(memHandle), sizeof(CUipcMemHandle), sizeof(CUevent), sizeof(char)); /* Store all the information in the registration */ cuda_reg->base.base = (void *)pbase; @@ -1649,6 +1650,69 @@ int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag) { return 0; } +int mca_common_cuda_geteventhandle(uint64_t **event, int n, mca_mpool_base_registration_t *newreg) +{ + // CUipcEventHandle evtHandle; + // mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)newreg; + // mca_common_cuda_construct_event_and_handle(event, (void**)&evtHandle); + // memcpy(&cuda_reg->data.pipeline_evtHandle[n], &evtHandle, sizeof(evtHandle)); + return OPAL_SUCCESS; +} + +int mca_common_cuda_create_event(uint64_t **event) +{ + CUresult result; + + result = cuFunc.cuEventCreate((CUevent *)event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", + true, OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + +int mca_common_cuda_record_event(uint64_t *event) +{ + CUresult result; + result = 
cuFunc.cuEventRecord((CUevent)event,0); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + printf("record event error %d\n", result); + return OPAL_ERROR; + } else { + return OPAL_SUCCESS; + } +} + +int mca_common_cuda_query_event(uint64_t *event) +{ + CUresult result; + result = cuFunc.cuEventQuery((CUevent)event); + if (OPAL_UNLIKELY(CUDA_SUCCESS == result)) { + return OPAL_SUCCESS; + } else if (OPAL_UNLIKELY(CUDA_ERROR_NOT_READY == result)) { + return OPAL_ERROR; + } else { + printf("query event error %d\n", result); + return OPAL_ERROR; + } +} + +int mca_common_cuda_openeventhandle(uint64_t **event, int n, mca_mpool_common_cuda_reg_data_t *handle) +{ + // CUipcEventHandle evtHandle; + // CUresult result; + // mca_mpool_common_cuda_reg_data_t *cuda_handle = (mca_mpool_common_cuda_reg_data_t*)handle; + // memcpy(&evtHandle, cuda_handle->pipeline_evtHandle[n], sizeof(evtHandle)); + // result = cuFunc.cuIpcOpenEventHandle((CUevent *)event, evtHandle); + // if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + // opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenEventHandle failed", + // true, result); + // return OPAL_ERROR; + // } + return OPAL_SUCCESS; +} + /** * Need to make sure the handle we are retrieving from the cache is still diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index c0cd59c359b..da6b86d2464 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -28,12 +28,16 @@ #define MEMHANDLE_SIZE 8 #define EVTHANDLE_SIZE 8 +typedef uint64_t cuIPCHandle[EVTHANDLE_SIZE]; + struct mca_mpool_common_cuda_reg_data_t { uint64_t memHandle[MEMHANDLE_SIZE]; uint64_t evtHandle[EVTHANDLE_SIZE]; uint64_t event; opal_ptr_t memh_seg_addr; size_t memh_seg_len; +// cuIPCHandle pipeline_evtHandle[MAX_IPC_EVENT_HANDLE]; + uint32_t pipeline_size; }; typedef struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; @@ -86,6 +90,11 @@ OPAL_DECLSPEC int mca_common_cuda_device_can_access_peer(int 
*access, int dev1, OPAL_DECLSPEC int mca_common_cuda_stage_one_init(void); OPAL_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base); OPAL_DECLSPEC void mca_common_cuda_fini(void); +OPAL_DECLSPEC int mca_common_cuda_geteventhandle(uint64_t **event, int n, mca_mpool_base_registration_t *newreg); +OPAL_DECLSPEC int mca_common_cuda_create_event(uint64_t **event); +OPAL_DECLSPEC int mca_common_cuda_record_event(uint64_t *event); +OPAL_DECLSPEC int mca_common_cuda_query_event(uint64_t *event); +OPAL_DECLSPEC int mca_common_cuda_openeventhandle(uint64_t **event, int n, mca_mpool_common_cuda_reg_data_t *handle); #if OPAL_CUDA_GDR_SUPPORT OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg); OPAL_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg); diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index 9b070ed1357..b9232a59893 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -14,8 +14,8 @@ # if PROJECT_OMPI - MPI_TESTS = checksum position position_noncontig ddt_test ddt_test_old ddt_raw unpack_ooo ddt_pack - MPI_CHECKS = to_self + MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw unpack_ooo ddt_pack + MPI_CHECKS = to_self ddt_pack endif TESTS = opal_datatype_test $(MPI_TESTS) @@ -29,12 +29,12 @@ unpack_ooo_LDADD = \ ddt_test_SOURCES = ddt_test.c ddt_lib.c ddt_lib.h ddt_test_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) -ddt_test_CFLAGS = -I/mnt/scratch/cuda-6.5.14/include -g -ddt_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/mnt/scratch/cuda-6.5.14/lib64 -lcudart +ddt_test_CFLAGS = -I/mnt/sw/cuda/include -g +ddt_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/mnt/sw/cuda/lib64 -lcudart -ddt_test_old_SOURCES = ddt_test_old.c ddt_lib.c ddt_lib.h -ddt_test_old_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) 
-ddt_test_old_LDADD = $(top_builddir)/ompi/libmpi.la +#ddt_test_old_SOURCES = ddt_test_old.c ddt_lib.c ddt_lib.h +#ddt_test_old_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) +#ddt_test_old_LDADD = $(top_builddir)/ompi/libmpi.la ddt_raw_SOURCES = ddt_raw.c ddt_lib.c ddt_lib.h ddt_raw_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index e5f58a5b348..6a41001a770 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -36,6 +36,7 @@ #include #include "opal/mca/common/cuda/common_cuda.h" #include "opal/runtime/opal_params.h" +#define CONVERTOR_CUDA 0x00400000 #endif /* Compile with: @@ -684,12 +685,18 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk #endif send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(pdt->super), count, psrc ) ) { printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); goto clean_and_return; } recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(pdt->super), count, pdst ) ) { printf( "Unable to create the recv convertor. 
Is the datatype committed ?\n" ); goto clean_and_return; @@ -775,7 +782,7 @@ int main( int argc, char* argv[] ) #endif opal_init_util(&argc, &argv); #if defined (DDT_TEST_CUDA) - mca_common_cuda_stage_one_init(); + // mca_common_cuda_stage_one_init(); #endif ompi_datatype_init(); @@ -807,11 +814,11 @@ int main( int argc, char* argv[] ) } printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); - pdt = upper_matrix(1000); + pdt = upper_matrix(4000); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 3; i++) { // local_copy_ddt_count(pdt, 1); - local_copy_with_convertor(pdt, 1, 1024*1024*200, 1000); + local_copy_with_convertor(pdt, 1, 1024*1024*100, 4000); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From c6a00d757f1ba7aca830b719548f8dfeb212abfb Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 22 Apr 2015 00:16:10 -0400 Subject: [PATCH 004/190] Add support for vector datatype. Add pipeline. Improve the GPU memory management. Conflicts: opal/mca/mpool/gpusm/mpool_gpusm.h opal/mca/mpool/gpusm/mpool_gpusm_module.c --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 69 ++-- ompi/mca/pml/ob1/pml_ob1_sendreq.c | 2 +- opal/datatype/cuda/opal_datatype_cuda.cu | 260 +++++++++++++- opal/datatype/cuda/opal_datatype_cuda.cuh | 16 + .../cuda/opal_datatype_cuda_internal.cuh | 25 ++ .../cuda/opal_datatype_orig_internal.h | 12 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 8 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 319 +++++++++++------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 8 +- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 181 +++++++++- opal/datatype/opal_convertor.c | 2 + opal/datatype/opal_convertor.h | 1 - opal/datatype/opal_datatype_gpu.c | 46 +++ opal/datatype/opal_datatype_gpu.h | 20 +- opal/datatype/opal_datatype_pack.c | 21 +- opal/datatype/opal_datatype_unpack.c | 21 +- opal/mca/btl/btl.h | 2 + opal/mca/btl/smcuda/btl_smcuda.c | 106 +++++- opal/mca/btl/smcuda/btl_smcuda.h | 36 ++ opal/mca/btl/smcuda/btl_smcuda_component.c | 65 ++++ 
opal/mca/common/cuda/common_cuda.c | 11 +- opal/mca/common/cuda/common_cuda.h | 9 +- opal/mca/mpool/gpusm/mpool_gpusm_module.c | 2 +- test/datatype/ddt_test.c | 22 +- 24 files changed, 1033 insertions(+), 231 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index c2b2708bebf..97383e008ee 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -39,6 +39,7 @@ #include "opal/datatype/opal_datatype_gpu.h" #include "opal/mca/common/cuda/common_cuda.h" +#include "opal/mca/btl/smcuda/btl_smcuda.h" #define CUDA_DDT_WITH_RDMA 1 @@ -51,7 +52,7 @@ size_t mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_events( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - struct opal_convertor_t* convertor); + struct opal_convertor_t* convertor, size_t pipeline_size, int lindex); int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -108,7 +109,8 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); unsigned char *base; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); - base = opal_datatype_get_gpu_buffer(); + base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); + convertor->gpu_buffer_ptr = base; sendreq->req_send.req_bytes_packed = convertor->local_size; printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( @@ -117,22 +119,34 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, sendreq->req_send.req_bytes_packed, sendreq->req_rdma))) { - mca_pml_ob1_rdma_cuda_btl_register_events(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor); + size_t pipeline_size = convertor->local_size; struct iovec iov; int rc_dt = 0; uint32_t iov_count = 1; - iov.iov_base = NULL; - iov.iov_len = 
0; + iov.iov_base = base; + iov.iov_len = pipeline_size; size_t max_data = 0; + int seq = 0; + /* the first pack here is used to get the correct size of pipeline_size */ + /* because pack may not use the whole pipeline size */ rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - // mca_common_cuda_record_event(&convertor->pipeline_event[0]); - // uint64_t event, *ep; - // ep = &event; - // mca_common_cuda_create_event((uint64_t**)ep); - // // mca_common_cuda_record_event(ep); - // printf("success record event %d\n", event); + pipeline_size = max_data; + int lindex = mca_btl_smcuda_alloc_cuda_dt_clone(); + assert(lindex >= 0); + mca_pml_ob1_rdma_cuda_btl_register_events(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, pipeline_size, lindex); + mca_btl_smcuda_cuda_dt_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, pipeline_size, lindex); + rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); + + mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); + while (rc_dt != 1) { + iov.iov_base += pipeline_size; + seq ++; + rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); + } + mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, -1); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { mca_pml_ob1_free_rdma_resources(sendreq); } @@ -208,24 +222,23 @@ size_t mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_events( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - struct opal_convertor_t* convertor) + struct opal_convertor_t* convertor, size_t pipeline_size, int lindex) { - // uint32_t i, j; - // for (i = 0; i < num_btls_used; i++) { - // mca_btl_base_registration_handle_t *handle = rdma_btls[i].btl_reg; - // mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t *) - // ((intptr_t) handle - 
offsetof (mca_mpool_common_cuda_reg_t, data)); - // printf("base %p\n", cuda_reg->base.base); - // for (j = 0; j < MAX_IPC_EVENT_HANDLE; j++) { - // uint64_t *event = &convertor->pipeline_event[j]; - // convertor->pipeline_event[j] = 0; - // mca_common_cuda_geteventhandle(&event, j, (mca_mpool_base_registration_t *)cuda_reg); - // convertor->pipeline_event[j] = *event; - // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); - // } - // cuda_reg->data.pipeline_size = 1000; - // - // } + uint32_t i, j; + for (i = 0; i < num_btls_used; i++) { + mca_btl_base_registration_handle_t *handle = rdma_btls[i].btl_reg; + mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t *) + ((intptr_t) handle - offsetof (mca_mpool_common_cuda_reg_t, data)); + // printf("base %p\n", cuda_reg->base.base); + // for (j = 0; j < MAX_IPC_EVENT_HANDLE; j++) { + // mca_common_cuda_geteventhandle(&convertor->pipeline_event[j], j, (mca_mpool_base_registration_t *)cuda_reg); + // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); + // } + printf("i send pipeline %ld\n", pipeline_size); + cuda_reg->data.pipeline_size = pipeline_size; + cuda_reg->data.lindex = lindex; + + } return 0; } diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 50b11d36dff..78b7188cdbb 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -679,7 +679,7 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq, if ( (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA)) { sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == true) { - data_ptr = sendreq->req_send.req_base.req_convertor.gpu_buffer_ptr_source; + data_ptr = sendreq->req_send.req_base.req_convertor.gpu_buffer_ptr; printf("START RMDA data_ptr %p\n", data_ptr); } else { opal_convertor_get_current_pointer 
(&sendreq->req_send.req_base.req_convertor, &data_ptr); diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 1debbd221a5..387f75583ce 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -3,6 +3,7 @@ #include #include #include +#include #include /* @@ -39,6 +40,9 @@ OPAL_DECLSPEC const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PR /***** my variables ********/ + +ddt_cuda_list_t *cuda_free_list; +ddt_cuda_device_t *cuda_device; ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; unsigned char *pBaseBuf_GPU, *gpu_src_const, *gpu_dest_const; unsigned char *ddt_cuda_pack_buffer, *ddt_cuda_unpack_buffer; @@ -54,12 +58,172 @@ uint8_t opal_datatype_cuda_debug; //uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; + +static inline ddt_cuda_buffer_t* obj_ddt_cuda_buffer_new() +{ + ddt_cuda_buffer_t *p = (ddt_cuda_buffer_t *)malloc(sizeof(ddt_cuda_buffer_t)); + p->next = NULL; + p->prev = NULL; + p->size = 0; + p->gpu_addr = NULL; + return p; +} + +static inline void obj_ddt_cuda_buffer_chop(ddt_cuda_buffer_t *p) +{ + p->next = NULL; + p->prev = NULL; +} + +static inline void obj_ddt_cuda_buffer_reset(ddt_cuda_buffer_t *p) +{ + p->size = 0; + p->gpu_addr = NULL; +} + +static ddt_cuda_list_t* init_cuda_free_list() +{ + ddt_cuda_list_t *list = NULL; + ddt_cuda_buffer_t *p, *prev; + int i; + list = (ddt_cuda_list_t *)malloc(sizeof(ddt_cuda_list_t)); + p = obj_ddt_cuda_buffer_new(); + list->head = p; + prev = p; + for (i = 1; i < DT_CUDA_FREE_LIST_SIZE; i++) { + p = obj_ddt_cuda_buffer_new(); + prev->next = p; + p->prev = prev; + prev = p; + } + list->tail = p; + list->nb_elements = DT_CUDA_FREE_LIST_SIZE; + return list; +} + +static inline ddt_cuda_buffer_t* cuda_list_pop_tail(ddt_cuda_list_t *list) +{ + ddt_cuda_buffer_t *p = NULL; + p = list->tail; + if (p == NULL) { + return p; + } else { + list->nb_elements --; + if (list->head == p) { + list->head = NULL; + 
list->tail = NULL; + } else { + list->tail = p->prev; + p->prev->next = NULL; + obj_ddt_cuda_buffer_chop(p); + } + return p; + } +} + +static inline void cuda_list_push_head(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item) +{ + ddt_cuda_buffer_t * orig_head = list->head; + assert(item->next == NULL && item->prev == NULL); + list->head = item; + item->next = orig_head; + if (orig_head == NULL) { + list->tail = item; + } + list->nb_elements ++; +} + +static inline void cuda_list_push_tail(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item) +{ + ddt_cuda_buffer_t * orig_tail = list->tail; + assert(item->next == NULL && item->prev == NULL); + list->tail = item; + item->prev = orig_tail; + if (orig_tail == NULL) { + list->head = item; + } + list->nb_elements ++; +} + +static inline void cuda_list_delete(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item) +{ + if (item->prev == NULL && item->next == NULL) { + list->head = NULL; + list->tail = NULL; + }else if (item->prev == NULL && item->next != NULL) { + list->head = item->next; + item->next->prev = NULL; + } else if (item->next == NULL && item->prev != NULL) { + list->tail = item->prev; + item->prev->next = NULL; + } else { + item->prev->next = item->next; + item->next->prev = item->prev; + } + list->nb_elements --; + obj_ddt_cuda_buffer_chop(item); +} + +static inline void cuda_list_insert_before(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item, ddt_cuda_buffer_t *next) +{ + assert(item->next == NULL && item->prev == NULL); + item->next = next; + item->prev = next->prev; + next->prev = item; + if (list->head == next) { + list->head = item; + } + list->nb_elements ++; +} + +static inline void cuda_list_item_merge_by_addr(ddt_cuda_list_t *list) +{ + ddt_cuda_buffer_t *ptr = NULL; + ddt_cuda_buffer_t *next = NULL; + ptr = list->head; + while(ptr != NULL) { + next = ptr->next; + if (next == NULL) { + break; + } else if ((ptr->gpu_addr + ptr->size) == next->gpu_addr) { + ptr->size += next->size; + cuda_list_delete(list, next); + } 
else { + ptr = ptr->next; + } + } +} + void opal_datatype_cuda_init(void) { uint32_t i; - int cuda_device = OPAL_GPU_INDEX; - cudaSetDevice(cuda_device); + int device = OPAL_GPU_INDEX; + cudaSetDevice(device); + + cuda_free_list = init_cuda_free_list(); + + /* init device */ + cuda_device = (ddt_cuda_device_t *)malloc(sizeof(ddt_cuda_device_t)*1); + for (i = 0; i < 1; i++) { + unsigned char *gpu_ptr = NULL; + if (cudaMalloc((void **)(&gpu_ptr), sizeof(char)*DT_CUDA_BUFFER_SIZE) != cudaSuccess) { + DT_CUDA_DEBUG( opal_cuda_output( 0, "cudaMalloc is failed in GPU %d\n", i); ); + } + cudaMemset(gpu_ptr, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); + cuda_device[i].gpu_buffer = gpu_ptr; + + cuda_device[i].buffer_free_size = DT_CUDA_BUFFER_SIZE; + ddt_cuda_buffer_t *p = obj_ddt_cuda_buffer_new(); + p->size = DT_CUDA_BUFFER_SIZE; + p->gpu_addr = gpu_ptr; + cuda_device[i].buffer_free.head = p; + cuda_device[i].buffer_free.tail = cuda_device[i].buffer_free.head; + + cuda_device[i].buffer_used.head = NULL; + cuda_device[i].buffer_used.tail = NULL; + cuda_device[i].buffer_used_size = 0; + } cudaMalloc((void **)&cuda_desc_d, sizeof(ddt_cuda_desc_t)); cudaMallocHost((void **)&cuda_desc_h, sizeof(ddt_cuda_desc_t)); @@ -72,11 +236,12 @@ void opal_datatype_cuda_init(void) // cuda_desc_h->iov[i].iov_base = iov_base; // cuda_desc_h->iov[i].iov_len = IOV_LEN; // } - printf("malloc cuda packing buffer\n"); + cudaMalloc((void **)(&ddt_cuda_pack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); + printf("malloc cuda packing buffer, %p\n", ddt_cuda_pack_buffer); cudaMemset(ddt_cuda_pack_buffer, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); - printf("malloc cuda unpacking buffer\n"); cudaMalloc((void **)(&ddt_cuda_unpack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); + printf("malloc cuda unpacking buffer, %p\n", ddt_cuda_unpack_buffer); cudaMemset(ddt_cuda_unpack_buffer, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); cuda_desc_h->iov[0].iov_base = ddt_cuda_pack_buffer; @@ -196,6 +361,93 @@ unsigned char* 
opal_cuda_get_gpu_pack_buffer() } } +void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) +{ + ddt_cuda_device_t *device = &cuda_device[gpu_id]; + if (device->buffer_free_size < size) { + return NULL; + } + ddt_cuda_buffer_t *ptr = NULL; + void *addr = NULL; + ptr = device->buffer_free.head; + while (ptr != NULL) { + if (ptr->size >= size) { + addr = ptr->gpu_addr; + ptr->size -= size; + if (ptr->size == 0) { + cuda_list_delete(&device->buffer_free, ptr); + obj_ddt_cuda_buffer_reset(ptr); + cuda_list_push_head(cuda_free_list, ptr); + } else { + ptr->gpu_addr += size; + } + break; + } + ptr = ptr->next; + } + + if (ptr == NULL) { + return NULL; + } else { + ddt_cuda_buffer_t *p = cuda_list_pop_tail(cuda_free_list); + if (p == NULL) { + p = obj_ddt_cuda_buffer_new(); + } + p->size = size; + p->gpu_addr = (unsigned char*)addr; + cuda_list_push_head(&device->buffer_used, p); + device->buffer_used_size += size; + device->buffer_free_size -= size; + DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc GPU buffer %p.\n", addr); ); + return addr; + } +} + +void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) +{ + ddt_cuda_device_t *device = &cuda_device[gpu_id]; + ddt_cuda_buffer_t *ptr = NULL; + ddt_cuda_buffer_t *ptr_next = NULL; + ptr = device->buffer_used.head; + while (ptr != NULL) { + if (ptr->gpu_addr == addr) { + cuda_list_delete(&device->buffer_used, ptr); + ptr_next = device->buffer_free.head; + while (ptr_next != NULL) { + if (ptr_next->gpu_addr > addr) { + break; + } + ptr_next = ptr_next->next; + } + if (ptr_next == NULL) { + /* buffer_free is empty, or insert to last one */ + cuda_list_push_tail(&device->buffer_free, ptr); + } else { + cuda_list_insert_before(&device->buffer_free, ptr, ptr_next); + } + cuda_list_item_merge_by_addr(&device->buffer_free); + device->buffer_free_size += ptr->size; + break; + } + ptr = ptr->next; + } + if (ptr == NULL) { + DT_CUDA_DEBUG( opal_cuda_output( 0, "addr %p is not managed.\n", addr); ); + } + DT_CUDA_DEBUG( 
opal_cuda_output( 0, "Free GPU buffer %p.\n", addr); ); +} + +void opal_dump_cuda_list(ddt_cuda_list_t *list) +{ + ddt_cuda_buffer_t *ptr = NULL; + ptr = list->head; + DT_CUDA_DEBUG( opal_cuda_output( 0, "DUMP cuda list %p, nb_elements %d\n", list, list->nb_elements); ); + while (ptr != NULL) { + DT_CUDA_DEBUG( opal_cuda_output( 0, "\titem addr %p, size %ld.\n", ptr->gpu_addr, ptr->size); ); + ptr = ptr->next; + } +} + /* from internal.h*/ void opal_cuda_output(int output_id, const char *format, ...) { diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 5797ceb55d8..04dd5f88a26 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -13,6 +13,11 @@ int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, uint32_t* out_size, size_t* max_data ); +int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -27,6 +32,11 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert struct iovec* iov, uint32_t* out_size, size_t* max_data ); + +int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -50,6 +60,12 @@ void opal_cuda_sync_device(void); int32_t opal_cuda_is_gpu_buffer(const void *ptr); +void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); + +void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); + +void opal_dump_cuda_list(ddt_cuda_list_t *list); + unsigned char* opal_cuda_get_gpu_pack_buffer(); } diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 
be264484153..567e81218ec 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -18,6 +18,7 @@ #define IOV_ARRAY_SIZE 1 #define DT_CUDA_BUFFER_SIZE 1024*1024*200 +#define DT_CUDA_FREE_LIST_SIZE 50 #define THREAD_PER_BLOCK 32 #define CUDA_WARP_SIZE 32 @@ -76,6 +77,30 @@ typedef struct { uint32_t nb_tasks; } ddt_cuda_iov_dist_t; +typedef struct ddt_cuda_buffer{ + unsigned char* gpu_addr; + size_t size; + struct ddt_cuda_buffer *next; + struct ddt_cuda_buffer *prev; +} ddt_cuda_buffer_t; + +typedef struct { + ddt_cuda_buffer_t *head; + ddt_cuda_buffer_t *tail; + size_t nb_elements; +} ddt_cuda_list_t; + +typedef struct { + int device_id; + unsigned char* gpu_buffer; + ddt_cuda_list_t buffer_free; + ddt_cuda_list_t buffer_used; + size_t buffer_free_size; + size_t buffer_used_size; +} ddt_cuda_device_t; + +extern ddt_cuda_list_t *cuda_free_list; +extern ddt_cuda_device_t *cuda_device; extern ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; extern unsigned char* pBaseBuf_GPU; extern unsigned char *ddt_cuda_pack_buffer, *ddt_cuda_unpack_buffer; diff --git a/opal/datatype/cuda/opal_datatype_orig_internal.h b/opal/datatype/cuda/opal_datatype_orig_internal.h index 37b1d1be51b..90561359f75 100644 --- a/opal/datatype/cuda/opal_datatype_orig_internal.h +++ b/opal/datatype/cuda/opal_datatype_orig_internal.h @@ -326,7 +326,6 @@ struct opal_convertor_t { void * stream; /**< CUstream for async copy */ unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ - unsigned char * gpu_buffer_ptr_source; /**< source address of GPU buffer start to pack, update in packing function */ uint64_t * pipeline_event[MAX_IPC_EVENT_HANDLE]; /**< cuda event for pipeline */ #endif /* size: 248, cachelines: 4, members: 20 */ @@ -531,13 +530,10 @@ do { \ #define PUSH_STACK( PSTACK, STACK_POS, INDEX, TYPE, COUNT, DISP) \ do { \ - dt_stack_t* pTempStack = (PSTACK) + 1; \ - if (threadIdx.x == 0) { \ - SAVE_STACK( pTempStack, 
(INDEX), (TYPE), (COUNT), (DISP) ); \ - } \ - __syncthreads(); \ - (STACK_POS)++; \ - (PSTACK) = pTempStack; \ + dt_stack_t* pTempStack = (PSTACK) + 1; \ + SAVE_STACK( pTempStack, (INDEX), (TYPE), (COUNT), (DISP) ); \ + (STACK_POS)++; \ + (PSTACK) = pTempStack; \ } while(0) #define UPDATE_INTERNAL_COUNTERS( DESCRIPTION, POSITION, ELEMENT, COUNTER ) \ diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 98208dc0f39..96bdc12d961 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -547,10 +547,10 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (_i % nb_elements == 0 ) { - DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => _i %d, actual _i %d, count %d\n", - tid, _destination_tmp, _source_tmp, (unsigned long)size, _i/nb_elements, _i, copy_loops ); - } + // if (_i % nb_elements == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => _i %d, actual _i %d, count %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)size, _i/nb_elements, _i, copy_loops ); + // } // if (_i / nb_elements ==1 && tid == 0 ) { // DBGPRINT("tid %d, pack 3. 
memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 14fdcfca346..a5963b74d3f 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -168,6 +168,167 @@ int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, } +int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_packed = 0; /* total amount packed this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + const opal_datatype_t *pData = pConvertor->pDesc; + unsigned char *conv_ptr, *iov_ptr; + size_t iov_len_local; + uint32_t iov_count; + uint8_t transfer_required; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack( %p:%p, {%p, %lu}, %d )\n", + (void*)pConvertor, (void*)pConvertor->pBaseBuf, + iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); + + printf("I am in simple pack vector, max_data %lu, iov_len %lu\n", *max_data, iov[0].iov_len); + description = pConvertor->use_desc->desc; + + /* For the first step we have to add both displacement to the source. After in the + * main while loop we will set back the conv_ptr to the correct value. 
This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + pStack--; + pConvertor->stack_pos--; + pElem = &(description[pos_desc]); + + DT_CUDA_DEBUG( opal_cuda_output( 0, "pack start pos_desc %d count_desc %d disp %ld\n" + "stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + + + for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { + if ((iov[0].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + // if (iov[0].iov_len == 0) { + // buffer_size = DT_CUDA_BUFFER_SIZE; + // } else { + // buffer_size = iov[0].iov_len; + // } + pConvertor->gpu_buffer_ptr = ddt_cuda_pack_buffer; + + if (iov[iov_count].iov_base == NULL) { + iov[iov_count].iov_base = ddt_cuda_pack_buffer; + iov_ptr = ddt_cuda_pack_buffer; + } else { + iov_ptr = (unsigned char *)iov[iov_count].iov_base; + } + transfer_required = 0; + } else { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 1; + } + iov_ptr = ddt_cuda_pack_buffer; + iov_len_local = iov[iov_count].iov_len; + printf("original local %d\n", iov_len_local); + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + /* should not go into here */ + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + DT_CUDA_DEBUG( opal_cuda_output( 1, "pack end_loop count %d stack_pos %d" + " pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, + pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + if( --(pStack->count) == 0 ) { /* end of loop */ + if( 0 == pConvertor->stack_pos ) { + /* we lie about 
the size of the next element in order to + * make sure we exit the main loop. + */ + *out_size = iov_count; + goto complete_loop; /* completed */ + } + pConvertor->stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + pStack->disp += (pData->ub - pData->lb); + } else { + assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DT_CUDA_DEBUG( opal_cuda_output( 1, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + count_desc = 0; + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. 
*/ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + } + complete_loop: + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_packed += iov[iov_count].iov_len; + printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[iov_count].iov_base, ddt_cuda_pack_buffer, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: DtoH memcpy in %ld microsec\n", total_time ); +#endif + } + *max_data = total_packed; + pConvertor->bConverted += total_packed; /* update the already converted bytes */ + pConvertor->bConverted = pConvertor->local_size; + *out_size = iov_count; + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + printf("total packed %lu\n", pConvertor->bConverted); + return 1; + } + /* Save the global position for the next round */ + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, + conv_ptr - pConvertor->pBaseBuf ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + return 0; +} + void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, @@ -187,8 +348,8 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); #if 
!defined(OPAL_DATATYPE_CUDA_DRY_RUN) - _source = pBaseBuf_GPU; - _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; + // _source = pBaseBuf_GPU; + // _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; #endif tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; @@ -205,105 +366,6 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, cudaDeviceSynchronize(); } - -// int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, -// struct iovec* iov, -// uint32_t* out_size, -// size_t* max_data ) -// { -// uint32_t i; -// uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, j, dst_offset; -// uint32_t nb_blocks, thread_per_block; -// dt_elem_desc_t* description; -// size_t length; -// -// // return -99; -// -// cuda_iov_count = 4000; -// opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); -// printf("iov count %d, length %d\n", cuda_iov_count, length); -// -// description = pConvertor->use_desc->desc; -// current_block = 0; -// task_iteration = 0; -// dst_offset = 0; -// thread_per_block = CUDA_WARP_SIZE * 4; -// nb_blocks = 512; -// for (i = 0; i < cuda_iov_count; i++) { -// count_desc = cuda_iov[i].iov_len / sizeof(double); -// // printf("i = %d\t, iov_base %p\t, iov_len %ld\t, count %d\n", i, cuda_iov[i].iov_base, cuda_iov[i].iov_len, count_desc); -// nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; -// for (j = 0; j < nb_blocks_per_description; j++) { -// description_dist_h[current_block].description_index[task_iteration] = i; -// description_dist_h[current_block].description_local_index[task_iteration] = j; -// description_dist_h[current_block].dst_offset[task_iteration] = dst_offset; -// description_dist_h[current_block].description_used = task_iteration + 1; -// if ( (j+1) * thread_per_block <= count_desc) { -// dst_offset += thread_per_block; -// } else { -// dst_offset += thread_per_block - ((j+1)*thread_per_block - count_desc); -// } -// 
current_block += 1; -// if (current_block >= nb_blocks) { -// current_block = 0; -// task_iteration ++; -// } -// } -// } -// -// uint32_t pos_desc; -// dt_elem_desc_t* pElem; -// // for (i = 0; i < nb_blocks; i++) { -// // printf("block %d\t, used %d\n", i, description_dist_h[i].description_used); -// // for (j = 0; j < description_dist_h[i].description_used; j++) { -// // pos_desc = description_dist_h[i].description_index[j]; -// // pElem = &(description[pos_desc]); -// // printf("i %d\t, descp_pos %d\t, local_index %d\t, count %d\t, dst offset %d\n", j, description_dist_h[i].description_index[j], description_dist_h[i].description_local_index[j], pElem->elem.count, description_dist_h[i].dst_offset[j]); -// // } -// // } -// -// cudaMemcpy(description_dist_d, description_dist_h, sizeof(ddt_cuda_description_dist_t)*(nb_blocks), cudaMemcpyHostToDevice); -// -// if (cuda_desc_h->description_max_count != 0) { -// if (cuda_desc_h->description_max_count >= (pConvertor->use_desc->used+1)) { -// cuda_desc_h->description_count = pConvertor->use_desc->used+1; -// } else { -// cudaFree(cuda_desc_h->description); -// cuda_desc_h->description = NULL; -// cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); -// description_d = cuda_desc_h->description; -// cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; -// cuda_desc_h->description_count = pConvertor->use_desc->used+1; -// } -// -// } else { -// cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); -// description_d = cuda_desc_h->description; -// cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; -// cuda_desc_h->description_count = pConvertor->use_desc->used+1; -// } -// cudaMemcpy(description_d, description, sizeof(dt_elem_desc_t)*(cuda_desc_h->description_count), cudaMemcpyHostToDevice); -// -// unsigned char* pBaseBuf; -// #if defined(OPAL_DATATYPE_CUDA_DRY_RUN) -// pBaseBuf = 
pConvertor->pBaseBuf; -// #else -// pBaseBuf = pBaseBuf_GPU; -// #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ -// -// for (i = 0; i < *out_size; i++) { -// #if defined (OPAL_DATATYPE_CUDA_DRY_RUN) -// cuda_desc_h->iov[i].iov_base = iov[i].iov_base; -// #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ -// cuda_desc_h->iov[i].iov_len = iov[i].iov_len; -// } -// -// opal_generic_simple_pack_cuda_iov_kernel<<>>(description_dist_d, description_d, current_block, cuda_desc_h->iov, pBaseBuf); -// cudaDeviceSynchronize(); -// -// return 1; -// } - int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -313,10 +375,10 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, residue_desc; uint32_t nb_blocks, thread_per_block; size_t length, buffer_size, length_per_iovec, dst_offset; - unsigned char *destination; + unsigned char *destination, *destination_tmp; size_t total_packed, total_converted; int32_t complete_flag = 0; - uint8_t buffer_isfull = 0, transfer_required; + uint8_t buffer_isfull = 0, transfer_required, free_required; uint32_t convertor_flags; dt_elem_desc_t* description; dt_elem_desc_t* pElem; @@ -337,13 +399,6 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype packing using iovec\n"); ); -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - destination = (unsigned char*)iov[0].iov_base; -#else -// pConvertor->pBaseBuf = pBaseBuf_GPU; - printf("Pack GPU base %p, gpu_buffer %p\n", pConvertor->pBaseBuf, ddt_cuda_pack_buffer); - destination = ddt_cuda_pack_buffer; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -354,7 +409,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor pElem = &(description[pStack->index]); printf("size elem %lu, size %d\n", 
pElem->elem.common.type, opal_datatype_basicDatatypesSize[pElem->elem.common.type]); - assert(opal_datatype_basicDatatypesSize[pElem->elem.common.type] != 0); +// assert(opal_datatype_basicDatatypesSize[pElem->elem.common.type] != 0); printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); if ((iov[0].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[0].iov_base)) { @@ -363,24 +418,34 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } else { buffer_size = iov[0].iov_len; } - pConvertor->gpu_buffer_ptr = ddt_cuda_pack_buffer; - pConvertor->gpu_buffer_ptr_source = pConvertor->gpu_buffer_ptr + pConvertor->bConverted; if (iov[0].iov_base == NULL) { - iov[0].iov_base = ddt_cuda_pack_buffer; - destination = ddt_cuda_pack_buffer; + iov[0].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char *)iov[0].iov_base; + free_required = 1; } else { destination = (unsigned char *)iov[0].iov_base; + free_required = 0; } transfer_required = 0; + pConvertor->gpu_buffer_ptr = destination; } else { buffer_size = iov[0].iov_len; - pConvertor->gpu_buffer_ptr = NULL; - pConvertor->gpu_buffer_ptr_source = NULL; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(buffer_size, 0); + } transfer_required = 1; + free_required = 1; +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + destination = (unsigned char*)iov[0].iov_base; +#else + destination = pConvertor->gpu_buffer_ptr; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ } - printf("start packing from %p\n", destination); + destination_tmp = destination; + + DT_CUDA_DEBUG ( opal_cuda_output(0, "Pack GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); cuda_iov_count = 1000; total_packed = 0; @@ -446,7 +511,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor alignment = ALIGNMENT_CHAR; } - // alignment = ALIGNMENT_CHAR; + alignment = 
ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; @@ -498,18 +563,10 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: iov is prepared in %ld microsec, cudaMemcpy will be submit to CUDA stream %d\n", total_time, cuda_streams->current_stream_id); + printf( "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d\n", destination_tmp, total_time, cuda_streams->current_stream_id); #endif cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); - - for (i = 0; i < *out_size; i++) { -#if defined (OPAL_DATATYPE_CUDA_DRY_RUN) - cuda_desc_h->iov[i].iov_base = iov[i].iov_base; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - cuda_desc_h->iov[i].iov_len = iov[i].iov_len; - } - opal_generic_simple_pack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current); cuda_streams->current_stream_id ++; cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; @@ -541,7 +598,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor GET_TIME(start); #endif if (transfer_required) { - cudaMemcpy(iov[0].iov_base, ddt_cuda_pack_buffer, total_packed, cudaMemcpyDeviceToHost); + cudaMemcpy(iov[0].iov_base, destination_tmp, total_packed, cudaMemcpyDeviceToHost); } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -568,6 +625,10 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); 
+ pConvertor->gpu_buffer_ptr = NULL; + } return 1; } return 0; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 0ae85e22eef..35a4ff73078 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -308,10 +308,10 @@ __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { _destination_tmp = _dst_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (_i % nb_elements == 0 ) { - DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => _i %d, actual _i %d\n", - tid, _destination_tmp, _source_tmp, (unsigned long)size, _i/nb_elements, _i ); - } + // if (_i % nb_elements == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)size, _i/nb_elements, _i ); + // } // if (_i / nb_elements ==1 && tid == 0 ) { // DBGPRINT("tid %d, pack 3. 
memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index dccf9f23e82..e1f96ea6a2f 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -107,6 +107,147 @@ int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ } +int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_unpacked = 0; /* total size unpacked this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + const opal_datatype_t *pData = pConvertor->pDesc; + unsigned char *conv_ptr, *iov_ptr; + size_t iov_len_local; + uint32_t iov_count; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + printf("i am in simple unpack vector, max_data %lu, iov len %lu\n", *max_data, iov[0].iov_len); + DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u )\n", + (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ) + + description = pConvertor->use_desc->desc; + + /* For the first step we have to add both displacement to the source. After in the + * main while loop we will set back the source_base to the correct value. 
This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + pStack--; + pConvertor->stack_pos--; + pElem = &(description[pos_desc]); + + DT_CUDA_DEBUG( opal_cuda_output( 0, "unpack start pos_desc %d count_desc %d disp %ld\n" + "stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); + + for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { + iov_ptr = ddt_cuda_unpack_buffer; +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + } else { + cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: HtoD memcpy in %ld microsec\n", total_time ); +#endif + iov_len_local = iov[iov_count].iov_len; + if( 0 != pConvertor->partial_length ) { + /* not support yet */ + } + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + /* should not go to here */ + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + DT_CUDA_DEBUG( opal_cuda_output( 0, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + (long)pStack->disp, (unsigned long)iov_len_local ); ); + if( --(pStack->count) == 0 ) { /* end of loop */ + if( 0 == pConvertor->stack_pos ) { + /* Do the same thing as when the loop is completed */ + iov[iov_count].iov_len -= 
iov_len_local; /* update the amount of valid data */ + total_unpacked += iov[iov_count].iov_len; + iov_count++; /* go to the next */ + goto complete_conversion; + } + pConvertor->stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + pStack->disp += (pData->ub - pData->lb); + } else { + assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DT_CUDA_DEBUG( opal_cuda_output( 0, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + count_desc = 0; + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. 
*/ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + } + complete_loop: + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_unpacked += iov[iov_count].iov_len; + } + complete_conversion: + *max_data = total_unpacked; + pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ + *out_size = iov_count; + pConvertor->bConverted = pConvertor->local_size; + if( pConvertor->bConverted == pConvertor->remote_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + printf("total unpacked %lu\n", pConvertor->bConverted); + return 1; + } + /* Save the global position for the next round */ + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, + conv_ptr - pConvertor->pBaseBuf ); + DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + return 0; +} + int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -116,10 +257,11 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, dst_offset, residue_desc; uint32_t nb_blocks, thread_per_block; size_t length, buffer_size, length_per_iovec; - unsigned char *source; + unsigned char *source, *source_tmp; size_t total_unpacked, total_converted; int32_t complete_flag = 0; uint8_t buffer_isfull = 0; + uint8_t free_required = 0; uint32_t convertor_flags; dt_elem_desc_t* description; dt_elem_desc_t* 
pElem; @@ -145,16 +287,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype UNpacking using iovec\n"); ); -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - source = (unsigned char*)iov[0].iov_base; -#else -// pConvertor->pBaseBuf = pBaseBuf_GPU; - // printf("Unpack GPU base %p, iov buffer %p\n", pConvertor->pBaseBuf, iov[0].iov_base); - source = ddt_cuda_unpack_buffer; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - // double *vtmp = (double *)iov[0].iov_base; - printf("recevied unpacked iov buffer, len %d\n", iov[0].iov_len); // for (uint32_t i = 0; i < iov[0].iov_len/sizeof(double); i++) { // printf(" %1.f ", *vtmp); // vtmp ++; @@ -165,9 +298,23 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif if (opal_cuda_is_gpu_buffer(iov[0].iov_base)) { source = (unsigned char*)iov[0].iov_base; - } else { + free_required = 0; + } else { +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + source = (unsigned char*)iov[0].iov_base; +#else + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + } + source = pConvertor->gpu_buffer_ptr; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + free_required = 1; } + + source_tmp = source; + + DT_CUDA_DEBUG ( opal_cuda_output(0, "UNpack GPU base %p, unpack from buffer %p, total size %ld\n", pConvertor->pBaseBuf, source, iov[0].iov_len); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -231,7 +378,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert alignment = ALIGNMENT_CHAR; } - // alignment = ALIGNMENT_CHAR; + alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; @@ -283,7 +430,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( 
opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: iov is prepared in %ld microsec, cudaMemcpy will be submit to CUDA stream %d\n", total_time, cuda_streams->current_stream_id); + printf( "[Timing]: UNpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d\n", source_tmp, total_time, cuda_streams->current_stream_id); #endif cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); @@ -326,6 +473,10 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } return 1; } return 0; @@ -349,8 +500,8 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); - _destination = pBaseBuf_GPU; - _source = (unsigned char*)cuda_desc_h->iov[0].iov_base; + // _destination = pBaseBuf_GPU; + // _source = (unsigned char*)cuda_desc_h->iov[0].iov_base; tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 4ed3773495f..7a8448afbde 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -590,6 +590,7 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, } else { if ((convertor->flags & CONVERTOR_CUDA) && OPAL_DATATYPE_CUDA_KERNEL) { convertor->fAdvance = opal_generic_simple_unpack_cuda; + convertor->gpu_buffer_ptr = NULL; } else { convertor->fAdvance = 
opal_generic_simple_unpack; } @@ -636,6 +637,7 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, } else { if ((convertor->flags & CONVERTOR_CUDA) && OPAL_DATATYPE_CUDA_KERNEL ) { convertor->fAdvance = opal_generic_simple_pack_cuda; + convertor->gpu_buffer_ptr = NULL; } else { convertor->fAdvance = opal_generic_simple_pack; } diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 6ed9e311d84..1ee0c010e63 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -113,7 +113,6 @@ struct opal_convertor_t { void * stream; /**< CUstream for async copy */ unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ - unsigned char * gpu_buffer_ptr_source; /**< source address of GPU buffer start to pack, update in packing function */ uint64_t * pipeline_event[MAX_IPC_EVENT_HANDLE]; /**< cuda event for pipeline */ #endif /* size: 248, cachelines: 4, members: 20 */ diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c index f8c4785994d..c136a55ea71 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -62,6 +62,16 @@ int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pCo struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; + +int32_t (*opal_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) = NULL; + +int32_t (*opal_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) = NULL; void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -85,6 +95,10 @@ void (*opal_cuda_sync_device_p)(void) = NULL; unsigned char* (*opal_cuda_get_gpu_pack_buffer_p)(void) = NULL; +void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id) = NULL; + +void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id) = 
NULL; + int32_t opal_datatype_gpu_init(void) { char *error; @@ -140,6 +154,20 @@ int32_t opal_datatype_gpu_init(void) return OPAL_ERROR; } + *(void **)(&opal_generic_simple_pack_function_cuda_vector_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_pack_function_cuda_vector"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_generic_simple_pack_function_cuda_vector error: %s\n", error); + opal_generic_simple_pack_function_cuda_vector_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_generic_simple_unpack_function_cuda_vector_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_unpack_function_cuda_vector"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_generic_simple_unpack_function_cuda_vector error: %s\n", error); + opal_generic_simple_unpack_function_cuda_vector_p = NULL; + return OPAL_ERROR; + } + *(void **)(&pack_contiguous_loop_cuda_p) = dlsym(opal_datatype_cuda_handle, "pack_contiguous_loop_cuda"); if ((error = dlerror()) != NULL) { fprintf(stderr, "pack_contiguous_loop_cuda error: %s\n", error); @@ -175,6 +203,20 @@ int32_t opal_datatype_gpu_init(void) return OPAL_ERROR; } + *(void **)(&opal_cuda_free_gpu_buffer_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_free_gpu_buffer"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_cuda_free_gpu_buffer error: %s\n", error); + opal_cuda_free_gpu_buffer_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_cuda_malloc_gpu_buffer_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_malloc_gpu_buffer"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_cuda_malloc_gpu_buffer error: %s\n", error); + opal_cuda_malloc_gpu_buffer_p = NULL; + return OPAL_ERROR; + } + (*opal_datatype_cuda_init_p)(); printf("cuda init done\n"); } @@ -193,11 +235,15 @@ int32_t opal_datatype_gpu_fini(void) opal_generic_simple_unpack_function_cuda_p = NULL; opal_generic_simple_pack_function_cuda_iov_p = NULL; opal_generic_simple_unpack_function_cuda_iov_p = NULL; + 
opal_generic_simple_pack_function_cuda_vector_p = NULL; + opal_generic_simple_unpack_function_cuda_vector_p = NULL; pack_contiguous_loop_cuda_p = NULL; unpack_contiguous_loop_cuda_p = NULL; pack_predefined_data_cuda_p = NULL; opal_cuda_sync_device_p = NULL; opal_cuda_get_gpu_pack_buffer_p = NULL; + opal_cuda_free_gpu_buffer_p = NULL; + opal_cuda_malloc_gpu_buffer_p = NULL; printf("cuda fini done\n"); } return OPAL_SUCCESS; diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h index 49060bde8d1..8ae90cde92f 100644 --- a/opal/datatype/opal_datatype_gpu.h +++ b/opal/datatype/opal_datatype_gpu.h @@ -26,10 +26,20 @@ extern int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t uint32_t* out_size, size_t* max_data ); +extern int32_t (*opal_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + extern int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +extern int32_t (*opal_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); extern void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -52,4 +62,8 @@ extern void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, extern void (*opal_cuda_sync_device_p)(void); extern unsigned char* (*opal_cuda_get_gpu_pack_buffer_p)(void); + +extern void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); + +extern void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); #endif /* OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index a9aaa6541d7..7ddefdd1728 100644 --- a/opal/datatype/opal_datatype_pack.c +++ 
b/opal/datatype/opal_datatype_pack.c @@ -412,9 +412,24 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - if (opal_generic_simple_pack_function_cuda_iov_p != NULL) { - return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); - + dt_stack_t* pStack; + uint32_t pos_desc; + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + + description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + pElem = &(description[pos_desc]); + + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + if (opal_generic_simple_pack_function_cuda_vector_p != NULL) { + return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + } + } else { + if (opal_generic_simple_pack_function_cuda_iov_p != NULL) { + return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); + } } return 0; } diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index cad655000d6..ff8dae77971 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -599,9 +599,24 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - if (opal_generic_simple_unpack_function_cuda_iov_p != NULL) { - return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); - + dt_stack_t* pStack; + uint32_t pos_desc; + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + + description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + pElem = &(description[pos_desc]); + + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + if (opal_generic_simple_unpack_function_cuda_vector_p != NULL) { + return 
(*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + } + } else { + if (opal_generic_simple_unpack_function_cuda_iov_p != NULL) { + return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); + } } return 0; } diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 691af933d14..431610ff17f 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -188,6 +188,8 @@ typedef uint8_t mca_btl_base_tag_t; #define MCA_BTL_TAG_IB (MCA_BTL_TAG_BTL + 0) #define MCA_BTL_TAG_UDAPL (MCA_BTL_TAG_BTL + 1) #define MCA_BTL_TAG_SMCUDA (MCA_BTL_TAG_BTL + 2) +#define MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK (MCA_BTL_TAG_BTL + 3) +#define MCA_BTL_TAG_SMCUDA_DATATYPE_PACK (MCA_BTL_TAG_BTL + 4) /* prefered protocol */ #define MCA_BTL_FLAGS_SEND 0x0001 diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 2e42d4babc8..3a711e40cdf 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1131,21 +1131,15 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; - printf("RGET NOT IMPLEMENT YET!!!!!!!!!!!!!!\n"); - struct iovec iov; - uint32_t iov_count = 1; - iov.iov_base = remote_memory_address; - iov.iov_len = size; - int rc; - size_t max_data = size; + printf("RECEIVE REGT!!!!!!!!!!!\n"); + struct opal_convertor_t *convertor = &(recvreq->req_recv.req_base.req_convertor); - // uint64_t *event = &convertor->pipeline_event[0]; - // mca_common_cuda_openeventhandle(&event, 0, (mca_mpool_common_cuda_reg_data_t*)remote_handle); - // if (mca_common_cuda_query_event(event) == OPAL_SUCCESS){ - // printf("get event\n"); - rc = opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); - done = 1; - // } + size_t pipeline_size = 
remote_handle->reg_data.pipeline_size; + uint32_t lindex = remote_handle->reg_data.lindex; + printf("i receive pipeline %ld, lindex %d\n", pipeline_size, lindex); + convertor->gpu_buffer_ptr = remote_memory_address; + mca_btl_smcuda_cuda_dt_clone(convertor, ep, local_address, local_handle, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex); + done = 0; } else { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, @@ -1251,6 +1245,90 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b } +int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq) +{ + mca_btl_smcuda_frag_t* frag; + int rc; + cuda_dt_hdr_t cuda_dt_hdr; + + /* allocate a fragment, giving up if we can't get one */ + MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); + if( OPAL_UNLIKELY(NULL == frag) ) { + return OPAL_ERR_OUT_OF_RESOURCE;; + } + + /* Fill in fragment fields. */ + frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + cuda_dt_hdr.seq = seq; + cuda_dt_hdr.lindex = lindex; + memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); + + rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); + return rc; +} + +int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq) +{ + mca_btl_smcuda_frag_t* frag; + int rc; + cuda_dt_hdr_t cuda_dt_hdr; + + /* allocate a fragment, giving up if we can't get one */ + MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); + if( OPAL_UNLIKELY(NULL == frag) ) { + return OPAL_ERR_OUT_OF_RESOURCE;; + } + + /* Fill in fragment fields. 
*/ + frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + cuda_dt_hdr.seq = seq; + cuda_dt_hdr.lindex = lindex; + memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); + + rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_PACK); + return rc; +} + +int mca_btl_smcuda_alloc_cuda_dt_clone(void) +{ + int i; + for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { + if (smcuda_dt_clone[i].lindex == -1) { + return i; + } + } + return -1; +} + +void mca_btl_smcuda_free_cuda_dt_clone(int lindex) +{ + assert(smcuda_dt_clone[lindex].lindex == lindex); + smcuda_dt_clone[lindex].lindex = -1; +} + +void mca_btl_smcuda_cuda_dt_clone(struct opal_convertor_t *convertor, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_completion_fn_t cbfunc, + void *cbcontext, + void *cbdata, + size_t pipeline_size, + int lindex) +{ + smcuda_dt_clone[lindex].convertor = convertor; + smcuda_dt_clone[lindex].endpoint = endpoint; + smcuda_dt_clone[lindex].local_address = local_address; + smcuda_dt_clone[lindex].local_handle = local_handle; + smcuda_dt_clone[lindex].cbfunc = cbfunc; + smcuda_dt_clone[lindex].cbcontext = cbcontext; + smcuda_dt_clone[lindex].cbdata = cbdata; + smcuda_dt_clone[lindex].pipeline_size = pipeline_size; + smcuda_dt_clone[lindex].lindex = lindex; +} + #endif /* OPAL_CUDA_SUPPORT */ /** diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 7c9d30faded..3e9f2a46db2 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -508,6 +508,42 @@ enum ipcState { IPC_BAD }; +/* cuda datatype control message */ +typedef struct { + int seq; + int lindex; +} cuda_dt_hdr_t; + +/* package save pack/unpack convertor and cbfunc */ +typedef struct { + struct opal_convertor_t *convertor; + struct mca_btl_base_endpoint_t *endpoint; + void *local_address; + struct 
mca_btl_base_registration_handle_t *local_handle; + mca_btl_base_completion_fn_t cbfunc; + void *cbcontext; + void *cbdata; + size_t pipeline_size; + int lindex; +} cuda_dt_clone_t; + +#define SMCUDA_DT_CLONE_SIZE 20 +extern cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; + +int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); +int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); +int mca_btl_smcuda_alloc_cuda_dt_clone(void); +void mca_btl_smcuda_free_cuda_dt_clone(int lindex); +void mca_btl_smcuda_cuda_dt_clone(struct opal_convertor_t *convertor, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_completion_fn_t cbfunc, + void *cbcontext, + void *cbdata, + size_t pipeline_size, + int lindex); + #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index dcbf0ec5180..727308c1df9 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -54,6 +54,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" +#include "opal/datatype/opal_datatype_gpu.h" #endif /* OPAL_CUDA_SUPPORT */ #if OPAL_ENABLE_FT_CR == 1 #include "opal/runtime/opal_cr.h" @@ -846,6 +847,62 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl, } } +cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; + +static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, void* cbdata) +{ + cuda_dt_hdr_t cuda_dt_hdr; + mca_btl_base_segment_t* segments = des->des_segments; + memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); + int seq = cuda_dt_hdr.seq; + int lindex = cuda_dt_hdr.lindex; + cuda_dt_clone_t 
*my_cuda_dt_clone = &smcuda_dt_clone[lindex]; + + assert(my_cuda_dt_clone->lindex == lindex); + + printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); + + if (seq == -2) { + mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t)my_cuda_dt_clone->cbfunc; + cbfunc(btl, my_cuda_dt_clone->endpoint, my_cuda_dt_clone->local_address, my_cuda_dt_clone->local_handle, my_cuda_dt_clone->cbcontext, my_cuda_dt_clone->cbdata, OPAL_SUCCESS); + mca_btl_smcuda_free_cuda_dt_clone(lindex); + } else if (seq == -1) { + mca_btl_smcuda_send_cuda_pack_sig(btl, my_cuda_dt_clone->endpoint, lindex, -1); + } else { + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data; + struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; + iov.iov_base = convertor->gpu_buffer_ptr + seq * my_cuda_dt_clone->pipeline_size; + max_data = my_cuda_dt_clone->pipeline_size; + iov.iov_len = my_cuda_dt_clone->pipeline_size; + opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + } + +} + +static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, void* cbdata) +{ + cuda_dt_hdr_t cuda_dt_hdr; + mca_btl_base_segment_t* segments = des->des_segments; + memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); + int seq = cuda_dt_hdr.seq; + int lindex = cuda_dt_hdr.lindex; + cuda_dt_clone_t *my_cuda_dt_clone = &smcuda_dt_clone[lindex]; + + printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); + + if (seq == -1) { + mca_btl_smcuda_send_cuda_unpack_sig(btl, my_cuda_dt_clone->endpoint, lindex, -2); + opal_cuda_free_gpu_buffer_p(my_cuda_dt_clone->convertor->gpu_buffer_ptr, 0); + mca_btl_smcuda_free_cuda_dt_clone(lindex); + } +} + #endif /* OPAL_CUDA_SUPPORT */ /* @@ -960,6 +1017,14 @@ mca_btl_smcuda_component_init(int *num_btls, /* Register a 
smcuda control function to help setup IPC support */ mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbfunc = btl_smcuda_control; mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbdata = NULL; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK].cbfunc = btl_smcuda_datatype_unpack; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK].cbdata = NULL; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbfunc = btl_smcuda_datatype_pack; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbdata = NULL; + + for (int i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { + smcuda_dt_clone[i].lindex = -1; + } #endif /* OPAL_CUDA_SUPPORT */ return btls; diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index e0a80ef4ac2..d37f6656d2c 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -1652,10 +1652,11 @@ int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag) { int mca_common_cuda_geteventhandle(uint64_t **event, int n, mca_mpool_base_registration_t *newreg) { - // CUipcEventHandle evtHandle; - // mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)newreg; - // mca_common_cuda_construct_event_and_handle(event, (void**)&evtHandle); - // memcpy(&cuda_reg->data.pipeline_evtHandle[n], &evtHandle, sizeof(evtHandle)); + CUipcEventHandle evtHandle; + mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)newreg; + // mca_common_cuda_construct_event_and_handle(event, (void**)&evtHandle); +// printf("0 %p, 1 %p\n",&cuda_reg->data.pipeline_evtHandle[0], &cuda_reg->data.pipeline_evtHandle[EVTHANDLE_SIZE]); + // memcpy(&cuda_reg->data.pipeline_evtHandle[n*EVTHANDLE_SIZE], &evtHandle, sizeof(evtHandle)); return OPAL_SUCCESS; } @@ -1703,7 +1704,7 @@ int mca_common_cuda_openeventhandle(uint64_t **event, int n, mca_mpool_common_cu // CUipcEventHandle evtHandle; // CUresult result; 
// mca_mpool_common_cuda_reg_data_t *cuda_handle = (mca_mpool_common_cuda_reg_data_t*)handle; - // memcpy(&evtHandle, cuda_handle->pipeline_evtHandle[n], sizeof(evtHandle)); + // memcpy(&evtHandle, &cuda_handle->pipeline_evtHandle[n*EVTHANDLE_SIZE], sizeof(evtHandle)); // result = cuFunc.cuIpcOpenEventHandle((CUevent *)event, evtHandle); // if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { // opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenEventHandle failed", diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index da6b86d2464..0b5a724d9dc 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -28,7 +28,9 @@ #define MEMHANDLE_SIZE 8 #define EVTHANDLE_SIZE 8 -typedef uint64_t cuIPCHandle[EVTHANDLE_SIZE]; +typedef struct { + uint64_t evtHandle[EVTHANDLE_SIZE]; +}cuIPCHandle_t; struct mca_mpool_common_cuda_reg_data_t { uint64_t memHandle[MEMHANDLE_SIZE]; @@ -36,8 +38,9 @@ struct mca_mpool_common_cuda_reg_data_t { uint64_t event; opal_ptr_t memh_seg_addr; size_t memh_seg_len; -// cuIPCHandle pipeline_evtHandle[MAX_IPC_EVENT_HANDLE]; - uint32_t pipeline_size; + // uint64_t pipeline_evtHandle[MAX_IPC_EVENT_HANDLE*EVTHANDLE_SIZE]; + size_t pipeline_size; + uint32_t lindex; }; typedef struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; diff --git a/opal/mca/mpool/gpusm/mpool_gpusm_module.c b/opal/mca/mpool/gpusm/mpool_gpusm_module.c index 98740bbdcde..50dcbc859fb 100644 --- a/opal/mca/mpool/gpusm/mpool_gpusm_module.c +++ b/opal/mca/mpool/gpusm/mpool_gpusm_module.c @@ -49,7 +49,7 @@ static void mca_mpool_gpusm_registration_constructor( mca_mpool_gpusm_registration_t *item ) { mca_common_cuda_construct_event_and_handle(&item->event, - (void *)&item->evtHandle); + (void *)item->evtHandle); } /** diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index 6a41001a770..3e6a2a531ff 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -305,11 +305,17 @@ 
local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count #endif send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); goto clean_and_return; } recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); goto clean_and_return; @@ -450,11 +456,17 @@ local_copy_with_convertor_2datatypes_struct( ompi_datatype_t* send_type, int sen #endif send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); goto clean_and_return; } recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { printf( "Unable to create the recv convertor. 
Is the datatype committed ?\n" ); goto clean_and_return; @@ -816,9 +828,9 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); pdt = upper_matrix(4000); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 1; i <= 3; i++) { + for (i = 1; i <= 1; i++) { // local_copy_ddt_count(pdt, 1); - local_copy_with_convertor(pdt, 1, 1024*1024*100, 4000); + local_copy_with_convertor(pdt, 1, 1024*1024*10, 4000); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -959,7 +971,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 , 4000, 384, 512); + local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 , 4000, 384, 512); } } printf( ">>--------------------------------------------<<\n" ); @@ -969,7 +981,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, 4000, 256, 384 ); // ompi_datatype_dump( pdt ); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 0; i < 10; i++) { + for (i = 0; i < 1; i++) { // local_copy_ddt_count(pdt, 1); // local_copy_with_convertor( pdt, 1, 12 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); @@ -978,7 +990,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*10 ); + local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*10, 4000, 256, 384 ); } } printf( ">>--------------------------------------------<<\n" ); From c10d3f482bf0ec74ac9842d3c8f0386e7ce2cc39 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 1 May 2015 19:41:58 -0400 Subject: [PATCH 005/190] fix gpu memory and vector datatype --- 
opal/datatype/cuda/opal_datatype_cuda.cu | 6 +++ .../cuda/opal_datatype_pack_cuda_wrapper.cu | 49 ++++++++++++------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 32 +++++++++--- test/datatype/ddt_test.c | 6 +-- 4 files changed, 66 insertions(+), 27 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 387f75583ce..3ec7b9e53ce 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -129,6 +129,8 @@ static inline void cuda_list_push_head(ddt_cuda_list_t *list, ddt_cuda_buffer_t item->next = orig_head; if (orig_head == NULL) { list->tail = item; + } else { + orig_head->prev = item; } list->nb_elements ++; } @@ -141,6 +143,8 @@ static inline void cuda_list_push_tail(ddt_cuda_list_t *list, ddt_cuda_buffer_t item->prev = orig_tail; if (orig_tail == NULL) { list->head = item; + } else { + orig_tail->next = item; } list->nb_elements ++; } @@ -219,10 +223,12 @@ void opal_datatype_cuda_init(void) p->gpu_addr = gpu_ptr; cuda_device[i].buffer_free.head = p; cuda_device[i].buffer_free.tail = cuda_device[i].buffer_free.head; + cuda_device[i].buffer_free.nb_elements = 1; cuda_device[i].buffer_used.head = NULL; cuda_device[i].buffer_used.tail = NULL; cuda_device[i].buffer_used_size = 0; + cuda_device[i].buffer_used.nb_elements = 0; } cudaMalloc((void **)&cuda_desc_d, sizeof(ddt_cuda_desc_t)); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index a5963b74d3f..636e413bc21 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -184,6 +184,8 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert size_t iov_len_local; uint32_t iov_count; uint8_t transfer_required; + uint8_t free_required; + uint32_t count_desc_tmp; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, 
end_total; @@ -216,32 +218,42 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { - if ((iov[0].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { - // if (iov[0].iov_len == 0) { - // buffer_size = DT_CUDA_BUFFER_SIZE; - // } else { - // buffer_size = iov[0].iov_len; - // } - pConvertor->gpu_buffer_ptr = ddt_cuda_pack_buffer; + if ((iov[iov_count].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + if (iov[iov_count].iov_len == 0) { + iov_len_local = DT_CUDA_BUFFER_SIZE; + } else { + iov_len_local = iov[iov_count].iov_len; + } if (iov[iov_count].iov_base == NULL) { - iov[iov_count].iov_base = ddt_cuda_pack_buffer; - iov_ptr = ddt_cuda_pack_buffer; + iov[iov_count].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + iov_ptr = (unsigned char *)iov[iov_count].iov_base; + free_required = 1; } else { iov_ptr = (unsigned char *)iov[iov_count].iov_base; + free_required = 0; } transfer_required = 0; + pConvertor->gpu_buffer_ptr = iov_ptr; } else { - pConvertor->gpu_buffer_ptr = NULL; + iov_len_local = iov[iov_count].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + } transfer_required = 1; + free_required = 1; + iov_ptr = pConvertor->gpu_buffer_ptr; } - iov_ptr = ddt_cuda_pack_buffer; - iov_len_local = iov[iov_count].iov_len; printf("original local %d\n", iov_len_local); while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ /* should not go into here */ + pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ @@ -279,7 +291,6 @@ int32_t 
opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); - count_desc = 0; if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -291,7 +302,8 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert pStack->disp + local_disp); pos_desc++; update_loop_description: /* update the current state */ - conv_ptr = pConvertor->pBaseBuf + pStack->disp; + // conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc_tmp = count_desc; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } @@ -304,7 +316,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert GET_TIME(start); #endif if (transfer_required) { - cudaMemcpy(iov[iov_count].iov_base, ddt_cuda_pack_buffer, total_packed, cudaMemcpyDeviceToHost); + cudaMemcpy(iov[iov_count].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -314,11 +326,14 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } *max_data = total_packed; pConvertor->bConverted += total_packed; /* update the already converted bytes */ - pConvertor->bConverted = pConvertor->local_size; *out_size = iov_count; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; printf("total packed %lu\n", pConvertor->bConverted); + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } return 1; } /* Save the global position for the next round */ @@ -598,7 +613,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor 
GET_TIME(start); #endif if (transfer_required) { - cudaMemcpy(iov[0].iov_base, destination_tmp, total_packed, cudaMemcpyDeviceToHost); + cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index e1f96ea6a2f..fd4fec00a73 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -121,6 +121,8 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv unsigned char *conv_ptr, *iov_ptr; size_t iov_len_local; uint32_t iov_count; + uint8_t free_required; + uint32_t count_desc_tmp; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -151,14 +153,19 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { - iov_ptr = ddt_cuda_unpack_buffer; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { iov_ptr = (unsigned char*)iov[iov_count].iov_base; - } else { + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); + } + iov_ptr = pConvertor->gpu_buffer_ptr; cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); + free_required = 1; } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -173,6 +180,11 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ /* should not go to here */ + 
pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ @@ -209,7 +221,6 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); - count_desc = 0; if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -221,7 +232,8 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv pStack->disp + local_disp); pos_desc++; update_loop_description: /* update the current state */ - conv_ptr = pConvertor->pBaseBuf + pStack->disp; + // conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc_tmp = count_desc; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } @@ -234,10 +246,13 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; - pConvertor->bConverted = pConvertor->local_size; if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; printf("total unpacked %lu\n", pConvertor->bConverted); + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } return 1; } /* Save the global position for the next round */ @@ -506,10 +521,13 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; unpack_contiguous_loop_cuda_kernel_global<<<192, 
4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); - - *(DESTINATION) = _destination - _end_loop->first_elem_disp; + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(SOURCE) = *(SOURCE) + _copy_loops * _end_loop->size; *(SPACE) -= _copy_loops * _end_loop->size; *(COUNT) -= _copy_loops; +#endif cudaDeviceSynchronize(); } diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index 3e6a2a531ff..98aa6f1347a 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -830,7 +830,7 @@ int main( int argc, char* argv[] ) if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { // local_copy_ddt_count(pdt, 1); - local_copy_with_convertor(pdt, 1, 1024*1024*10, 4000); + // local_copy_with_convertor(pdt, 1, 1024*1024*10, 4000); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -971,7 +971,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 , 4000, 384, 512); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 , 4000, 384, 512); } } printf( ">>--------------------------------------------<<\n" ); @@ -990,7 +990,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*10, 4000, 256, 384 ); + local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*2000, 4000, 256, 384 ); } } printf( ">>--------------------------------------------<<\n" ); From 0fda4dfd2c281a780d0943aaadfeb8ef1e2f55c3 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 7 May 2015 00:43:19 -0400 Subject: [PATCH 
006/190] unrestricted GPU. Instead of forcing everything to go on device 0, we now use the devices already opened. --- opal/datatype/cuda/opal_datatype_cuda.cu | 30 ++++++++----------- .../cuda/opal_datatype_cuda_internal.cuh | 1 - .../cuda/opal_datatype_pack_cuda_wrapper.cu | 2 +- opal/datatype/opal_datatype_cuda.c | 9 ++---- 4 files changed, 16 insertions(+), 26 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 3ec7b9e53ce..8451b143487 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -201,10 +201,15 @@ static inline void cuda_list_item_merge_by_addr(ddt_cuda_list_t *list) void opal_datatype_cuda_init(void) { uint32_t i; - - int device = OPAL_GPU_INDEX; - cudaSetDevice(device); - + int device; + cudaError res; + + res = cudaGetDevice(&device); + if( cudaSuccess != res ) { + opal_cuda_output(0, "Cannot retrieve the device being used. Drop CUDA support!\n"); + return; + } + cuda_free_list = init_cuda_free_list(); /* init device */ @@ -245,10 +250,8 @@ void opal_datatype_cuda_init(void) cudaMalloc((void **)(&ddt_cuda_pack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); printf("malloc cuda packing buffer, %p\n", ddt_cuda_pack_buffer); - cudaMemset(ddt_cuda_pack_buffer, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); cudaMalloc((void **)(&ddt_cuda_unpack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); printf("malloc cuda unpacking buffer, %p\n", ddt_cuda_unpack_buffer); - cudaMemset(ddt_cuda_unpack_buffer, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); cuda_desc_h->iov[0].iov_base = ddt_cuda_pack_buffer; cuda_desc_h->iov[0].iov_len = DT_CUDA_BUFFER_SIZE; @@ -285,8 +288,6 @@ void opal_datatype_cuda_init(void) // ALIGNMENT_DOUBLE = sizeof(double); // ALIGNMENT_FLOAT = sizeof(float); // ALIGNMENT_CHAR = sizeof(char); - - } void opal_datatype_cuda_fini(void) @@ -344,18 +345,11 @@ int32_t opal_cuda_is_gpu_buffer(const void *ptr) if (res != CUDA_SUCCESS) { /* If we cannot determine 
it is device pointer, * just assume it is not. */ - printf("!!!!!!!is gpu buffer error\n"); - return 0; - } - if (memType == CU_MEMORYTYPE_DEVICE) { - return 1; - } else if (memType == CU_MEMORYTYPE_HOST){ - return 0; - } else if (memType == 0) { - return 0; - } else { + printf("!!!!!!! %p is not a gpu buffer. Take no-CUDA path!\n", ptr); return 0; } + /* Anything but CU_MEMORYTYPE_DEVICE is not a GPU memory */ + return (memType == CU_MEMORYTYPE_DEVICE) ? 1 : 0; } unsigned char* opal_cuda_get_gpu_pack_buffer() diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 567e81218ec..e9359209c01 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -23,7 +23,6 @@ #define THREAD_PER_BLOCK 32 #define CUDA_WARP_SIZE 32 #define TASK_PER_THREAD 2 -#define OPAL_GPU_INDEX 0 #define NB_STREAMS 4 #define CUDA_NB_IOV 4096 #define CUDA_IOV_LEN 1024*1204 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 636e413bc21..b55c59a5c1e 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -462,7 +462,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor DT_CUDA_DEBUG ( opal_cuda_output(0, "Pack GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - cuda_iov_count = 1000; + cuda_iov_count = CUDA_NB_IOV; total_packed = 0; total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index caaab68208d..e09618e747b 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -80,9 +80,8 @@ bool opal_cuda_check_bufs(char *dest, char *src) if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) { return 
true; - } else { - return false; } + return false; } /* @@ -109,9 +108,8 @@ void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_ opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, (int)size); abort(); - } else { - return dest; } + return dest; } /* @@ -127,9 +125,8 @@ void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size) opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, (int)size); abort(); - } else { - return dest; } + return dest; } /* From bc0e1047f5be57f5c678c6e5b0f8795044f2eab4 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 18 Jun 2015 11:17:30 -0400 Subject: [PATCH 007/190] Using globally defined indexes lead to several synchronization issues, when 2 peers were doing a send/recv or when multiple senders were targetting the same receiver. Rolf provided a patch to solve this issue, by moving the IPC communication index from a global location onto each endpoint. 
--- ompi/mca/pml/ob1/pml_ob1_cuda.c | 4 +- opal/mca/btl/smcuda/btl_smcuda.c | 88 ++++++++++++++++------ opal/mca/btl/smcuda/btl_smcuda.h | 33 +++++--- opal/mca/btl/smcuda/btl_smcuda_component.c | 32 ++++---- opal/mca/btl/smcuda/btl_smcuda_endpoint.h | 2 + 5 files changed, 109 insertions(+), 50 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 97383e008ee..2575228d019 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -131,10 +131,10 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, /* because pack may not use the whole pipeline size */ rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); pipeline_size = max_data; - int lindex = mca_btl_smcuda_alloc_cuda_dt_clone(); + int lindex = mca_btl_smcuda_alloc_cuda_dt_pack_clone(bml_btl->btl_endpoint); assert(lindex >= 0); mca_pml_ob1_rdma_cuda_btl_register_events(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, pipeline_size, lindex); - mca_btl_smcuda_cuda_dt_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, pipeline_size, lindex); + mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, pipeline_size, lindex); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 3a711e40cdf..4814b6c996a 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -491,6 +491,10 @@ create_sm_endpoint(int local_proc, struct opal_proc_t *proc) ep->mpool = mca_mpool_base_module_create("rgpusm", NULL, &resources); + for (int i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { + ep->smcuda_dt_pack_clone[i].lindex = -1; + ep->smcuda_dt_unpack_clone[i].lindex = -1; + } } #endif /* OPAL_CUDA_SUPPORT */ return ep; @@ -1138,7 +1142,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, uint32_t lindex 
= remote_handle->reg_data.lindex; printf("i receive pipeline %ld, lindex %d\n", pipeline_size, lindex); convertor->gpu_buffer_ptr = remote_memory_address; - mca_btl_smcuda_cuda_dt_clone(convertor, ep, local_address, local_handle, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex); + mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex); done = 0; } else { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; @@ -1291,42 +1295,78 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, return rc; } -int mca_btl_smcuda_alloc_cuda_dt_clone(void) +int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint) +{ + int i; + for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { + if (endpoint->smcuda_dt_pack_clone[i].lindex == -1) { + return i; + } + } + return -1; +} +int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint) { int i; for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - if (smcuda_dt_clone[i].lindex == -1) { + if (endpoint->smcuda_dt_unpack_clone[i].lindex == -1) { return i; } } return -1; } -void mca_btl_smcuda_free_cuda_dt_clone(int lindex) +void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) +{ + assert(endpoint->smcuda_dt_pack_clone[lindex].lindex == lindex); + endpoint->smcuda_dt_pack_clone[lindex].lindex = -1; +} +void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) +{ + assert(endpoint->smcuda_dt_unpack_clone[lindex].lindex == lindex); + endpoint->smcuda_dt_unpack_clone[lindex].lindex = -1; +} + +void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_completion_fn_t cbfunc, + void *cbcontext, + 
void *cbdata, + size_t pipeline_size, + int lindex) { - assert(smcuda_dt_clone[lindex].lindex == lindex); - smcuda_dt_clone[lindex].lindex = -1; + endpoint->smcuda_dt_pack_clone[lindex].convertor = convertor; + endpoint->smcuda_dt_pack_clone[lindex].endpoint = endpoint; + endpoint->smcuda_dt_pack_clone[lindex].local_address = local_address; + endpoint->smcuda_dt_pack_clone[lindex].local_handle = local_handle; + endpoint->smcuda_dt_pack_clone[lindex].cbfunc = cbfunc; + endpoint->smcuda_dt_pack_clone[lindex].cbcontext = cbcontext; + endpoint->smcuda_dt_pack_clone[lindex].cbdata = cbdata; + endpoint->smcuda_dt_pack_clone[lindex].pipeline_size = pipeline_size; + endpoint->smcuda_dt_pack_clone[lindex].lindex = lindex; } -void mca_btl_smcuda_cuda_dt_clone(struct opal_convertor_t *convertor, - struct mca_btl_base_endpoint_t *endpoint, - void *local_address, - struct mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_completion_fn_t cbfunc, - void *cbcontext, - void *cbdata, - size_t pipeline_size, - int lindex) +void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_completion_fn_t cbfunc, + void *cbcontext, + void *cbdata, + size_t pipeline_size, + int lindex) { - smcuda_dt_clone[lindex].convertor = convertor; - smcuda_dt_clone[lindex].endpoint = endpoint; - smcuda_dt_clone[lindex].local_address = local_address; - smcuda_dt_clone[lindex].local_handle = local_handle; - smcuda_dt_clone[lindex].cbfunc = cbfunc; - smcuda_dt_clone[lindex].cbcontext = cbcontext; - smcuda_dt_clone[lindex].cbdata = cbdata; - smcuda_dt_clone[lindex].pipeline_size = pipeline_size; - smcuda_dt_clone[lindex].lindex = lindex; + endpoint->smcuda_dt_unpack_clone[lindex].convertor = convertor; + endpoint->smcuda_dt_unpack_clone[lindex].endpoint = endpoint; + endpoint->smcuda_dt_unpack_clone[lindex].local_address = local_address; 
+ endpoint->smcuda_dt_unpack_clone[lindex].local_handle = local_handle; + endpoint->smcuda_dt_unpack_clone[lindex].cbfunc = cbfunc; + endpoint->smcuda_dt_unpack_clone[lindex].cbcontext = cbcontext; + endpoint->smcuda_dt_unpack_clone[lindex].cbdata = cbdata; + endpoint->smcuda_dt_unpack_clone[lindex].pipeline_size = pipeline_size; + endpoint->smcuda_dt_unpack_clone[lindex].lindex = lindex; } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 3e9f2a46db2..00765f0a276 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -532,17 +532,28 @@ extern cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); -int mca_btl_smcuda_alloc_cuda_dt_clone(void); -void mca_btl_smcuda_free_cuda_dt_clone(int lindex); -void mca_btl_smcuda_cuda_dt_clone(struct opal_convertor_t *convertor, - struct mca_btl_base_endpoint_t *endpoint, - void *local_address, - struct mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_completion_fn_t cbfunc, - void *cbcontext, - void *cbdata, - size_t pipeline_size, - int lindex); +int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint); +int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint); +void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); +void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); +void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + 
mca_btl_base_completion_fn_t cbfunc, + void *cbcontext, + void *cbdata, + size_t pipeline_size, + int lindex); +void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_completion_fn_t cbfunc, + void *cbcontext, + void *cbdata, + size_t pipeline_size, + int lindex); #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 727308c1df9..f035578bd5d 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -847,29 +847,32 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl, } } -cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; - static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata) { + struct mca_btl_base_endpoint_t *endpoint; cuda_dt_hdr_t cuda_dt_hdr; mca_btl_base_segment_t* segments = des->des_segments; memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); int seq = cuda_dt_hdr.seq; int lindex = cuda_dt_hdr.lindex; - cuda_dt_clone_t *my_cuda_dt_clone = &smcuda_dt_clone[lindex]; - + mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; + cuda_dt_clone_t *my_cuda_dt_clone; + + /* We can find the endoint back from the rank embedded in the header */ + endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; + my_cuda_dt_clone = &endpoint->smcuda_dt_unpack_clone[lindex]; assert(my_cuda_dt_clone->lindex == lindex); printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); if (seq == -2) { mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t)my_cuda_dt_clone->cbfunc; - cbfunc(btl, my_cuda_dt_clone->endpoint, my_cuda_dt_clone->local_address, 
my_cuda_dt_clone->local_handle, my_cuda_dt_clone->cbcontext, my_cuda_dt_clone->cbdata, OPAL_SUCCESS); - mca_btl_smcuda_free_cuda_dt_clone(lindex); + cbfunc(btl, endpoint, my_cuda_dt_clone->local_address, my_cuda_dt_clone->local_handle, my_cuda_dt_clone->cbcontext, my_cuda_dt_clone->cbdata, OPAL_SUCCESS); + mca_btl_smcuda_free_cuda_dt_unpack_clone(endpoint, lindex); } else if (seq == -1) { - mca_btl_smcuda_send_cuda_pack_sig(btl, my_cuda_dt_clone->endpoint, lindex, -1); + mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, lindex, -1); } else { struct iovec iov; uint32_t iov_count = 1; @@ -887,19 +890,25 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata) { + struct mca_btl_base_endpoint_t *endpoint; cuda_dt_hdr_t cuda_dt_hdr; mca_btl_base_segment_t* segments = des->des_segments; memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); int seq = cuda_dt_hdr.seq; int lindex = cuda_dt_hdr.lindex; - cuda_dt_clone_t *my_cuda_dt_clone = &smcuda_dt_clone[lindex]; + mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; + cuda_dt_clone_t *my_cuda_dt_clone; + + /* We can find the endoint back from the rank embedded in the header */ + endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; + my_cuda_dt_clone = &endpoint->smcuda_dt_pack_clone[lindex]; printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); if (seq == -1) { mca_btl_smcuda_send_cuda_unpack_sig(btl, my_cuda_dt_clone->endpoint, lindex, -2); opal_cuda_free_gpu_buffer_p(my_cuda_dt_clone->convertor->gpu_buffer_ptr, 0); - mca_btl_smcuda_free_cuda_dt_clone(lindex); + mca_btl_smcuda_free_cuda_dt_pack_clone(my_cuda_dt_clone->endpoint, lindex); } } @@ -1021,10 +1030,7 @@ mca_btl_smcuda_component_init(int *num_btls, mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK].cbdata = NULL; 
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbfunc = btl_smcuda_datatype_pack; mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbdata = NULL; - - for (int i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - smcuda_dt_clone[i].lindex = -1; - } + #endif /* OPAL_CUDA_SUPPORT */ return btls; diff --git a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h index cead5ec7a5c..e4df5ee56d0 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h +++ b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h @@ -49,6 +49,8 @@ struct mca_btl_base_endpoint_t { opal_proc_t *proc_opal; /**< Needed for adding CUDA IPC support dynamically */ enum ipcState ipcstate; /**< CUDA IPC connection status */ int ipctries; /**< Number of times CUDA IPC connect was sent */ + cuda_dt_clone_t smcuda_dt_pack_clone[SMCUDA_DT_CLONE_SIZE]; + cuda_dt_clone_t smcuda_dt_unpack_clone[SMCUDA_DT_CLONE_SIZE]; #endif /* OPAL_CUDA_SUPPORT */ }; From 6fda03667822e0675769bf22e65e9fb168198a29 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 18 Jun 2015 15:15:47 -0400 Subject: [PATCH 008/190] Generate the Makefile. It will now be placed in the bindir and will be populated with all the known information. Beware: one still has to manually set the CUDA lib and path as they are not available after configure (unlike the include which is). 
Conflicts: opal/datatype/cuda/Makefile --- configure.ac | 4 +++ opal/datatype/cuda/Makefile | 40 ------------------------ opal/datatype/cuda/Makefile.in | 57 ++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 40 deletions(-) delete mode 100644 opal/datatype/cuda/Makefile create mode 100644 opal/datatype/cuda/Makefile.in diff --git a/configure.ac b/configure.ac index 1d754232d3c..fef9e831c23 100644 --- a/configure.ac +++ b/configure.ac @@ -1361,6 +1361,10 @@ m4_ifdef([project_oshmem], opal_show_subtitle "Final output" +if test "$OPAL_cuda_support" != "0"; then + AC_CONFIG_FILES([opal/datatype/cuda/Makefile]) +fi + AC_CONFIG_FILES([ Makefile diff --git a/opal/datatype/cuda/Makefile b/opal/datatype/cuda/Makefile deleted file mode 100644 index e76f160fb88..00000000000 --- a/opal/datatype/cuda/Makefile +++ /dev/null @@ -1,40 +0,0 @@ -CC = gcc -NVCC = nvcc -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib -STLIB ?= opal_datatype_cuda.a -DYLIB ?= opal_datatype_cuda.so -CFLAGS = -g -G -O0 -EXTLIB = -L/home/wwu12/ompi/ompi-gpu/opal/datatype/.libs -ldatatype -L/usr/lib64 -lcuda -INC = - -SRC := \ - opal_datatype_cuda.cu \ - opal_datatype_pack_cuda_kernel.cu \ - opal_datatype_pack_cuda_wrapper.cu \ - opal_datatype_unpack_cuda_kernel.cu \ - opal_datatype_unpack_cuda_wrapper.cu \ - -OBJ := $(SRC:.cu=.o) - -.PHONY: all clean cleanall - -all: $(STLIB) $(DYLIB) - -$(STLIB): $(OBJ) - $(ARCH) $(ARCHFLAGS) $@ $(OBJ) - $(RANLIB) $@ - -$(DYLIB): $(OBJ) - $(NVCC) $(CFLAGS) $(EXTLIB) -shared --compiler-options '-fPIC' -o $(DYLIB) $(OBJ) - -%.o: %.cu - $(NVCC) $(CFLAGS) $(EXTLIB) -gencode arch=compute_35,code=sm_35 $(INC) -c --compiler-options '-fPIC' $< -o $@ - -clean: - rm -f *.o - -cleanall: clean - rm -f $(STLIB) - rm -f $(DYLIB) diff --git a/opal/datatype/cuda/Makefile.in b/opal/datatype/cuda/Makefile.in new file mode 100644 index 00000000000..519de6100ae --- /dev/null +++ b/opal/datatype/cuda/Makefile.in @@ -0,0 +1,57 @@ +@SET_MAKE@ + +AM_CPPFLAGS = 
@common_cuda_CPPFLAGS@ +srcdir = @srcdir@ +top_builddir = @top_builddir@ +VPATH = @srcdir@ + +NVCC = nvcc +ARCH = ar +ARCHFLAGS = cr +STLIB ?= opal_datatype_cuda.a +DYLIB ?= opal_datatype_cuda.so +EXTLIB = -L$(top_builddir)/opal/datatype/.libs -ldatatype -L$(top_builddir)/opal/.libs -lopen-pal -L/usr/local/cuda/lib -lcuda +subdir = opal/datatype/cuda + +CC = nvcc +CFLAGS = -gencode arch=compute_35,code=sm_35 --compiler-options '-fPIC @CFLAGS@' +LDFLAGS += -shared --compiler-options '-fPIC @LDFLAGS@' + +SRC := \ + opal_datatype_cuda.cu \ + opal_datatype_pack_cuda_kernel.cu \ + opal_datatype_pack_cuda_wrapper.cu \ + opal_datatype_unpack_cuda_kernel.cu \ + opal_datatype_unpack_cuda_wrapper.cu + +OBJ := $(SRC:.cu=.o) + +.PHONY: all clean cleanall + +all: Makefile $(STLIB) $(DYLIB) + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(STLIB): $(OBJ) + $(ARCH) $(ARCHFLAGS) $@ $(OBJ) + @RANLIB@ $@ + +$(DYLIB): $(OBJ) + $(NVCC) $(LDFLAGS) $(EXTLIB) -o $(DYLIB) $(OBJ) + +%.o: %.cu + $(NVCC) $(CFLAGS) $(EXTLIB) $(INC) -c $< -o $@ + +clean: + rm -f *.o + +cleanall: clean + rm -f $(STLIB) + rm -f $(DYLIB) From 742992a60566bc427d64ec1fb0d8d694ad1b7faa Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 18 Jun 2015 19:47:14 -0400 Subject: [PATCH 009/190] This file was certainly not supposed to be here. There is NO valid reason to have a copy of a locally generated file in the source. 
--- opal/datatype/cuda/opal_config.h | 2863 ------------------------------ 1 file changed, 2863 deletions(-) delete mode 100644 opal/datatype/cuda/opal_config.h diff --git a/opal/datatype/cuda/opal_config.h b/opal/datatype/cuda/opal_config.h deleted file mode 100644 index d23f071a86a..00000000000 --- a/opal/datatype/cuda/opal_config.h +++ /dev/null @@ -1,2863 +0,0 @@ -/* opal/include/opal_config.h. Generated from opal_config.h.in by configure. */ -/* opal/include/opal_config.h.in. Generated from configure.ac by autoheader. */ - -/* -*- c -*- - * - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * Function: - OS, CPU and compiler dependent configuration - */ - -#ifndef OPAL_CONFIG_H -#define OPAL_CONFIG_H - -//#include "opal_config_top.h" - - - -/* Define if building universal (internal helper macro) */ -/* #undef AC_APPLE_UNIVERSAL_BUILD */ - -/* enable openib BTL failover */ -#define BTL_OPENIB_FAILOVER_ENABLED 0 - -/* Whether the openib BTL malloc hooks are enabled */ -#define BTL_OPENIB_MALLOC_HOOKS_ENABLED 1 - -/* rdmacm without IB_AF addressing support */ -/* #undef BTL_OPENIB_RDMACM_IB_ADDR */ - -/* BLCR cr_request_file check */ -/* #undef CRS_BLCR_HAVE_CR_REQUEST */ - -/* BLCR cr_request_checkpoint check */ -/* #undef CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT */ - -/* BLCRs cr_checkpoint_info.requester member availability */ -/* #undef CRS_BLCR_HAVE_INFO_REQUESTER */ - -/* Version of event */ -/* #undef EVENT_EXTERNAL_EVENT_VERSION */ - -/* Define to 1 if you have the header file. */ -#define HAVE_AIO_H 1 - -/* Define to 1 if the linker supports alias attribute. */ -/* #undef HAVE_ALIAS_ATTRIBUTE */ - -/* Define to 1 if you have the header file. */ -#define HAVE_ALLOCA_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_ALPS_APINFO_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_ARPA_INET_H 1 - -/* Define to 1 if you have the `asprintf' function. */ -#define HAVE_ASPRINTF 1 - -/* Set to use c11 atomic functions */ -/* #undef HAVE_ATOMICS */ - -/* Define to 1 if the system has the type `CACHE_DESCRIPTOR'. */ -/* #undef HAVE_CACHE_DESCRIPTOR */ - -/* Define to 1 if the system has the type `CACHE_RELATIONSHIP'. */ -/* #undef HAVE_CACHE_RELATIONSHIP */ - -/* Define to 1 if you have the `clz' function. */ -/* #undef HAVE_CLZ */ - -/* Define to 1 if you have the `clzl' function. */ -/* #undef HAVE_CLZL */ - -/* Define to 1 if you have the header file. */ -#define HAVE_CL_CL_EXT_H 1 - -/* Define to 1 if you have the header file. 
*/ -#define HAVE_COMPLEX_H 1 - -/* Define to 1 if you have the `cpuset_setaffinity' function. */ -/* #undef HAVE_CPUSET_SETAFFINITY */ - -/* Define to 1 if you have the `cpuset_setid' function. */ -/* #undef HAVE_CPUSET_SETID */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_CRIU_CRIU_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_CRT_EXTERNS_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_CTYPE_H 1 - -/* Define to 1 if we have -lcuda */ -/* #undef HAVE_CUDA */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_CUDA_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_CUDA_RUNTIME_API_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_CURL_CURL_H */ - -/* Define to 1 if you have the `dbm_open' function. */ -/* #undef HAVE_DBM_OPEN */ - -/* Define to 1 if you have the `dbopen' function. */ -/* #undef HAVE_DBOPEN */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_DB_H */ - -/* Define to 1 if you have the declaration of `AF_INET6', and to 0 if you - don't. */ -#define HAVE_DECL_AF_INET6 1 - -/* Define to 1 if you have the declaration of `AF_UNSPEC', and to 0 if you - don't. */ -#define HAVE_DECL_AF_UNSPEC 1 - -/* Define to 1 if you have the declaration of `CL_DEVICE_TOPOLOGY_AMD', and to - 0 if you don't. */ -#define HAVE_DECL_CL_DEVICE_TOPOLOGY_AMD 0 - -/* Define to 1 if you have the declaration of `CTL_HW', and to 0 if you don't. - */ -#define HAVE_DECL_CTL_HW 0 - -/* Define to 1 if you have the declaration of `fabsf', and to 0 if you don't. - */ -#define HAVE_DECL_FABSF 1 - -/* Define to 1 if you have the declaration of `HW_NCPU', and to 0 if you - don't. */ -#define HAVE_DECL_HW_NCPU 0 - -/* Define to 1 if you have the declaration of `HZ', and to 0 if you don't. */ -#define HAVE_DECL_HZ 1 - -/* Define to 1 if you have the declaration of `IBV_ACCESS_ALLOCATE_MR', and to - 0 if you don't. 
*/ -/* #undef HAVE_DECL_IBV_ACCESS_ALLOCATE_MR */ - -/* Define to 1 if you have the declaration of - `IBV_ACCESS_SHARED_MR_USER_READ', and to 0 if you don't. */ -/* #undef HAVE_DECL_IBV_ACCESS_SHARED_MR_USER_READ */ - -/* Define to 1 if you have the declaration of `IBV_ACCESS_SO', and to 0 if you - don't. */ -/* #undef HAVE_DECL_IBV_ACCESS_SO */ - -/* Define to 1 if you have the declaration of `IBV_ATOMIC_HCA', and to 0 if - you don't. */ -/* #undef HAVE_DECL_IBV_ATOMIC_HCA */ - -/* Define to 1 if you have the declaration of `IBV_EVENT_CLIENT_REREGISTER', - and to 0 if you don't. */ -/* #undef HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER */ - -/* Define to 1 if you have the declaration of `IBV_EXP_ACCESS_ALLOCATE_MR', - and to 0 if you don't. */ -/* #undef HAVE_DECL_IBV_EXP_ACCESS_ALLOCATE_MR */ - -/* Define to 1 if you have the declaration of - `IBV_EXP_ACCESS_SHARED_MR_USER_READ', and to 0 if you don't. */ -/* #undef HAVE_DECL_IBV_EXP_ACCESS_SHARED_MR_USER_READ */ - -/* Define to 1 if you have the declaration of `IBV_LINK_LAYER_ETHERNET', and - to 0 if you don't. */ -/* #undef HAVE_DECL_IBV_LINK_LAYER_ETHERNET */ - -/* Define to 1 if you have the declaration of `IBV_SRQT_XRC', and to 0 if you - don't. */ -/* #undef HAVE_DECL_IBV_SRQT_XRC */ - -/* Define to 1 if you have the declaration of - `nvmlDeviceGetMaxPcieLinkGeneration', and to 0 if you don't. */ -/* #undef HAVE_DECL_NVMLDEVICEGETMAXPCIELINKGENERATION */ - -/* Define to 1 if you have the declaration of `PCI_LOOKUP_NO_NUMBERS', and to - 0 if you don't. */ -/* #undef HAVE_DECL_PCI_LOOKUP_NO_NUMBERS */ - -/* Define to 1 if you have the declaration of `PF_INET6', and to 0 if you - don't. */ -#define HAVE_DECL_PF_INET6 1 - -/* Define to 1 if you have the declaration of `PF_UNSPEC', and to 0 if you - don't. */ -#define HAVE_DECL_PF_UNSPEC 1 - -/* Define to 1 if you have the declaration of `pthread_getaffinity_np', and to - 0 if you don't. 
*/ -#define HAVE_DECL_PTHREAD_GETAFFINITY_NP 1 - -/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to - 0 if you don't. */ -#define HAVE_DECL_PTHREAD_SETAFFINITY_NP 1 - -/* Define to 1 if you have the declaration of `RLIMIT_AS', and to 0 if you - don't. */ -#define HAVE_DECL_RLIMIT_AS 1 - -/* Define to 1 if you have the declaration of `RLIMIT_CORE', and to 0 if you - don't. */ -#define HAVE_DECL_RLIMIT_CORE 1 - -/* Define to 1 if you have the declaration of `RLIMIT_FSIZE', and to 0 if you - don't. */ -#define HAVE_DECL_RLIMIT_FSIZE 1 - -/* Define to 1 if you have the declaration of `RLIMIT_MEMLOCK', and to 0 if - you don't. */ -#define HAVE_DECL_RLIMIT_MEMLOCK 1 - -/* Define to 1 if you have the declaration of `RLIMIT_NOFILE', and to 0 if you - don't. */ -#define HAVE_DECL_RLIMIT_NOFILE 1 - -/* Define to 1 if you have the declaration of `RLIMIT_NPROC', and to 0 if you - don't. */ -#define HAVE_DECL_RLIMIT_NPROC 1 - -/* Define to 1 if you have the declaration of `RLIMIT_STACK', and to 0 if you - don't. */ -#define HAVE_DECL_RLIMIT_STACK 1 - -/* Define to 1 if you have the declaration of `sbrk', and to 0 if you don't. - */ -#define HAVE_DECL_SBRK 1 - -/* Define to 1 if you have the declaration of `strtoull', and to 0 if you - don't. */ -#define HAVE_DECL_STRTOULL 1 - -/* Define to 1 if you have the declaration of `_SC_LARGE_PAGESIZE', and to 0 - if you don't. */ -#define HAVE_DECL__SC_LARGE_PAGESIZE 0 - -/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_CONF', and to 0 - if you don't. */ -#define HAVE_DECL__SC_NPROCESSORS_CONF 1 - -/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_ONLN', and to 0 - if you don't. */ -#define HAVE_DECL__SC_NPROCESSORS_ONLN 1 - -/* Define to 1 if you have the declaration of `_SC_NPROC_CONF', and to 0 if - you don't. */ -#define HAVE_DECL__SC_NPROC_CONF 0 - -/* Define to 1 if you have the declaration of `_SC_NPROC_ONLN', and to 0 if - you don't. 
*/ -#define HAVE_DECL__SC_NPROC_ONLN 0 - -/* Define to 1 if you have the declaration of `_SC_PAGESIZE', and to 0 if you - don't. */ -#define HAVE_DECL__SC_PAGESIZE 1 - -/* Define to 1 if you have the declaration of `_SC_PAGE_SIZE', and to 0 if you - don't. */ -#define HAVE_DECL__SC_PAGE_SIZE 1 - -/* Define to 1 if you have the declaration of `__func__', and to 0 if you - don't. */ -#define HAVE_DECL___FUNC__ 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_DIRENT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_DLFCN_H 1 - -/* Define to 1 if you have the `dlsym' function. */ -#define HAVE_DLSYM 1 - -/* Define to 1 if the system has the type `double _Complex'. */ -#define HAVE_DOUBLE__COMPLEX 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_ERR_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_EVENT_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_EXECINFO_H 1 - -/* Define to 1 if you have the `execve' function. */ -#define HAVE_EXECVE 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_FCA_API_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_FCNTL_H 1 - -/* Define to 1 if you have the `ffs' function. */ -#define HAVE_FFS 1 - -/* Define to 1 if you have the `ffsl' function. */ -#define HAVE_FFSL 1 - -/* Define to 1 if the system has the type `float _Complex'. */ -#define HAVE_FLOAT__COMPLEX 1 - -/* Define to 1 if you have the `fls' function. */ -/* #undef HAVE_FLS */ - -/* Define to 1 if you have the `flsl' function. */ -/* #undef HAVE_FLSL */ - -/* Define to 1 if you have the `fork' function. */ -#define HAVE_FORK 1 - -/* Define to 1 if you have the `getpagesize' function. */ -#define HAVE_GETPAGESIZE 1 - -/* Define to 1 if you have the `getpwuid' function. */ -#define HAVE_GETPWUID 1 - -/* Define to 1 if you have the `GNI_GetJobResInfo' function. 
*/ -/* #undef HAVE_GNI_GETJOBRESINFO */ - -/* Define to 1 if the system has the type `GROUP_AFFINITY'. */ -/* #undef HAVE_GROUP_AFFINITY */ - -/* Define to 1 if the system has the type `GROUP_RELATIONSHIP'. */ -/* #undef HAVE_GROUP_RELATIONSHIP */ - -/* Define to 1 if you have the header file. */ -#define HAVE_GRP_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_HCOLL_API_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_HOSTLIB_H */ - -/* Define to 1 if you have the `host_info' function. */ -/* #undef HAVE_HOST_INFO */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_HWLOC_H */ - -/* Define to 1 if you have the `ibv_cmd_open_xrcd' function. */ -/* #undef HAVE_IBV_CMD_OPEN_XRCD */ - -/* Define to 1 if you have the `ibv_create_xrc_rcv_qp' function. */ -/* #undef HAVE_IBV_CREATE_XRC_RCV_QP */ - -/* Define to 1 if you have the `ibv_fork_init' function. */ -/* #undef HAVE_IBV_FORK_INIT */ - -/* Define to 1 if you have the `ibv_get_device_list' function. */ -/* #undef HAVE_IBV_GET_DEVICE_LIST */ - -/* Define to 1 if you have the `ibv_resize_cq' function. */ -/* #undef HAVE_IBV_RESIZE_CQ */ - -/* Define to 1 if you have the header file. */ -#define HAVE_IFADDRS_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_INFINIBAND_DRIVER_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_INFINIBAND_VERBS_H */ - -/* Define to 1 if the system has the type `int128_t'. */ -/* #undef HAVE_INT128_T */ - -/* Define to 1 if the system has the type `int16_t'. */ -#define HAVE_INT16_T 1 - -/* Define to 1 if the system has the type `int32_t'. */ -#define HAVE_INT32_T 1 - -/* Define to 1 if the system has the type `int64_t'. */ -#define HAVE_INT64_T 1 - -/* Define to 1 if the system has the type `int8_t'. */ -#define HAVE_INT8_T 1 - -/* Define to 1 if the system has the type `intptr_t'. */ -#define HAVE_INTPTR_T 1 - -/* Define to 1 if you have the header file. 
*/ -#define HAVE_INTTYPES_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_IOLIB_H */ - -/* Define to 1 if you have the `isatty' function. */ -#define HAVE_ISATTY 1 - -/* Define to 1 if the system has the type `KAFFINITY'. */ -/* #undef HAVE_KAFFINITY */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_KNEM_IO_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_KSTAT_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_LIBCR_H */ - -/* Define to 1 if you have the `event' library (-levent). */ -/* #undef HAVE_LIBEVENT */ - -/* Define to 1 if you have the `event_pthreads' library (-levent_pthreads). */ -/* #undef HAVE_LIBEVENT_PTHREADS */ - -/* Define to 1 if we have -lgdi32 */ -/* #undef HAVE_LIBGDI32 */ - -/* Define to 1 if you have the header file. */ -#define HAVE_LIBGEN_H 1 - -/* Define to 1 if we have -lkstat */ -/* #undef HAVE_LIBKSTAT */ - -/* Define to 1 if we have -llgrp */ -/* #undef HAVE_LIBLGRP */ - -/* set to 1 if should use libnl v3, set to 0 for libnl v11 */ -#define HAVE_LIBNL3 0 - -/* Define to 1 if you have the `pci' library (-lpci). */ -/* #undef HAVE_LIBPCI */ - -/* Define to 1 if you have the `psm_infinipath' library (-lpsm_infinipath). */ -/* #undef HAVE_LIBPSM_INFINIPATH */ - -/* Define to 1 if you have the `pthread' library (-lpthread). */ -#define HAVE_LIBPTHREAD 1 - -/* Define to 1 if you have the `rt' library (-lrt). */ -#define HAVE_LIBRT 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_LIBUTIL_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_LIMITS_H 1 - -/* Define to 1 if the system has the type `LOGICAL_PROCESSOR_RELATIONSHIP'. */ -/* #undef HAVE_LOGICAL_PROCESSOR_RELATIONSHIP */ - -/* Define to 1 if the system has the type `long double'. */ -#define HAVE_LONG_DOUBLE 1 - -/* Define to 1 if the system has the type `long double _Complex'. 
*/ -#define HAVE_LONG_DOUBLE__COMPLEX 1 - -/* Define to 1 if the system has the type `long long'. */ -#define HAVE_LONG_LONG 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_LSF_LSBATCH_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_LSF_LSF_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_LTDL_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_LUSTRE_LIBLUSTREAPI_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_MACH_MACH_HOST_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_MACH_MACH_INIT_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_MACH_MACH_TIME_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_MALLOC_H 1 - -/* Define to 1 if you have the `memalign' function. */ -#define HAVE_MEMALIGN 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_MEMORY_H 1 - -/* Define to 1 if you have the `mkfifo' function. */ -#define HAVE_MKFIFO 1 - -/* Define to 1 if you have the `mmap' function. */ -#define HAVE_MMAP 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_MNTENT_H 1 - -/* Define to 1 if the system has the type `mode_t'. */ -#define HAVE_MODE_T 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_MTCP_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_MUNGE_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_MXM_API_MXM_API_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_NDBM_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_NETDB_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_NETINET_IN_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_NETINET_TCP_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_NET_IF_H 1 - -/* Define to 1 if you have the header file. 
*/ -/* #undef HAVE_NET_UIO_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_NUMAIF_H */ - -/* Define to 1 if the system has the type `NUMA_NODE_RELATIONSHIP'. */ -/* #undef HAVE_NUMA_NODE_RELATIONSHIP */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_NVCTRL_NVCTRL_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_NVML_H */ - -/* Define to 1 if you have the `on_exit' function. */ -#define HAVE_ON_EXIT 1 - -/* Define to 1 if you have the `openat' function. */ -#define HAVE_OPENAT 1 - -/* Define to 1 if you have the `openpty' function. */ -#define HAVE_OPENPTY 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_PATHS_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PCI_PCI_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PICL_H */ - -/* Define to 1 if you have the `pipe' function. */ -#define HAVE_PIPE 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PLFS_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PMAPI_H */ - -/* Define to 1 if you have the `pm_cycles' function. */ -/* #undef HAVE_PM_CYCLES */ - -/* Define to 1 if you have the header file. */ -#define HAVE_POLL_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PORTALS4_H */ - -/* Define to 1 if you have the `posix_memalign' function. */ -#define HAVE_POSIX_MEMALIGN 1 - -/* Define to 1 if you have the `printstack' function. */ -/* #undef HAVE_PRINTSTACK */ - -/* Define to 1 if the system has the type `PROCESSOR_CACHE_TYPE'. */ -/* #undef HAVE_PROCESSOR_CACHE_TYPE */ - -/* Define to 1 if the system has the type `PROCESSOR_GROUP_INFO'. */ -/* #undef HAVE_PROCESSOR_GROUP_INFO */ - -/* Define to 1 if the system has the type `PROCESSOR_RELATIONSHIP'. */ -/* #undef HAVE_PROCESSOR_RELATIONSHIP */ - -/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_BLOCK'. 
*/ -/* #undef HAVE_PSAPI_WORKING_SET_EX_BLOCK */ - -/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_INFORMATION'. - */ -/* #undef HAVE_PSAPI_WORKING_SET_EX_INFORMATION */ - -/* libfabric: whether to build the PSM provider or not */ -/* #undef HAVE_PSM */ - -/* libfabric: do not build PSM provider as a DL */ -/* #undef HAVE_PSM_DL */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PSM_H */ - -/* Define to 1 if you have the `pthread_condattr_setpshared' function. */ -#define HAVE_PTHREAD_CONDATTR_SETPSHARED 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_PTHREAD_H 1 - -/* Define to 1 if you have the `pthread_mutexattr_setpshared' function. */ -#define HAVE_PTHREAD_MUTEXATTR_SETPSHARED 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PTHREAD_NP_H */ - -/* Define to 1 if the system has the type `pthread_t'. */ -#define HAVE_PTHREAD_T 1 - -/* Define to 1 if the system has the type `ptrdiff_t'. */ -#define HAVE_PTRDIFF_T 1 - -/* Define to 1 if you have the `ptsname' function. */ -#define HAVE_PTSNAME 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_PTY_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PVFS2_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_PWD_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_RDMA_FABRIC_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_RDMA_RDMA_CMA_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_RDMA_RSOCKET_H */ - -/* Define to 1 if you have the `regcmp' function. */ -/* #undef HAVE_REGCMP */ - -/* Define to 1 if you have the `regexec' function. */ -#define HAVE_REGEXEC 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_REGEX_H 1 - -/* Define to 1 if you have the `regfree' function. */ -#define HAVE_REGFREE 1 - -/* Define to 1 if the system has the type `RelationProcessorPackage'. 
*/ -/* #undef HAVE_RELATIONPROCESSORPACKAGE */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SCHED_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SCIF_H 1 - -/* Define to 1 if you have the `setenv' function. */ -#define HAVE_SETENV 1 - -/* Define to 1 if you have the `setlocale' function. */ -#define HAVE_SETLOCALE 1 - -/* Define to 1 if you have the `setpgid' function. */ -#define HAVE_SETPGID 1 - -/* Define to 1 if you have the `setsid' function. */ -#define HAVE_SETSID 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SHLWAPI_H */ - -/* Define to 1 if `si_band' is a member of `siginfo_t'. */ -#define HAVE_SIGINFO_T_SI_BAND 1 - -/* Define to 1 if `si_fd' is a member of `siginfo_t'. */ -#define HAVE_SIGINFO_T_SI_FD 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SIGNAL_H 1 - -/* Define to 1 if you have the `snprintf' function. */ -#define HAVE_SNPRINTF 1 - -/* Define to 1 if you have the `socketpair' function. */ -#define HAVE_SOCKETPAIR 1 - -/* libfabric: do not build sockets provider */ -/* #undef HAVE_SOCKETS */ - -/* libfabric: do not build sockets provider */ -/* #undef HAVE_SOCKETS_DL */ - -/* Define to 1 if the system has the type `socklen_t'. */ -#define HAVE_SOCKLEN_T 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SOCKLIB_H */ - -/* Define to 1 if the system has the type `ssize_t'. */ -#define HAVE_SSIZE_T 1 - -/* Define to 1 if you have the `statfs' function. */ -#define HAVE_STATFS 1 - -/* Define to 1 if you have the `statvfs' function. */ -#define HAVE_STATVFS 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDARG_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDBOOL_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDDEF_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDINT_H 1 - -/* Define to 1 if you have the header file. 
*/ -#define HAVE_STDLIB_H 1 - -/* Define to 1 if you have the `strftime' function. */ -#define HAVE_STRFTIME 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRINGS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRING_H 1 - -/* Define to 1 if you have the `strncasecmp' function. */ -#define HAVE_STRNCASECMP 1 - -/* Define to 1 if you have the `strncpy_s' function. */ -/* #undef HAVE_STRNCPY_S */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_STROPTS_H */ - -/* Define to 1 if you have the `strsignal' function. */ -#define HAVE_STRSIGNAL 1 - -/* Define to 1 if `d_type' is a member of `struct dirent'. */ -#define HAVE_STRUCT_DIRENT_D_TYPE 1 - -/* Define to 1 if `transport_type' is a member of `struct ibv_device'. */ -/* #undef HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE */ - -/* Define to 1 if `ifr_hwaddr' is a member of `struct ifreq'. */ -#define HAVE_STRUCT_IFREQ_IFR_HWADDR 1 - -/* Define to 1 if `ifr_mtu' is a member of `struct ifreq'. */ -#define HAVE_STRUCT_IFREQ_IFR_MTU 1 - -/* Define to 1 if the system has the type `struct sockaddr_in'. */ -#define HAVE_STRUCT_SOCKADDR_IN 1 - -/* Define to 1 if the system has the type `struct sockaddr_in6'. */ -#define HAVE_STRUCT_SOCKADDR_IN6 1 - -/* Define to 1 if `sa_len' is a member of `struct sockaddr'. */ -/* #undef HAVE_STRUCT_SOCKADDR_SA_LEN */ - -/* Define to 1 if the system has the type `struct sockaddr_storage'. */ -#define HAVE_STRUCT_SOCKADDR_STORAGE 1 - -/* Define to 1 if the system has the type `struct sockaddr_un'. */ -#define HAVE_STRUCT_SOCKADDR_UN 1 - -/* Define to 1 if `f_fstypename' is a member of `struct statfs'. */ -/* #undef HAVE_STRUCT_STATFS_F_FSTYPENAME */ - -/* Define to 1 if `f_type' is a member of `struct statfs'. */ -#define HAVE_STRUCT_STATFS_F_TYPE 1 - -/* Define to 1 if `f_basetype' is a member of `struct statvfs'. */ -/* #undef HAVE_STRUCT_STATVFS_F_BASETYPE */ - -/* Define to 1 if `f_fstypename' is a member of `struct statvfs'. 
*/ -/* #undef HAVE_STRUCT_STATVFS_F_FSTYPENAME */ - -/* Define to 1 if you have the `syscall' function. */ -#define HAVE_SYSCALL 1 - -/* Define to 1 if you have the `sysconf' function. */ -#define HAVE_SYSCONF 1 - -/* Define to '1' if sysctl is present and usable */ -#define HAVE_SYSCTL 1 - -/* Define to '1' if sysctlbyname is present and usable */ -/* #undef HAVE_SYSCTLBYNAME */ - -/* Define to 1 if you have the `syslog' function. */ -#define HAVE_SYSLOG 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYSLOG_H 1 - -/* Define to 1 if the system has the type - `SYSTEM_LOGICAL_PROCESSOR_INFORMATION'. */ -/* #undef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION */ - -/* Define to 1 if the system has the type - `SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX'. */ -/* #undef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_CPUSET_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_FCNTL_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_IOCTL_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_IPC_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_LGRP_USER_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_MMAN_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_MOUNT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_PARAM_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_POLL_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_PRCTL_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_QUEUE_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_RESOURCE_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_SELECT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_SHM_H 1 - -/* Define to 1 if you have the header file. 
*/ -#define HAVE_SYS_SOCKET_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_SOCKIO_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_STATFS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_STATVFS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_STAT_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_SYNCH_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_SYSCTL_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_TIME_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_TREE_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_TYPES_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_UIO_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_UN_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_UTSNAME_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_VFS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_WAIT_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_TARGETCONDITIONALS_H */ - -/* Define to 1 if you have the `tcgetpgrp' function. */ -#define HAVE_TCGETPGRP 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_TERMIOS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_TIME_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_TM_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_TM_TREE_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_UCONTEXT_H 1 - -/* Define to 1 if the system has the type `uint128_t'. */ -/* #undef HAVE_UINT128_T */ - -/* Define to 1 if the system has the type `uint16_t'. */ -#define HAVE_UINT16_T 1 - -/* Define to 1 if the system has the type `uint32_t'. */ -#define HAVE_UINT32_T 1 - -/* Define to 1 if the system has the type `uint64_t'. 
*/ -#define HAVE_UINT64_T 1 - -/* Define to 1 if the system has the type `uint8_t'. */ -#define HAVE_UINT8_T 1 - -/* Define to 1 if the system has the type `uintptr_t'. */ -#define HAVE_UINTPTR_T 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_ULIMIT_H 1 - -/* Define to 1 if you have the `uname' function. */ -#define HAVE_UNAME 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_UNISTD_H 1 - -/* whether unix byteswap routines -- htonl, htons, nothl, ntohs -- are - available */ -#define HAVE_UNIX_BYTESWAP 1 - -/* Define to 1 if you have the `usleep' function. */ -#define HAVE_USLEEP 1 - -/* libfabric: whether to build the usnic provider or not */ -/* #undef HAVE_USNIC */ - -/* libfabric: do not build usnic provider as a DL */ -/* #undef HAVE_USNIC_DL */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_UTIL_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_UTMP_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_VALGRIND_VALGRIND_H */ - -/* Define to 1 if you have the `vasprintf' function. */ -#define HAVE_VASPRINTF 1 - -/* libfabric: do not build verbs provider */ -/* #undef HAVE_VERBS */ - -/* libfabric: do not build verbs provider */ -/* #undef HAVE_VERBS_DL */ - -/* Define to 1 if you have the `vsnprintf' function. */ -#define HAVE_VSNPRINTF 1 - -/* Define to 1 if you have the `vsyslog' function. */ -#define HAVE_VSYSLOG 1 - -/* Define to 1 if you have the `waitpid' function. */ -#define HAVE_WAITPID 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_X11_KEYSYM_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_X11_XLIB_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_X11_XUTIL_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_XPMEM_H */ - -/* Define to 1 if you have the `_NSGetEnviron' function. */ -/* #undef HAVE__NSGETENVIRON */ - -/* Define to 1 if the system has the type `__float128'. 
*/ -#define HAVE___FLOAT128 1 - -/* Define to 1 if the system has the type `__int128'. */ -/* #undef HAVE___INT128 */ - -/* Define to 1 if you have the `__mmap' function. */ -/* #undef HAVE___MMAP */ - -/* Define to 1 if you have the `__munmap' function. */ -/* #undef HAVE___MUNMAP */ - -/* Define to 1 on AIX */ -/* #undef HWLOC_AIX_SYS */ - -/* Define to 1 on BlueGene/Q */ -/* #undef HWLOC_BGQ_SYS */ - -/* Whether C compiler supports symbol visibility or not */ -#define HWLOC_C_HAVE_VISIBILITY 1 - -/* Define to 1 on Darwin */ -/* #undef HWLOC_DARWIN_SYS */ - -/* Whether we are in debugging mode or not */ -/* #undef HWLOC_DEBUG */ - -/* Version of hwloc */ -/* #undef HWLOC_EXTERNAL_HWLOC_VERSION */ - -/* Define to 1 on *FREEBSD */ -/* #undef HWLOC_FREEBSD_SYS */ - -/* Whether your compiler has __attribute__ or not */ -#define HWLOC_HAVE_ATTRIBUTE 1 - -/* Whether your compiler has __attribute__ aligned or not */ -#define HWLOC_HAVE_ATTRIBUTE_ALIGNED 1 - -/* Whether your compiler has __attribute__ always_inline or not */ -#define HWLOC_HAVE_ATTRIBUTE_ALWAYS_INLINE 1 - -/* Whether your compiler has __attribute__ cold or not */ -#define HWLOC_HAVE_ATTRIBUTE_COLD 1 - -/* Whether your compiler has __attribute__ const or not */ -#define HWLOC_HAVE_ATTRIBUTE_CONST 1 - -/* Whether your compiler has __attribute__ deprecated or not */ -#define HWLOC_HAVE_ATTRIBUTE_DEPRECATED 1 - -/* Whether your compiler has __attribute__ format or not */ -#define HWLOC_HAVE_ATTRIBUTE_FORMAT 1 - -/* Whether your compiler has __attribute__ hot or not */ -#define HWLOC_HAVE_ATTRIBUTE_HOT 1 - -/* Whether your compiler has __attribute__ malloc or not */ -#define HWLOC_HAVE_ATTRIBUTE_MALLOC 1 - -/* Whether your compiler has __attribute__ may_alias or not */ -#define HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 1 - -/* Whether your compiler has __attribute__ nonnull or not */ -#define HWLOC_HAVE_ATTRIBUTE_NONNULL 1 - -/* Whether your compiler has __attribute__ noreturn or not */ -#define 
HWLOC_HAVE_ATTRIBUTE_NORETURN 1 - -/* Whether your compiler has __attribute__ no_instrument_function or not */ -#define HWLOC_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION 1 - -/* Whether your compiler has __attribute__ packed or not */ -#define HWLOC_HAVE_ATTRIBUTE_PACKED 1 - -/* Whether your compiler has __attribute__ pure or not */ -#define HWLOC_HAVE_ATTRIBUTE_PURE 1 - -/* Whether your compiler has __attribute__ sentinel or not */ -#define HWLOC_HAVE_ATTRIBUTE_SENTINEL 1 - -/* Whether your compiler has __attribute__ unused or not */ -#define HWLOC_HAVE_ATTRIBUTE_UNUSED 1 - -/* Whether your compiler has __attribute__ warn unused result or not */ -#define HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 1 - -/* Whether your compiler has __attribute__ weak alias or not */ -#define HWLOC_HAVE_ATTRIBUTE_WEAK_ALIAS 1 - -/* Define to 1 if your `ffs' function is known to be broken. */ -/* #undef HWLOC_HAVE_BROKEN_FFS */ - -/* Define to 1 if you have the `clz' function. */ -/* #undef HWLOC_HAVE_CLZ */ - -/* Define to 1 if you have the `clzl' function. */ -/* #undef HWLOC_HAVE_CLZL */ - -/* Define to 1 if the CPU_SET macro works */ -#define HWLOC_HAVE_CPU_SET 1 - -/* Define to 1 if the CPU_SET_S macro works */ -#define HWLOC_HAVE_CPU_SET_S 1 - -/* Define to 1 if you have the `cudart' SDK. 
*/ -/* #undef HWLOC_HAVE_CUDART */ - -/* Define to 1 if function `clz' is declared by system headers */ -/* #undef HWLOC_HAVE_DECL_CLZ */ - -/* Define to 1 if function `clzl' is declared by system headers */ -/* #undef HWLOC_HAVE_DECL_CLZL */ - -/* Define to 1 if function `ffs' is declared by system headers */ -#define HWLOC_HAVE_DECL_FFS 1 - -/* Define to 1 if function `ffsl' is declared by system headers */ -#define HWLOC_HAVE_DECL_FFSL 1 - -/* Define to 1 if function `fls' is declared by system headers */ -/* #undef HWLOC_HAVE_DECL_FLS */ - -/* Define to 1 if function `flsl' is declared by system headers */ -/* #undef HWLOC_HAVE_DECL_FLSL */ - -/* Define to 1 if function `strncasecmp' is declared by system headers */ -#define HWLOC_HAVE_DECL_STRNCASECMP 1 - -/* Define to 1 if you have the `ffs' function. */ -#define HWLOC_HAVE_FFS 1 - -/* Define to 1 if you have the `ffsl' function. */ -#define HWLOC_HAVE_FFSL 1 - -/* Define to 1 if you have the `fls' function. */ -/* #undef HWLOC_HAVE_FLS */ - -/* Define to 1 if you have the `flsl' function. */ -/* #undef HWLOC_HAVE_FLSL */ - -/* Define to 1 if you have the GL module components. */ -/* #undef HWLOC_HAVE_GL */ - -/* Define to 1 if you have the `libpciaccess' library. */ -/* #undef HWLOC_HAVE_LIBPCIACCESS */ - -/* Define to 1 if you have the `libxml2' library. */ -/* #undef HWLOC_HAVE_LIBXML2 */ - -/* Define to 1 if building the Linux PCI component */ -#define HWLOC_HAVE_LINUXPCI 1 - -/* Define to 1 if mbind is available. */ -/* #undef HWLOC_HAVE_MBIND */ - -/* Define to 1 if migrate_pages is available. */ -/* #undef HWLOC_HAVE_MIGRATE_PAGES */ - -/* Define to 1 if you have the `NVML' library. */ -/* #undef HWLOC_HAVE_NVML */ - -/* Define to 1 if glibc provides the old prototype (without length) of - sched_setaffinity() */ -/* #undef HWLOC_HAVE_OLD_SCHED_SETAFFINITY */ - -/* Define to 1 if you have the `OpenCL' library. 
*/ -/* #undef HWLOC_HAVE_OPENCL */ - -/* Define to 1 if `libpci' struct pci_dev has a `device_class' field. */ -/* #undef HWLOC_HAVE_PCIDEV_DEVICE_CLASS */ - -/* Define to 1 if `libpci' struct pci_dev has a `domain' field. */ -/* #undef HWLOC_HAVE_PCIDEV_DOMAIN */ - -/* Define to 1 if you have the pciutils `libpci' library. */ -/* #undef HWLOC_HAVE_PCIUTILS */ - -/* Define to 1 if `libpci' has the `pci_find_cap' function. */ -/* #undef HWLOC_HAVE_PCI_FIND_CAP */ - -/* Define to 1 if the hwloc library should support dynamically-loaded plugins - */ -/* #undef HWLOC_HAVE_PLUGINS */ - -/* `Define to 1 if you have pthread_getthrds_np' */ -/* #undef HWLOC_HAVE_PTHREAD_GETTHRDS_NP */ - -/* Define to 1 if pthread mutexes are available */ -#define HWLOC_HAVE_PTHREAD_MUTEX 1 - -/* Define to 1 if glibc provides a prototype of sched_setaffinity() */ -#define HWLOC_HAVE_SCHED_SETAFFINITY 1 - -/* Define to 1 if set_mempolicy is available. */ -/* #undef HWLOC_HAVE_SET_MEMPOLICY */ - -/* Define to 1 if you have the header file. */ -#define HWLOC_HAVE_STDINT_H 1 - -/* Define to 1 if you have the `windows.h' header. */ -/* #undef HWLOC_HAVE_WINDOWS_H */ - -/* Define to 1 if X11 headers including Xutil.h and keysym.h are available. 
*/ -#define HWLOC_HAVE_X11_KEYSYM 1 - -/* Define to 1 if you have x86 cpuid */ -#define HWLOC_HAVE_X86_CPUID 1 - -/* Define to 1 if the _syscall3 macro works */ -/* #undef HWLOC_HAVE__SYSCALL3 */ - -/* Define to 1 on HP-UX */ -/* #undef HWLOC_HPUX_SYS */ - -/* Version of hwloc */ -#define HWLOC_HWLOC191_HWLOC_VERSION "internal v1.9.2" - -/* Define to 1 on Irix */ -/* #undef HWLOC_IRIX_SYS */ - -/* Define to 1 on Linux */ -#define HWLOC_LINUX_SYS 1 - -/* Define to 1 on *NETBSD */ -/* #undef HWLOC_NETBSD_SYS */ - -/* Define to 1 on OSF */ -/* #undef HWLOC_OSF_SYS */ - -/* The size of `unsigned int', as computed by sizeof */ -#define HWLOC_SIZEOF_UNSIGNED_INT 4 - -/* The size of `unsigned long', as computed by sizeof */ -#define HWLOC_SIZEOF_UNSIGNED_LONG 8 - -/* Define to 1 on Solaris */ -/* #undef HWLOC_SOLARIS_SYS */ - -/* The hwloc symbol prefix */ -#define HWLOC_SYM_PREFIX opal_hwloc191_ - -/* The hwloc symbol prefix in all caps */ -#define HWLOC_SYM_PREFIX_CAPS OPAL_HWLOC191_ - -/* Whether we need to re-define all the hwloc public symbols or not */ -#define HWLOC_SYM_TRANSFORM 1 - -/* Define to 1 on unsupported systems */ -/* #undef HWLOC_UNSUPPORTED_SYS */ - -/* Define to 1 on WINDOWS */ -/* #undef HWLOC_WIN_SYS */ - -/* Define to 1 on x86_32 */ -/* #undef HWLOC_X86_32_ARCH */ - -/* Define to 1 on x86_64 */ -#define HWLOC_X86_64_ARCH 1 - -/* Define to the sub-directory in which libtool stores uninstalled libraries. 
- */ -#define LT_OBJDIR ".libs/" - -/* Header to include for event implementation */ -#define MCA_event_IMPLEMENTATION_HEADER "opal/mca/event/libevent2022/libevent2022.h" - -/* Header to include for hwloc implementation */ -#define MCA_hwloc_IMPLEMENTATION_HEADER "opal/mca/hwloc/hwloc191/hwloc191.h" - -/* Location of external hwloc header */ -/* #undef MCA_hwloc_external_header */ - -/* Location of external hwloc header */ -/* #undef MCA_hwloc_external_openfabrics_header */ - -/* Complete set of command line arguments given to ROMIOs configure script */ -#define MCA_io_romio_COMPLETE_CONFIGURE_FLAGS " FROM_OMPI=yes CC='gcc -std=gnu99' CFLAGS='-g -Wall -Wundef -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wstrict-prototypes -Wcomment -pedantic -Werror-implicit-function-declaration -finline-functions -fno-strict-aliasing -pthread -D__EXTENSIONS__' CPPFLAGS=' -I/home/wwu12/ompi/ompi-gpu/opal/mca/hwloc/hwloc191/hwloc/include -I/home/wwu12/ompi/ompi-gpu/opal/mca/event/libevent2022/libevent -I/home/wwu12/ompi/ompi-gpu/opal/mca/event/libevent2022/libevent/include' FFLAGS='' LDFLAGS=' ' --enable-shared --disable-static --prefix=/home/wwu12/ompi/build-gpu --disable-aio --disable-weak-symbols --enable-strict" - -/* Set of user-defined configure flags given to ROMIOs configure script via - --with-io-romio-flags */ -#define MCA_io_romio_USER_CONFIGURE_FLAGS "" - -/* Header to include for memcpy implementation */ -#define MCA_memcpy_IMPLEMENTATION_HEADER "opal/mca/memcpy/base/memcpy_base_default.h" - -/* Header to include for parts of the memory implementation */ -#define MCA_memory_IMPLEMENTATION_HEADER "opal/mca/memory/base/empty.h" - -/* Defined to 1 if ompi:mtl should use direct calls instead of components */ -#define MCA_ompi_mtl_DIRECT_CALL 0 - -/* name of component to use for direct calls, if MCA_ompi_mtl_DIRECT_CALL is 1 - */ -#define MCA_ompi_mtl_DIRECT_CALL_COMPONENT - -/* Header ompi:mtl includes to be direct called */ -#define MCA_ompi_mtl_DIRECT_CALL_HEADER 
"" - -/* Defined to 1 if ompi:pml should use direct calls instead of components */ -#define MCA_ompi_pml_DIRECT_CALL 0 - -/* name of component to use for direct calls, if MCA_ompi_pml_DIRECT_CALL is 1 - */ -#define MCA_ompi_pml_DIRECT_CALL_COMPONENT - -/* Header ompi:pml includes to be direct called */ -#define MCA_ompi_pml_DIRECT_CALL_HEADER "" - -/* Defined to 1 if oshmem:memheap should use direct calls instead of - components */ -#define MCA_oshmem_memheap_DIRECT_CALL 0 - -/* name of component to use for direct calls, if - MCA_oshmem_memheap_DIRECT_CALL is 1 */ -#define MCA_oshmem_memheap_DIRECT_CALL_COMPONENT - -/* Header oshmem:memheap includes to be direct called */ -#define MCA_oshmem_memheap_DIRECT_CALL_HEADER "" - -/* Defined to 1 if oshmem:spml should use direct calls instead of components - */ -#define MCA_oshmem_spml_DIRECT_CALL 0 - -/* name of component to use for direct calls, if MCA_oshmem_spml_DIRECT_CALL - is 1 */ -#define MCA_oshmem_spml_DIRECT_CALL_COMPONENT - -/* Header oshmem:spml includes to be direct called */ -#define MCA_oshmem_spml_DIRECT_CALL_HEADER "" - -/* Header to include for rte implementation */ -#define MCA_rte_IMPLEMENTATION_HEADER "ompi/mca/rte/orte/rte_orte.h" - -/* Header to include for timer implementation */ -#define MCA_timer_IMPLEMENTATION_HEADER "opal/mca/timer/linux/timer_linux.h" - -/* Whether ptmalloc2 is supported on this system or not */ -#define MEMORY_LINUX_PTMALLOC2 1 - -/* Whether ummunotify is supported on this system or not */ -#define MEMORY_LINUX_UMMUNOTIFY 0 - -/* Whether we can use M-PAGE supported since MOFED 1.8 */ -#define MPAGE_ENABLE 0 - -/* create_flags field is part of ibv_exp_reg_mr_in */ -#define MPAGE_HAVE_IBV_EXP_REG_MR_CREATE_FLAGS 0 - -/* exp_access field is part of ibv_exp_reg_shared_mr_in */ -#define MPAGE_HAVE_SMR_EXP_ACCESS 0 - -/* Maximum value for an MPI_Count */ -#define MPI_COUNT_MAX 0x7fffffffffffffffll - -/* Whether we want to check MPI parameters always, never, or decide at - run-time 
*/ -#define MPI_PARAM_CHECK ompi_mpi_param_check - -/* Alignment of Fortran CHARACTER */ -#define OMPI_ALIGNMENT_FORTRAN_CHARACTER 1 - -/* Alignment of Fortran COMPLEX */ -#define OMPI_ALIGNMENT_FORTRAN_COMPLEX 4 - -/* Alignment of Fortran COMPLEX*16 */ -#define OMPI_ALIGNMENT_FORTRAN_COMPLEX16 8 - -/* Alignment of Fortran COMPLEX*32 */ -#define OMPI_ALIGNMENT_FORTRAN_COMPLEX32 4 - -/* Alignment of Fortran COMPLEX*4 */ -#define OMPI_ALIGNMENT_FORTRAN_COMPLEX4 4 - -/* Alignment of Fortran COMPLEX*8 */ -#define OMPI_ALIGNMENT_FORTRAN_COMPLEX8 4 - -/* Alignment of Fortran DOUBLE COMPLEX */ -#define OMPI_ALIGNMENT_FORTRAN_DOUBLE_COMPLEX 8 - -/* Alignment of Fortran DOUBLE PRECISION */ -#define OMPI_ALIGNMENT_FORTRAN_DOUBLE_PRECISION 8 - -/* Alignment of Fortran INTEGER */ -#define OMPI_ALIGNMENT_FORTRAN_INTEGER 4 - -/* Alignment of Fortran INTEGER*1 */ -#define OMPI_ALIGNMENT_FORTRAN_INTEGER1 1 - -/* Alignment of Fortran INTEGER*16 */ -#define OMPI_ALIGNMENT_FORTRAN_INTEGER16 4 - -/* Alignment of Fortran INTEGER*2 */ -#define OMPI_ALIGNMENT_FORTRAN_INTEGER2 2 - -/* Alignment of Fortran INTEGER*4 */ -#define OMPI_ALIGNMENT_FORTRAN_INTEGER4 4 - -/* Alignment of Fortran INTEGER*8 */ -#define OMPI_ALIGNMENT_FORTRAN_INTEGER8 8 - -/* Alignment of Fortran LOGICAL */ -#define OMPI_ALIGNMENT_FORTRAN_LOGICAL 4 - -/* Alignment of Fortran LOGICAL*1 */ -#define OMPI_ALIGNMENT_FORTRAN_LOGICAL1 1 - -/* Alignment of Fortran LOGICAL*2 */ -#define OMPI_ALIGNMENT_FORTRAN_LOGICAL2 2 - -/* Alignment of Fortran LOGICAL*4 */ -#define OMPI_ALIGNMENT_FORTRAN_LOGICAL4 4 - -/* Alignment of Fortran LOGICAL*8 */ -#define OMPI_ALIGNMENT_FORTRAN_LOGICAL8 8 - -/* Alignment of Fortran REAL */ -#define OMPI_ALIGNMENT_FORTRAN_REAL 4 - -/* Alignment of Fortran REAL*16 */ -#define OMPI_ALIGNMENT_FORTRAN_REAL16 4 - -/* Alignment of Fortran REAL*2 */ -#define OMPI_ALIGNMENT_FORTRAN_REAL2 4 - -/* Alignment of Fortran REAL*4 */ -#define OMPI_ALIGNMENT_FORTRAN_REAL4 4 - -/* Alignment of Fortran REAL*8 */ 
-#define OMPI_ALIGNMENT_FORTRAN_REAL8 8 - -/* Whether we want MPI C++ support or not */ -#define OMPI_BUILD_CXX_BINDINGS 0 - -/* Whether we built the 'use mpi_f08' prototype subarray-based implementation - or not (i.e., whether to build the use-mpi-f08-desc prototype or the - regular use-mpi-f08 implementation) */ -#define OMPI_BUILD_FORTRAN_F08_SUBARRAYS 0 - -/* Whether we will build the MPI Fortran mpif.h bindings or not */ -#define OMPI_BUILD_FORTRAN_MPIFH_BINDINGS 1 - -/* For ompi_info: Whether we will build the MPI Fortran "use mpi_f08" bindings - or not */ -#define OMPI_BUILD_FORTRAN_USEMPIF08_BINDINGS 0 - -/* Whether we will build the MPI Fortran "use mpi" bindings or not */ -#define OMPI_BUILD_FORTRAN_USEMPI_BINDINGS 1 - -/* OMPI underlying C++ compiler */ -#define OMPI_CXX "g++" - -/* Whether C++ compiler supports __builtin_expect */ -#define OMPI_CXX_HAVE_BUILTIN_EXPECT 0 - -/* Whether C++ compiler supports __builtin_prefetch */ -#define OMPI_CXX_HAVE_BUILTIN_PREFETCH 0 - -/* Whether a const_cast on a 2-d array will work with the C++ compiler */ -#define OMPI_CXX_SUPPORTS_2D_CONST_CAST 0 - -/* Enable contributed software package libompitrace */ -#define OMPI_ENABLE_CONTRIB_libompitrace 1 - -/* Whether we want MPI profiling or not */ -#define OMPI_ENABLE_MPI_PROFILING 1 - -/* Enable MPI_THREAD_MULTIPLE */ -#define OMPI_ENABLE_THREAD_MULTIPLE 0 - -/* Underlying Fortran compiler */ -#define OMPI_FC "gfortran" - -/* Absolutey path to the underlying Fortran compiler found by configure */ -#define OMPI_FC_ABSOLUTE "/usr/bin/gfortran" - -/* Whether the mpif.h interface supports the MPI_SIZEOF interface or not */ -#define OMPI_FORTRAN_BUILD_SIZEOF 0 - -/* Whether fortran symbols are all caps or not */ -#define OMPI_FORTRAN_CAPS 0 - -/* Whether fortran symbols have a trailing double underscore or not */ -#define OMPI_FORTRAN_DOUBLE_UNDERSCORE 0 - -/* How many bytes the mpi_f08 TYPE(MPI_) handles will be */ -#define OMPI_FORTRAN_F08_HANDLE_SIZE 4 - -/* Max handle 
value for fortran MPI handles, effectively min(INT_MAX, max - fortran INTEGER value) */ -#define OMPI_FORTRAN_HANDLE_MAX 2147483647 - -/* For mpi-f08-interfaces-callbacks.f90 and ompi_info: whether the compiler - supports the "abstract" keyword or not */ -#define OMPI_FORTRAN_HAVE_ABSTRACT 0 - -/* For ompi/mpi/fortran/use-mpi-f08/blah.F90 and blah.h and ompi_info: whether - the compiler supports the "asynchronous" keyword or not */ -#define OMPI_FORTRAN_HAVE_ASYNCHRONOUS 0 - -/* For ompi_info: Whether the compiler supports all forms of BIND(C) that we - need */ -#define OMPI_FORTRAN_HAVE_BIND_C 0 - -/* For ompi_info: Whether the compiler supports SUBROUTINE ... BIND(C) or not - */ -#define OMPI_FORTRAN_HAVE_BIND_C_SUB 0 - -/* For ompi_info: Whether the compiler supports TYPE, BIND(C) or not */ -#define OMPI_FORTRAN_HAVE_BIND_C_TYPE 0 - -/* For ompi_info: Whether the compiler supports TYPE, BIND(C, NAME="name") or - not */ -#define OMPI_FORTRAN_HAVE_BIND_C_TYPE_NAME 0 - -/* For ompi/mpi/fortran/use-mpi-f08/blah.F90 and blah.h and ompi_info: whether - the compiler supports c_funloc or not */ -#define OMPI_FORTRAN_HAVE_C_FUNLOC 0 - -/* For ompi_info: Whether the Fortran compiler supports the Fortran 2008 - "assumed rank" syntax or not */ -#define OMPI_FORTRAN_HAVE_F08_ASSUMED_RANK 0 - -/* Whether the Fortran compiler supports ignore TKR functionality or not */ -#define OMPI_FORTRAN_HAVE_IGNORE_TKR 0 - -/* Whether the compiler supports INTERFACE or not */ -#define OMPI_FORTRAN_HAVE_INTERFACE 1 - -/* For ompi_info: Whether the compiler supports ISO_C_BINDING or not */ -#define OMPI_FORTRAN_HAVE_ISO_C_BINDING 1 - -/* Whether the compiler supports ISO_FORTRAN_ENV or not */ -#define OMPI_FORTRAN_HAVE_ISO_FORTRAN_ENV 0 - -/* For ompi_info: whether the Fortran compiler supports optional arguments or - not */ -#define OMPI_FORTRAN_HAVE_OPTIONAL_ARGS 0 - -/* For mpi-f08-types.f90 and ompi_info: whether the compiler supports the - "private" keyword or not (used in MPI_Status) 
*/ -#define OMPI_FORTRAN_HAVE_PRIVATE 0 - -/* For ompi/mpi/fortran/use-mpi-f08/blah.F90 and blah.h and ompi_info: whether - the compiler supports the "procedure" keyword or not */ -#define OMPI_FORTRAN_HAVE_PROCEDURE 0 - -/* For mpi-f08-types.f90 and .F90 and ompi_info: whether the compiler supports - the "protected" keyword or not */ -#define OMPI_FORTRAN_HAVE_PROTECTED 0 - -/* Whether the compiler supports STORAGE_SIZE on relevant types */ -#define OMPI_FORTRAN_HAVE_STORAGE_SIZE 0 - -/* Pre declaration for FORTRAN ignore parameter TKR behavior */ -#define OMPI_FORTRAN_IGNORE_TKR_PREDECL "" - -/* Type declaration for FORTRAN ignore parameter TKR behavior */ -#define OMPI_FORTRAN_IGNORE_TKR_TYPE - -/* Max dimension rank of Fortran arrays */ -#define OMPI_FORTRAN_MAX_ARRAY_RANK 7 - -/* Whether the mpi_f08 implementation is using wrapper routines ("bad" Fortran - compiler) or weak symbols ("good" Fortran compiler) for the F08 interface - definition implementations */ -#define OMPI_FORTRAN_NEED_WRAPPER_ROUTINES 0 - -/* Whether fortran symbols have no trailing underscore or not */ -#define OMPI_FORTRAN_PLAIN 0 - -/* Whether fortran symbols have a trailing underscore or not */ -#define OMPI_FORTRAN_SINGLE_UNDERSCORE 1 - -/* Value to load to the MPI_SUBARRAYS_SUPPORTED compile-time constant */ -#define OMPI_FORTRAN_SUBARRAYS_SUPPORTED .FALSE. - -/* Fortran value for LOGICAL .TRUE. 
value */ -#define OMPI_FORTRAN_VALUE_TRUE 1 - -/* Greek - alpha, beta, etc - release number of Open MPI */ -#define OMPI_GREEK_VERSION "a1" - -/* Wether we want sparse process groups */ -#define OMPI_GROUP_SPARSE 0 - -/* Whether or not we have compiled with C++ exceptions support */ -#define OMPI_HAVE_CXX_EXCEPTION_SUPPORT 0 - -/* Whether we have Fortran CHARACTER or not */ -#define OMPI_HAVE_FORTRAN_CHARACTER 1 - -/* Whether we have Fortran COMPLEX or not */ -#define OMPI_HAVE_FORTRAN_COMPLEX 1 - -/* Whether we have Fortran COMPLEX*16 or not */ -#define OMPI_HAVE_FORTRAN_COMPLEX16 1 - -/* Whether we have Fortran COMPLEX*32 or not */ -#define OMPI_HAVE_FORTRAN_COMPLEX32 0 - -/* Whether we have Fortran COMPLEX*4 or not */ -#define OMPI_HAVE_FORTRAN_COMPLEX4 0 - -/* Whether we have Fortran COMPLEX*8 or not */ -#define OMPI_HAVE_FORTRAN_COMPLEX8 1 - -/* Whether we have Fortran DOUBLE COMPLEX or not */ -#define OMPI_HAVE_FORTRAN_DOUBLE_COMPLEX 1 - -/* Whether we have Fortran DOUBLE PRECISION or not */ -#define OMPI_HAVE_FORTRAN_DOUBLE_PRECISION 1 - -/* Whether we have Fortran INTEGER or not */ -#define OMPI_HAVE_FORTRAN_INTEGER 1 - -/* Whether we have Fortran INTEGER*1 or not */ -#define OMPI_HAVE_FORTRAN_INTEGER1 1 - -/* Whether we have Fortran INTEGER*16 or not */ -#define OMPI_HAVE_FORTRAN_INTEGER16 0 - -/* Whether we have Fortran INTEGER*2 or not */ -#define OMPI_HAVE_FORTRAN_INTEGER2 1 - -/* Whether we have Fortran INTEGER*4 or not */ -#define OMPI_HAVE_FORTRAN_INTEGER4 1 - -/* Whether we have Fortran INTEGER*8 or not */ -#define OMPI_HAVE_FORTRAN_INTEGER8 1 - -/* Whether we have Fortran LOGICAL or not */ -#define OMPI_HAVE_FORTRAN_LOGICAL 1 - -/* Whether we have Fortran LOGICAL*1 or not */ -#define OMPI_HAVE_FORTRAN_LOGICAL1 1 - -/* Whether we have Fortran LOGICAL*2 or not */ -#define OMPI_HAVE_FORTRAN_LOGICAL2 1 - -/* Whether we have Fortran LOGICAL*4 or not */ -#define OMPI_HAVE_FORTRAN_LOGICAL4 1 - -/* Whether we have Fortran LOGICAL*8 or not */ -#define 
OMPI_HAVE_FORTRAN_LOGICAL8 1 - -/* Whether we have Fortran REAL or not */ -#define OMPI_HAVE_FORTRAN_REAL 1 - -/* Whether we have Fortran REAL*16 or not */ -#define OMPI_HAVE_FORTRAN_REAL16 0 - -/* Whether we have Fortran REAL*2 or not */ -#define OMPI_HAVE_FORTRAN_REAL2 0 - -/* Whether we have Fortran REAL*4 or not */ -#define OMPI_HAVE_FORTRAN_REAL4 1 - -/* Whether we have Fortran REAL*8 or not */ -#define OMPI_HAVE_FORTRAN_REAL8 1 - -/* Fortrn KIND number for CHARACTER */ -#define OMPI_KIND_FORTRAN_CHARACTER C_SIGNED_CHAR - -/* Fortrn KIND number for COMPLEX */ -#define OMPI_KIND_FORTRAN_COMPLEX C_FLOAT_COMPLEX - -/* Fortrn KIND number for COMPLEX*16 */ -#define OMPI_KIND_FORTRAN_COMPLEX16 C_DOUBLE_COMPLEX - -/* Fortrn KIND number for COMPLEX*32 */ -#define OMPI_KIND_FORTRAN_COMPLEX32 0 - -/* Fortrn KIND number for COMPLEX*4 */ -#define OMPI_KIND_FORTRAN_COMPLEX4 0 - -/* Fortrn KIND number for COMPLEX*8 */ -#define OMPI_KIND_FORTRAN_COMPLEX8 C_FLOAT_COMPLEX - -/* Fortrn KIND number for DOUBLE COMPLEX */ -#define OMPI_KIND_FORTRAN_DOUBLE_COMPLEX C_DOUBLE_COMPLEX - -/* Fortrn KIND number for DOUBLE PRECISION */ -#define OMPI_KIND_FORTRAN_DOUBLE_PRECISION C_DOUBLE - -/* Fortrn KIND number for INTEGER */ -#define OMPI_KIND_FORTRAN_INTEGER C_INT - -/* Fortrn KIND number for INTEGER*1 */ -#define OMPI_KIND_FORTRAN_INTEGER1 C_SIGNED_CHAR - -/* Fortrn KIND number for INTEGER*16 */ -#define OMPI_KIND_FORTRAN_INTEGER16 0 - -/* Fortrn KIND number for INTEGER*2 */ -#define OMPI_KIND_FORTRAN_INTEGER2 C_SHORT - -/* Fortrn KIND number for INTEGER*4 */ -#define OMPI_KIND_FORTRAN_INTEGER4 C_INT - -/* Fortrn KIND number for INTEGER*8 */ -#define OMPI_KIND_FORTRAN_INTEGER8 C_LONG_LONG - -/* Fortrn KIND number for LOGICAL */ -#define OMPI_KIND_FORTRAN_LOGICAL C_INT - -/* Fortrn KIND number for LOGICAL*1 */ -#define OMPI_KIND_FORTRAN_LOGICAL1 C_SIGNED_CHAR - -/* Fortrn KIND number for LOGICAL*2 */ -#define OMPI_KIND_FORTRAN_LOGICAL2 C_SHORT - -/* Fortrn KIND number for LOGICAL*4 */ 
-#define OMPI_KIND_FORTRAN_LOGICAL4 C_INT - -/* Fortrn KIND number for LOGICAL*8 */ -#define OMPI_KIND_FORTRAN_LOGICAL8 C_LONG_LONG - -/* Fortrn KIND number for REAL */ -#define OMPI_KIND_FORTRAN_REAL C_FLOAT - -/* Fortrn KIND number for REAL*16 */ -#define OMPI_KIND_FORTRAN_REAL16 0 - -/* Fortrn KIND number for REAL*2 */ -#define OMPI_KIND_FORTRAN_REAL2 0 - -/* Fortrn KIND number for REAL*4 */ -#define OMPI_KIND_FORTRAN_REAL4 C_FLOAT - -/* Fortrn KIND number for REAL*8 */ -#define OMPI_KIND_FORTRAN_REAL8 C_DOUBLE - -/* Major release number of Open MPI */ -#define OMPI_MAJOR_VERSION 1 - -/* Minor release number of Open MPI */ -#define OMPI_MINOR_VERSION 9 - -/* MPI Extensions included in libmpi */ -#define OMPI_MPIEXT_COMPONENTS "" - -/* Type of MPI_Aint */ -#define OMPI_MPI_AINT_TYPE ptrdiff_t - -/* Contributed software packages built with Open MPI */ -#define OMPI_MPI_CONTRIBS "libompitrace" - -/* Size of the MPI_Count datatype */ -#define OMPI_MPI_COUNT_SIZE 8 - -/* Type of the MPI_Count datatype */ -#define OMPI_MPI_COUNT_TYPE long long - -/* Size of the MPI_Offset */ -#define OMPI_MPI_OFFSET_SIZE 8 - -/* Type of MPI_Offset */ -#define OMPI_MPI_OFFSET_TYPE long long - -/* Enable flow control for Portals4 MTL */ -#define OMPI_MTL_PORTALS4_FLOW_CONTROL 1 - -/* MPI datatype corresponding to MPI_Offset */ -#define OMPI_OFFSET_DATATYPE MPI_LONG_LONG - -/* Whether we want to check MPI parameters never or possible (an integer - constant) */ -#define OMPI_PARAM_CHECK 1 - -/* Index into endpoint array for BML */ -#define OMPI_PROC_ENDPOINT_TAG_BML 0 - -/* Maximum number of endpoint entries to be attached to an ompi_proc_t */ -#define OMPI_PROC_ENDPOINT_TAG_MAX 1 - -/* Index into endpoint array for MTL */ -/* #undef OMPI_PROC_ENDPOINT_TAG_MTL */ - -/* Index into endpoint array for PML */ -/* #undef OMPI_PROC_ENDPOINT_TAG_PML */ - -/* Index into endpoint array for PORTALS4 */ -/* #undef OMPI_PROC_ENDPOINT_TAG_PORTALS4 */ - -/* Whether OMPI should provide MPI File 
interface */ -#define OMPI_PROVIDE_MPI_FILE_INTERFACE 1 - -/* Whether Fortran REAL*16 matches the bit format of the equivalent C type */ -#define OMPI_REAL16_MATCHES_C 0 - -/* Release date of Open MPI */ -#define OMPI_RELEASE_DATE "Unreleased developer copy" - -/* Release release number of Open MPI */ -#define OMPI_RELEASE_VERSION 0 - -/* The repository version Open MPI */ -#define OMPI_REPO_REV "dev-1510-g40fe521" - -/* Defined to 1 if the OMPI runtime component is ORTE */ -#define OMPI_RTE_ORTE 1 - -/* Size of Fortran CHARACTER */ -#define OMPI_SIZEOF_FORTRAN_CHARACTER 1 - -/* Size of Fortran COMPLEX */ -#define OMPI_SIZEOF_FORTRAN_COMPLEX 8 - -/* Size of Fortran COMPLEX*16 */ -#define OMPI_SIZEOF_FORTRAN_COMPLEX16 16 - -/* Size of Fortran COMPLEX*32 */ -#define OMPI_SIZEOF_FORTRAN_COMPLEX32 4 - -/* Size of Fortran COMPLEX*4 */ -#define OMPI_SIZEOF_FORTRAN_COMPLEX4 4 - -/* Size of Fortran COMPLEX*8 */ -#define OMPI_SIZEOF_FORTRAN_COMPLEX8 8 - -/* Size of Fortran DOUBLE COMPLEX */ -#define OMPI_SIZEOF_FORTRAN_DOUBLE_COMPLEX 16 - -/* Size of Fortran DOUBLE PRECISION */ -#define OMPI_SIZEOF_FORTRAN_DOUBLE_PRECISION 8 - -/* Size of Fortran INTEGER */ -#define OMPI_SIZEOF_FORTRAN_INTEGER 4 - -/* Size of Fortran INTEGER*1 */ -#define OMPI_SIZEOF_FORTRAN_INTEGER1 1 - -/* Size of Fortran INTEGER*16 */ -#define OMPI_SIZEOF_FORTRAN_INTEGER16 16 - -/* Size of Fortran INTEGER*2 */ -#define OMPI_SIZEOF_FORTRAN_INTEGER2 2 - -/* Size of Fortran INTEGER*4 */ -#define OMPI_SIZEOF_FORTRAN_INTEGER4 4 - -/* Size of Fortran INTEGER*8 */ -#define OMPI_SIZEOF_FORTRAN_INTEGER8 8 - -/* Size of Fortran LOGICAL */ -#define OMPI_SIZEOF_FORTRAN_LOGICAL 4 - -/* Size of Fortran LOGICAL*1 */ -#define OMPI_SIZEOF_FORTRAN_LOGICAL1 1 - -/* Size of Fortran LOGICAL*2 */ -#define OMPI_SIZEOF_FORTRAN_LOGICAL2 2 - -/* Size of Fortran LOGICAL*4 */ -#define OMPI_SIZEOF_FORTRAN_LOGICAL4 4 - -/* Size of Fortran LOGICAL*8 */ -#define OMPI_SIZEOF_FORTRAN_LOGICAL8 8 - -/* Size of Fortran REAL */ -#define 
OMPI_SIZEOF_FORTRAN_REAL 4 - -/* Size of Fortran REAL*16 */ -#define OMPI_SIZEOF_FORTRAN_REAL16 4 - -/* Size of Fortran REAL*2 */ -#define OMPI_SIZEOF_FORTRAN_REAL2 4 - -/* Size of Fortran REAL*4 */ -#define OMPI_SIZEOF_FORTRAN_REAL4 4 - -/* Size of Fortran REAL*8 */ -#define OMPI_SIZEOF_FORTRAN_REAL8 8 - -/* Tarball filename version string of Open MPI */ -#define OMPI_TARBALL_VERSION "gitclone" - -/* Complete release number of Open MPI */ -#define OMPI_VERSION "0" - -/* do we want java mpi bindings */ -#define OMPI_WANT_JAVA_BINDINGS 0 - -/* do we want to try to work around C++ bindings SEEK_* issue? */ -#define OMPI_WANT_MPI_CXX_SEEK 1 - -/* Enable warnings when using deprecated MPI functions */ -#define OMPI_WANT_MPI_INTERFACE_WARNING 1 - -/* if the peruse interface should be enabled */ -#define OMPI_WANT_PERUSE 0 - -/* Alignment of type _Bool */ -#define OPAL_ALIGNMENT_BOOL 1 - -/* Alignment of type char */ -#define OPAL_ALIGNMENT_CHAR 1 - -/* Alignment of type bool */ -#define OPAL_ALIGNMENT_CXX_BOOL 1 - -/* Alignment of type double */ -#define OPAL_ALIGNMENT_DOUBLE 8 - -/* Alignment of type double _Complex */ -#define OPAL_ALIGNMENT_DOUBLE_COMPLEX 8 - -/* Alignment of type float */ -#define OPAL_ALIGNMENT_FLOAT 4 - -/* Alignment of type float _Complex */ -#define OPAL_ALIGNMENT_FLOAT_COMPLEX 4 - -/* Alignment of type int */ -#define OPAL_ALIGNMENT_INT 4 - -/* Alignment of type int128_t */ -/* #undef OPAL_ALIGNMENT_INT128 */ - -/* Alignment of type int16_t */ -#define OPAL_ALIGNMENT_INT16 2 - -/* Alignment of type int32_t */ -#define OPAL_ALIGNMENT_INT32 4 - -/* Alignment of type int64_t */ -#define OPAL_ALIGNMENT_INT64 8 - -/* Alignment of type int8_t */ -#define OPAL_ALIGNMENT_INT8 1 - -/* Alignment of type long */ -#define OPAL_ALIGNMENT_LONG 8 - -/* Alignment of type long double */ -#define OPAL_ALIGNMENT_LONG_DOUBLE 16 - -/* Alignment of type long double _Complex */ -#define OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX 16 - -/* Alignment of type long long */ 
-#define OPAL_ALIGNMENT_LONG_LONG 8 - -/* Alignment of type short */ -#define OPAL_ALIGNMENT_SHORT 2 - -/* Alignment of type size_t */ -#define OPAL_ALIGNMENT_SIZE_T 8 - -/* Alignment of type void * */ -#define OPAL_ALIGNMENT_VOID_P 8 - -/* Alignment of type wchar_t */ -#define OPAL_ALIGNMENT_WCHAR 4 - -/* Alignment of type __float128 */ -#define OPAL_ALIGNMENT___FLOAT128 16 - -/* set to 1 if word-size integers must be aligned to word-size padding to - prevent bus errors */ -#define OPAL_ALIGN_WORD_SIZE_INTEGERS 0 - -/* OMPI architecture string */ -#define OPAL_ARCH "x86_64-unknown-linux-gnu" - -/* Assembly align directive expects logarithmic value */ -#define OPAL_ASM_ALIGN_LOG - -/* What ARM assembly version to use */ -/* #undef OPAL_ASM_ARM_VERSION */ - -/* Assembly directive for exporting symbols */ -#define OPAL_ASM_GLOBAL ".globl" - -/* Assembly prefix for gsym labels */ -#define OPAL_ASM_GSYM "" - -/* Assembly suffix for labels */ -#define OPAL_ASM_LABEL_SUFFIX ":" - -/* Assembly prefix for lsym labels */ -#define OPAL_ASM_LSYM ".L" - -/* Do we need to give a .size directive */ -#define OPAL_ASM_SIZE "1" - -/* Whether we can do 64bit assembly operations or not. 
Should not be used - outside of the assembly header files */ -#define OPAL_ASM_SUPPORT_64BIT 1 - -/* Assembly directive for setting text section */ -#define OPAL_ASM_TEXT ".text" - -/* How to set function type in .type directive */ -#define OPAL_ASM_TYPE "@" - -/* Architecture type of assembly to use for atomic operations and CMA */ -#define OPAL_ASSEMBLY_ARCH OPAL_AMD64 - -/* Whether to use builtin atomics */ -#define OPAL_ASSEMBLY_BUILTIN OPAL_BUILTIN_NO - -/* Format of assembly file */ -#define OPAL_ASSEMBLY_FORMAT "default-.text-.globl-:--.L-@-1-0-1-1-1" - -/* Whether we have support for RDTSCP instruction */ -#define OPAL_ASSEMBLY_SUPPORTS_RDTSCP 0 - -/* Enable flow control for Portals4 BTL */ -#define OPAL_BTL_PORTALS4_FLOW_CONTROL 0 - -/* If CMA support can be enabled */ -#define OPAL_BTL_SM_HAVE_CMA 0 - -/* If knem support can be enabled */ -#define OPAL_BTL_SM_HAVE_KNEM 0 - -/* Path by which to include fi_ext_usnic.h */ -/* #undef OPAL_BTL_USNIC_FI_EXT_USNIC_H */ - -/* define to 1 if usnic BTL unit tests are enabled, 0 otherwise */ -#define OPAL_BTL_USNIC_UNIT_TESTS 0 - -/* If CMA support can be enabled within vader */ -#define OPAL_BTL_VADER_HAVE_CMA 0 - -/* If KNEM support can be enabled within vader */ -#define OPAL_BTL_VADER_HAVE_KNEM 0 - -/* If XPMEM support can be enabled within vader */ -#define OPAL_BTL_VADER_HAVE_XPMEM 0 - -/* The compiler $lower which OMPI was built with */ -#define OPAL_BUILD_PLATFORM_COMPILER_FAMILYID 1 - -/* The compiler $lower which OMPI was built with */ -#define OPAL_BUILD_PLATFORM_COMPILER_FAMILYNAME GNU - -/* The compiler $lower which OMPI was built with */ -#define OPAL_BUILD_PLATFORM_COMPILER_VERSION 263175 - -/* The compiler $lower which OMPI was built with */ -#define OPAL_BUILD_PLATFORM_COMPILER_VERSION_STR 4.4.7 - -/* OMPI underlying C compiler */ -#define OPAL_CC "gcc" - -/* Use static const char[] strings for C files */ -#define OPAL_CC_USE_CONST_CHAR_IDENT 0 - -/* Use #ident strings for C files */ -#define 
OPAL_CC_USE_IDENT 1 - -/* Use #pragma comment for C files */ -#define OPAL_CC_USE_PRAGMA_COMMENT - -/* Use #pragma ident strings for C files */ -#define OPAL_CC_USE_PRAGMA_IDENT 0 - -/* Need CMA syscalls defined */ -/* #undef OPAL_CMA_NEED_SYSCALL_DEFS */ - -/* Whether we have CUDA GDR support available */ -#define OPAL_CUDA_GDR_SUPPORT 1 - -/* Whether we have CUDA cuPointerGetAttributes function available */ -#define OPAL_CUDA_GET_ATTRIBUTES 1 - -/* Whether we want cuda device pointer support */ -#define OPAL_CUDA_SUPPORT 1 - -/* Whether we have CUDA 4.1 support available */ -#define OPAL_CUDA_SUPPORT_41 1 - -/* Whether we have CUDA CU_POINTER_ATTRIBUTE_SYNC_MEMOPS support available */ -#define OPAL_CUDA_SYNC_MEMOPS 1 - -/* OPAL underlying C++ compiler */ -#define OPAL_CXX "g++" - -/* Use static const char[] strings for C++ files */ -/* #undef OPAL_CXX_USE_CONST_CHAR_IDENT */ - -/* Use #ident strings for C++ files */ -/* #undef OPAL_CXX_USE_IDENT */ - -/* Use #pragma comment for C++ files */ -/* #undef OPAL_CXX_USE_PRAGMA_COMMENT */ - -/* Use #pragma ident strings for C++ files */ -/* #undef OPAL_CXX_USE_PRAGMA_IDENT */ - -/* Whether C compiler supports DEC style inline assembly */ -#define OPAL_C_DEC_INLINE_ASSEMBLY 0 - -/* Whether C compiler supports GCC style inline assembly */ -#define OPAL_C_GCC_INLINE_ASSEMBLY 1 - -/* Whether C compiler supports __builtin_clz */ -#define OPAL_C_HAVE_BUILTIN_CLZ 1 - -/* Whether C compiler supports __builtin_expect */ -#define OPAL_C_HAVE_BUILTIN_EXPECT 1 - -/* Whether C compiler supports __builtin_prefetch */ -#define OPAL_C_HAVE_BUILTIN_PREFETCH 1 - -/* Whether C compiler supports symbol visibility or not */ -#define OPAL_C_HAVE_VISIBILITY 1 - -/* Whether C compiler supports XLC style inline assembly */ -#define OPAL_C_XLC_INLINE_ASSEMBLY 0 - -/* Whether we have lt_dladvise or not */ -#define OPAL_DL_LIBLTDL_HAVE_LT_DLADVISE 0 - -/* Whether we want checkpoint/restart enabled debugging functionality or not - */ -#define 
OPAL_ENABLE_CRDEBUG 0 - -/* Whether we want developer-level debugging code or not */ -#define OPAL_ENABLE_DEBUG 1 - -/* Enable features required for dynamic SL support */ -#define OPAL_ENABLE_DYNAMIC_SL 0 - -/* Enable fault tolerance general components and logic */ -#define OPAL_ENABLE_FT 0 - -/* Enable fault tolerance checkpoint/restart components and logic */ -#define OPAL_ENABLE_FT_CR 0 - -/* Enable fault tolerance thread in Open PAL */ -#define OPAL_ENABLE_FT_THREAD 0 - -/* Disable getpwuid support (default: enabled) */ -#define OPAL_ENABLE_GETPWUID 1 - -/* Enable features required for heterogeneous support */ -#define OPAL_ENABLE_HETEROGENEOUS_SUPPORT 0 - -/* Enable IPv6 support, but only if the underlying system supports it */ -#define OPAL_ENABLE_IPV6 0 - -/* Whether we want the memory profiling or not */ -#define OPAL_ENABLE_MEM_DEBUG 1 - -/* Whether we want the memory profiling or not */ -#define OPAL_ENABLE_MEM_PROFILE 1 - -/* Whether we should enable thread support within the OPAL code base */ -#define OPAL_ENABLE_MULTI_THREADS 1 - -/* Whether we want BTL progress threads enabled */ -#define OPAL_ENABLE_PROGRESS_THREADS 0 - -/* Whether user wants PTY support or not */ -#define OPAL_ENABLE_PTY_SUPPORT 1 - -/* Whether we want developer-level timing framework or not */ -#define OPAL_ENABLE_TIMING 0 - -/* Greek - alpha, beta, etc - release number of Open Portable Access Layer */ -#define OPAL_GREEK_VERSION "a1" - -/* Whether there is an atomic assembly file available */ -#define OPAL_HAVE_ASM_FILE 1 - -/* Whether your compiler has __attribute__ or not */ -#define OPAL_HAVE_ATTRIBUTE 1 - -/* Whether your compiler has __attribute__ aligned or not */ -#define OPAL_HAVE_ATTRIBUTE_ALIGNED 1 - -/* Whether your compiler has __attribute__ always_inline or not */ -#define OPAL_HAVE_ATTRIBUTE_ALWAYS_INLINE 1 - -/* Whether your compiler has __attribute__ cold or not */ -#define OPAL_HAVE_ATTRIBUTE_COLD 1 - -/* Whether your compiler has __attribute__ const or not */ 
-#define OPAL_HAVE_ATTRIBUTE_CONST 1 - -/* Whether your compiler has __attribute__ deprecated or not */ -#define OPAL_HAVE_ATTRIBUTE_DEPRECATED 1 - -/* Whether your compiler has __attribute__ deprecated with optional argument - */ -#define OPAL_HAVE_ATTRIBUTE_DEPRECATED_ARGUMENT 0 - -/* Whether your compiler has __attribute__ destructor or not */ -#define OPAL_HAVE_ATTRIBUTE_DESTRUCTOR 1 - -/* Whether your compiler has __attribute__ format or not */ -#define OPAL_HAVE_ATTRIBUTE_FORMAT 1 - -/* Whether your compiler has __attribute__ format and it works on function - pointers */ -#define OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR 1 - -/* Whether your compiler has __attribute__ hot or not */ -#define OPAL_HAVE_ATTRIBUTE_HOT 1 - -/* Whether your compiler has __attribute__ malloc or not */ -#define OPAL_HAVE_ATTRIBUTE_MALLOC 1 - -/* Whether your compiler has __attribute__ may_alias or not */ -#define OPAL_HAVE_ATTRIBUTE_MAY_ALIAS 1 - -/* Whether your compiler has __attribute__ noinline or not */ -#define OPAL_HAVE_ATTRIBUTE_NOINLINE 1 - -/* Whether your compiler has __attribute__ nonnull or not */ -#define OPAL_HAVE_ATTRIBUTE_NONNULL 1 - -/* Whether your compiler has __attribute__ noreturn or not */ -#define OPAL_HAVE_ATTRIBUTE_NORETURN 1 - -/* Whether your compiler has __attribute__ noreturn and it works on function - pointers */ -#define OPAL_HAVE_ATTRIBUTE_NORETURN_FUNCPTR 1 - -/* Whether your compiler has __attribute__ no_instrument_function or not */ -#define OPAL_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION 1 - -/* Whether your compiler has __attribute__ packed or not */ -#define OPAL_HAVE_ATTRIBUTE_PACKED 1 - -/* Whether your compiler has __attribute__ pure or not */ -#define OPAL_HAVE_ATTRIBUTE_PURE 1 - -/* Whether your compiler has __attribute__ sentinel or not */ -#define OPAL_HAVE_ATTRIBUTE_SENTINEL 1 - -/* Whether your compiler has __attribute__ unused or not */ -#define OPAL_HAVE_ATTRIBUTE_UNUSED 1 - -/* Whether your compiler has __attribute__ visibility or not */ 
-#define OPAL_HAVE_ATTRIBUTE_VISIBILITY 1 - -/* Whether your compiler has __attribute__ warn unused result or not */ -#define OPAL_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 1 - -/* Whether your compiler has __attribute__ weak alias or not */ -#define OPAL_HAVE_ATTRIBUTE_WEAK_ALIAS 1 - -/* whether backtrace_execinfo is found and available */ -#define OPAL_HAVE_BACKTRACE_EXECINFO 1 - -/* whether qsort is broken or not */ -#define OPAL_HAVE_BROKEN_QSORT 0 - -/* whether ceil is found and available */ -#define OPAL_HAVE_CEIL 1 - -/* whether clock_gettime is found and available */ -#define OPAL_HAVE_CLOCK_GETTIME 1 - -/* Whether the processor supports the cmpxchg16b instruction */ -#define OPAL_HAVE_CMPXCHG16B 1 - -/* Enable features required for ConnectX XRC support */ -#define OPAL_HAVE_CONNECTX_XRC 0 - -/* Enable features required for XRC domains support */ -#define OPAL_HAVE_CONNECTX_XRC_DOMAINS 0 - -/* whether crs_blcr is found and available */ -/* #undef OPAL_HAVE_CRS_BLCR */ - -/* whether dirname is found and available */ -#define OPAL_HAVE_DIRNAME 1 - -/* Whether the OPAL DL framework is functional or not */ -#define OPAL_HAVE_DL_SUPPORT 1 - -/* whether fbtl_posix is found and available */ -#define OPAL_HAVE_FBTL_POSIX 1 - -/* whether gethostbyname is found and available */ -#define OPAL_HAVE_GETHOSTBYNAME 1 - -/* Whether we have hwloc support or not */ -#define OPAL_HAVE_HWLOC 1 - -/* do we have Java support */ -#define OPAL_HAVE_JAVA_SUPPORT 1 - -/* Do not use outside of mpi.h. Define to 1 if the system has the type `long - long'. 
*/ -#define OPAL_HAVE_LONG_LONG 1 - -/* whether openpty is found and available */ -#define OPAL_HAVE_OPENPTY 1 - -/* If PTHREADS implementation supports PTHREAD_MUTEX_ERRORCHECK */ -#define OPAL_HAVE_PTHREAD_MUTEX_ERRORCHECK 1 - -/* If PTHREADS implementation supports PTHREAD_MUTEX_ERRORCHECK_NP */ -#define OPAL_HAVE_PTHREAD_MUTEX_ERRORCHECK_NP 1 - -/* Whether RDMA CM is available or not */ -/* #undef OPAL_HAVE_RDMACM */ - -/* Enable RDMAoE support */ -/* #undef OPAL_HAVE_RDMAOE */ - -/* Whether we have SA_RESTART in or not */ -#define OPAL_HAVE_SA_RESTART 1 - -/* whether sched_yield is found and available */ -#define OPAL_HAVE_SCHED_YIELD 1 - -/* whether shmem_posix is found and available */ -#define OPAL_HAVE_SHMEM_POSIX 1 - -/* whether socket is found and available */ -#define OPAL_HAVE_SOCKET 1 - -/* Whether or not we have solaris */ -#define OPAL_HAVE_SOLARIS 0 - -/* Whether the __sync builtin atomic compare and swap supports 128-bit values - */ -/* #undef OPAL_HAVE_SYNC_BUILTIN_CSWAP_INT128 */ - -/* Do not use outside of mpi.h. Define to 1 if you have the - header file. */ -/* #undef OPAL_HAVE_SYS_SYNCH_H */ - -/* Do not use outside of mpi.h. Define to 1 if you have the - header file. 
*/ -#define OPAL_HAVE_SYS_TIME_H 1 - -/* Whether UD CM is available or not */ -/* #undef OPAL_HAVE_UDCM */ - -/* Whether we have __va_copy or not */ -#define OPAL_HAVE_UNDERSCORE_VA_COPY 1 - -/* Whether we have va_copy or not */ -#define OPAL_HAVE_VA_COPY 1 - -/* Whether we have weak symbols or not */ -#define OPAL_HAVE_WEAK_SYMBOLS 1 - -/* Whether our event component has working event operations or not (if not, - then assumedly it only has working timers and signals) */ -#define OPAL_HAVE_WORKING_EVENTOPS 1 - -/* whether yp_all_nsl is found and available */ -#define OPAL_HAVE_YP_ALL_NSL 1 - -/* Define to 1 ifyou have the declaration of _SC_NPROCESSORS_ONLN, and to 0 - otherwise */ -#define OPAL_HAVE__SC_NPROCESSORS_ONLN 1 - -/* Number of arguments to ibv_create_cq */ -/* #undef OPAL_IBV_CREATE_CQ_ARGS */ - -/* ident string for Open MPI */ -#define OPAL_IDENT_STRING "1.9.0a1" - -/* Major release number of Open Portable Access Layer */ -#define OPAL_MAJOR_VERSION 1 - -/* Maximum length of datarep strings (default is 128) */ -#define OPAL_MAX_DATAREP_STRING 128 - -/* Maximum length of error strings (default is 256) */ -#define OPAL_MAX_ERROR_STRING 256 - -/* Maximum length of info keys (default is 36) */ -#define OPAL_MAX_INFO_KEY 36 - -/* Maximum length of info vals (default is 256) */ -#define OPAL_MAX_INFO_VAL 256 - -/* Maximum length of object names (default is 64) */ -#define OPAL_MAX_OBJECT_NAME 64 - -/* Maximum length of port names (default is 1024) */ -#define OPAL_MAX_PORT_NAME 1024 - -/* Maximum length of processor names (default is 256) */ -#define OPAL_MAX_PROCESSOR_NAME 256 - -/* MCA cmd line identifier */ -#define OPAL_MCA_CMD_LINE_ID "mca" - -/* MCA prefix string for envars */ -#define OPAL_MCA_PREFIX "OMPI_MCA_" - -/* Whether any opal memory mca components were found */ -#define OPAL_MEMORY_HAVE_COMPONENT 1 - -/* Minor release number of Open Portable Access Layer */ -#define OPAL_MINOR_VERSION 9 - -/* Whether the C compiler supports "bool" without any 
other help (such as - ) */ -#define OPAL_NEED_C_BOOL 1 - -/* Add padding bytes to the openib BTL control header */ -#define OPAL_OPENIB_PAD_HDR 0 - -/* package/branding string for Open MPI */ -#define OPAL_PACKAGE_STRING "Open MPI wwu12@bunsen.icl.utk.edu Distribution" - -/* Log base 2 of the maximum size in bytes of a memory descriptor. Set to 0 if - MD can bind all of memory. */ -#define OPAL_PORTALS4_MAX_MD_SIZE 0 - -/* Log base 2 of the maximum size in bytes of the user virtual address space. - Set to 0 if MD can bind all of memory. */ -#define OPAL_PORTALS4_MAX_VA_SIZE 0 - -/* Whether r notation is used for ppc registers */ -/* #undef OPAL_POWERPC_R_REGISTERS */ - -/* type to use for ptrdiff_t */ -#define OPAL_PTRDIFF_TYPE ptrdiff_t - -/* Release date of Open Portable Access Layer */ -#define OPAL_RELEASE_DATE "Unreleased developer copy" - -/* Release release number of Open Portable Access Layer */ -#define OPAL_RELEASE_VERSION 0 - -/* The repository version Open Portable Access Layer */ -#define OPAL_REPO_REV "dev-1510-g40fe521" - -/* Whether we have shared memory support for mmap or not */ -#define OPAL_SHMEM_MMAP 1 - -/* Whether we have shared memory support for POSIX or not */ -#define OPAL_SHMEM_POSIX 1 - -/* Whether we have shared memory support for SYSV or not */ -#define OPAL_SHMEM_SYSV 1 - -/* Do not use outside of mpi.h. Define to 1 if you have the ANSI C header - files. 
*/ -#define OPAL_STDC_HEADERS 1 - -/* Tarball filename version string of Open Portable Access Layer */ -#define OPAL_TARBALL_VERSION "gitclone" - -/* Whether to use or not */ -#define OPAL_USE_STDBOOL_H 1 - -/* Complete release number of Open Portable Access Layer */ -#define OPAL_VERSION "0" - -/* Enable per-user config files */ -#define OPAL_WANT_HOME_CONFIG_FILES 1 - -/* if the memory and buffer checking should be enabled */ -#define OPAL_WANT_MEMCHECKER 0 - -/* if want pretty-print stack trace feature */ -#define OPAL_WANT_PRETTY_PRINT_STACKTRACE 1 - -/* whether we want to have smp locks in atomic ops or not */ -#define OPAL_WANT_SMP_LOCKS 1 - -/* Specific ps command to use in orte-clean */ -#define ORTE_CLEAN_PS_CMD "ps -A -o fname,pid,user" - -/* Whether we want static ports enabled */ -#define ORTE_ENABLE_STATIC_PORTS 1 - -/* Greek - alpha, beta, etc - release number of Open MPI Run-Time Environment - */ -#define ORTE_GREEK_VERSION "a1" - -/* Major release number of Open MPI Run-Time Environment */ -#define ORTE_MAJOR_VERSION 1 - -/* Minor release number of Open MPI Run-Time Environment */ -#define ORTE_MINOR_VERSION 9 - -/* Release date of Open MPI Run-Time Environment */ -#define ORTE_RELEASE_DATE "Unreleased developer copy" - -/* Release release number of Open MPI Run-Time Environment */ -#define ORTE_RELEASE_VERSION 0 - -/* The repository version Open MPI Run-Time Environment */ -#define ORTE_REPO_REV "dev-1510-g40fe521" - -/* Tarball filename version string of Open MPI Run-Time Environment */ -#define ORTE_TARBALL_VERSION "gitclone" - -/* Complete release number of Open MPI Run-Time Environment */ -#define ORTE_VERSION "0" - -/* Whether we want orterun to effect "--prefix $prefix" by default */ -#define ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT 0 - -/* Greek - alpha, beta, etc - release number of Open SHMEM */ -#define OSHMEM_GREEK_VERSION "a1" - -/* mxm support is available */ -/* #undef OSHMEM_HAS_ATOMIC_MXM */ - -/* Major release number of Open SHMEM */ 
-#define OSHMEM_MAJOR_VERSION 1 - -/* Minor release number of Open SHMEM */ -#define OSHMEM_MINOR_VERSION 9 - -/* Whether we want to check OSHMEM parameters always or never */ -#define OSHMEM_PARAM_CHECK 1 - -/* Release date of Open SHMEM */ -#define OSHMEM_RELEASE_DATE "Unreleased developer copy" - -/* Release release number of Open SHMEM */ -#define OSHMEM_RELEASE_VERSION 0 - -/* The repository version Open SHMEM */ -#define OSHMEM_REPO_REV "dev-1510-g40fe521" - -/* Whether user wants OSHMEM in compatibility mode or not */ -#define OSHMEM_SPEC_COMPAT 1 - -/* Whether we have shared memory support for mmap or not */ -#define OSHMEM_SSHMEM_MMAP 1 - -/* Whether we have shared memory support for SYSV or not */ -#define OSHMEM_SSHMEM_SYSV 1 - -/* Whether we have shared memory support for verbs or not */ -#define OSHMEM_SSHMEM_VERBS 0 - -/* Tarball filename version string of Open SHMEM */ -#define OSHMEM_TARBALL_VERSION "gitclone" - -/* Complete release number of Open SHMEM */ -#define OSHMEM_VERSION "0" - -/* do we want java oshmem bindings */ -#define OSHMEM_WANT_JAVA_BINDINGS 0 - -/* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "http://www.open-mpi.org/community/help/" - -/* Define to the full name of this package. */ -#define PACKAGE_NAME "Open MPI" - -/* Define to the full name and version of this package. */ -#define PACKAGE_STRING "Open MPI gitclone" - -/* Define to the one symbol short name of this package. */ -#define PACKAGE_TARNAME "openmpi" - -/* Define to the home page for this package. */ -#define PACKAGE_URL "" - -/* Define to the version of this package. */ -#define PACKAGE_VERSION "gitclone" - -/* Define PT_LOCK_SPIN to 1 if available. */ -/* #undef PT_LOCK_SPIN */ - -/* The size of `bool', as computed by sizeof. */ -#define SIZEOF_BOOL 1 - -/* The size of `char', as computed by sizeof. */ -#define SIZEOF_CHAR 1 - -/* The size of `double', as computed by sizeof. 
*/ -#define SIZEOF_DOUBLE 8 - -/* The size of `double _Complex', as computed by sizeof. */ -#define SIZEOF_DOUBLE__COMPLEX 16 - -/* The size of `float', as computed by sizeof. */ -#define SIZEOF_FLOAT 4 - -/* The size of `float _Complex', as computed by sizeof. */ -#define SIZEOF_FLOAT__COMPLEX 8 - -/* The size of `int', as computed by sizeof. */ -#define SIZEOF_INT 4 - -/* The size of `long', as computed by sizeof. */ -#define SIZEOF_LONG 8 - -/* The size of `long double', as computed by sizeof. */ -#define SIZEOF_LONG_DOUBLE 16 - -/* The size of `long double _Complex', as computed by sizeof. */ -#define SIZEOF_LONG_DOUBLE__COMPLEX 32 - -/* The size of `long long', as computed by sizeof. */ -#define SIZEOF_LONG_LONG 8 - -/* The size of `pid_t', as computed by sizeof. */ -#define SIZEOF_PID_T 4 - -/* The size of `ptrdiff_t', as computed by sizeof. */ -#define SIZEOF_PTRDIFF_T 8 - -/* The size of `short', as computed by sizeof. */ -#define SIZEOF_SHORT 2 - -/* The size of `size_t', as computed by sizeof. */ -#define SIZEOF_SIZE_T 8 - -/* The size of `ssize_t', as computed by sizeof. */ -#define SIZEOF_SSIZE_T 8 - -/* The size of `unsigned int', as computed by sizeof. */ -#define SIZEOF_UNSIGNED_INT 4 - -/* The size of `unsigned long', as computed by sizeof. */ -#define SIZEOF_UNSIGNED_LONG 8 - -/* The size of `void *', as computed by sizeof. */ -#define SIZEOF_VOID_P 8 - -/* The size of `wchar_t', as computed by sizeof. */ -#define SIZEOF_WCHAR_T 4 - -/* The size of `_Bool', as computed by sizeof. */ -#define SIZEOF__BOOL 1 - -/* The size of `__float128', as computed by sizeof. */ -#define SIZEOF___FLOAT128 16 - -/* Define to 1 if you have the ANSI C header files. */ -#define STDC_HEADERS 1 - -/* Enable extensions on HP-UX. */ -#ifndef _HPUX_SOURCE -# define _HPUX_SOURCE 1 -#endif - - -/* Whether to use the legacy Solaris munmap prototype or not */ -/* #undef USE_SOLARIS_LEGACY_MUNMAP_PROTOTYPE */ - -/* Enable extensions on AIX 3, Interix. 
*/ -#ifndef _ALL_SOURCE -# define _ALL_SOURCE 1 -#endif -/* Enable GNU extensions on systems that have them. */ -#ifndef _GNU_SOURCE -# define _GNU_SOURCE 1 -#endif -/* Enable threading extensions on Solaris. */ -#ifndef _POSIX_PTHREAD_SEMANTICS -# define _POSIX_PTHREAD_SEMANTICS 1 -#endif -/* Enable extensions on HP NonStop. */ -#ifndef _TANDEM_SOURCE -# define _TANDEM_SOURCE 1 -#endif -/* Enable general extensions on Solaris. */ -#ifndef __EXTENSIONS__ -# define __EXTENSIONS__ 1 -#endif - - -/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most - significant byte first (like Motorola and SPARC, unlike Intel). */ -#if defined AC_APPLE_UNIVERSAL_BUILD -# if defined __BIG_ENDIAN__ -# define WORDS_BIGENDIAN 1 -# endif -#else -# ifndef WORDS_BIGENDIAN -/* # undef WORDS_BIGENDIAN */ -# endif -#endif - -/* Additional CFLAGS to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_CFLAGS "-pthread " - -/* Additional CFLAGS_PREFIX to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_CFLAGS_PREFIX "" - -/* Additional CXXFLAGS to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_CXXFLAGS "-pthread " - -/* Additional CXXFLAGS_PREFIX to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_CXXFLAGS_PREFIX "" - -/* Additional FCFLAGS to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_FCFLAGS "-pthread -I${libdir}" - -/* Additional FCFLAGS to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_FCFLAGS_PREFIX "" - -/* Additional LDFLAGS to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_LDFLAGS " -Wl,-rpath -Wl,@{libdir} -Wl,--enable-new-dtags" - -/* Additional LIBS to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_LIBS "-lm -ldl -lutil -lrt " - -/* Whether the wrapper compilers add rpath flags by default */ -#define WRAPPER_RPATH_SUPPORT "runpath" - -/* Define to 1 if the X Window System is missing or not being used. 
*/ -/* #undef X_DISPLAY_MISSING */ - -/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a - `char[]'. */ -#define YYTEXT_POINTER 1 - -/* Enable GNU extensions on systems that have them. */ -#ifndef _GNU_SOURCE -# define _GNU_SOURCE 1 -#endif - -/* Are we building for HP-UX? */ -#define _HPUX_SOURCE 1 - -/* Define to 1 if on MINIX. */ -/* #undef _MINIX */ - -/* Define to 2 if the system does not provide POSIX.1 features except with - this defined. */ -/* #undef _POSIX_1_SOURCE */ - -/* Define to 1 if you need to in order for `stat' and other things to work. */ -/* #undef _POSIX_SOURCE */ - -/* Define this to the process ID type */ -#define hwloc_pid_t pid_t - -/* Define this to the thread ID type */ -#define hwloc_thread_t pthread_t - -/* Define to `__inline__' or `__inline' if that's what the C compiler - calls it, or to nothing if 'inline' is not supported under any name. */ -#ifndef __cplusplus -#define inline __inline__ -#endif - -/* A bogus type that allows us to have sentinel type values that are still - valid */ -#define ompi_fortran_bogus_type_t int - -/* C type corresponding to Fortran CHARACTER */ -#define ompi_fortran_character_t char - -/* C type corresponding to Fortran COMPLEX*16 */ -/* #undef ompi_fortran_complex16_t */ - -/* C type corresponding to Fortran COMPLEX*32 */ -/* #undef ompi_fortran_complex32_t */ - -/* C type corresponding to Fortran COMPLEX*4 */ -/* #undef ompi_fortran_complex4_t */ - -/* C type corresponding to Fortran COMPLEX*8 */ -/* #undef ompi_fortran_complex8_t */ - -/* C type corresponding to Fortran COMPLEX */ -/* #undef ompi_fortran_complex_t */ - -/* C type corresponding to Fortran DOUBLE COMPLEX */ -/* #undef ompi_fortran_double_complex_t */ - -/* C type corresponding to Fortran DOUBLE PRECISION */ -#define ompi_fortran_double_precision_t double - -/* C type corresponding to Fortran INTEGER*16 */ -#define ompi_fortran_integer16_t - -/* C type corresponding to Fortran INTEGER*1 */ -#define 
ompi_fortran_integer1_t char - -/* C type corresponding to Fortran INTEGER*2 */ -#define ompi_fortran_integer2_t short - -/* C type corresponding to Fortran INTEGER*4 */ -#define ompi_fortran_integer4_t int - -/* C type corresponding to Fortran INTEGER*8 */ -#define ompi_fortran_integer8_t long long - -/* C type corresponding to Fortran INTEGER */ -#define ompi_fortran_integer_t int - -/* C type corresponding to Fortran LOGICAL*1 */ -#define ompi_fortran_logical1_t char - -/* C type corresponding to Fortran LOGICAL*2 */ -#define ompi_fortran_logical2_t short - -/* C type corresponding to Fortran LOGICAL*4 */ -#define ompi_fortran_logical4_t int - -/* C type corresponding to Fortran LOGICAL*8 */ -#define ompi_fortran_logical8_t long long - -/* C type corresponding to Fortran LOGICAL */ -#define ompi_fortran_logical_t int - -/* C type corresponding to Fortran REAL*16 */ -#define ompi_fortran_real16_t ompi_fortran_bogus_type_t - -/* C type corresponding to Fortran REAL*2 */ -#define ompi_fortran_real2_t ompi_fortran_bogus_type_t - -/* C type corresponding to Fortran REAL*4 */ -#define ompi_fortran_real4_t float - -/* C type corresponding to Fortran REAL*8 */ -#define ompi_fortran_real8_t double - -/* C type corresponding to Fortran REAL */ -#define ompi_fortran_real_t float - -/* Define to the equivalent of the C99 'restrict' keyword, or to - nothing if this is not supported. Do not define if restrict is - supported directly. */ -#define restrict __restrict -/* Work around a bug in Sun C++: it does not support _Restrict or - __restrict__, even though the corresponding Sun C compiler ends up with - "#define restrict _Restrict" or "#define restrict __restrict__" in the - previous line. Perhaps some future version of Sun C++ will work with - restrict; if so, hopefully it defines __RESTRICT like Sun C does. 
*/ -#if defined __SUNPRO_CC && !defined __RESTRICT -# define _Restrict -# define __restrict__ -#endif - - -//#include "opal_config_bottom.h" -#endif /* OPAL_CONFIG_H */ - From 9c63b098be8c9aa70426a71afa96fadacb213b8b Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 18 Jun 2015 19:48:40 -0400 Subject: [PATCH 010/190] Add the capability to install the generated library and other minor cleanups. --- opal/datatype/cuda/Makefile.in | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/opal/datatype/cuda/Makefile.in b/opal/datatype/cuda/Makefile.in index 519de6100ae..f00ca4e030c 100644 --- a/opal/datatype/cuda/Makefile.in +++ b/opal/datatype/cuda/Makefile.in @@ -3,19 +3,20 @@ AM_CPPFLAGS = @common_cuda_CPPFLAGS@ srcdir = @srcdir@ top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ VPATH = @srcdir@ -NVCC = nvcc -ARCH = ar -ARCHFLAGS = cr -STLIB ?= opal_datatype_cuda.a -DYLIB ?= opal_datatype_cuda.so -EXTLIB = -L$(top_builddir)/opal/datatype/.libs -ldatatype -L$(top_builddir)/opal/.libs -lopen-pal -L/usr/local/cuda/lib -lcuda -subdir = opal/datatype/cuda +NVCC = nvcc +ARCH = @AR@ +ARCHFLAGS = cr +STLIB ?= opal_datatype_cuda.a +DYLIB ?= opal_datatype_cuda.so +EXTLIB = -L$(top_builddir)/opal/datatype/.libs -ldatatype -L$(top_builddir)/opal/.libs -lopen-pal -L/usr/local/cuda/lib -lcuda +subdir = opal/datatype/cuda CC = nvcc -CFLAGS = -gencode arch=compute_35,code=sm_35 --compiler-options '-fPIC @CFLAGS@' -LDFLAGS += -shared --compiler-options '-fPIC @LDFLAGS@' +CFLAGS = -I$(top_builddir)/opal/include -I$(top_srcdir)/opal/include -gencode arch=compute_35,code=sm_35 --compiler-options '-fPIC @CFLAGS@' +LDFLAGS = -shared --compiler-options '-fPIC @LDFLAGS@' SRC := \ opal_datatype_cuda.cu \ @@ -42,16 +43,18 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status $(STLIB): $(OBJ) $(ARCH) $(ARCHFLAGS) $@ $(OBJ) @RANLIB@ $@ - + $(DYLIB): $(OBJ) $(NVCC) $(LDFLAGS) $(EXTLIB) -o $(DYLIB) $(OBJ) - + %.o: %.cu $(NVCC) 
$(CFLAGS) $(EXTLIB) $(INC) -c $< -o $@ +install: $(DYLIB) + cp -f $(DYLIB) @OMPI_WRAPPER_LIBDIR@/ + clean: - rm -f *.o + rm -f $(OBJ) cleanall: clean - rm -f $(STLIB) - rm -f $(DYLIB) + rm -f $(STLIB) $(DYLIB) From a681551610587a1fed735f2fee5de708d734c8c7 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 18 Jun 2015 19:49:11 -0400 Subject: [PATCH 011/190] Open the datatype CUDA library from a default install location. Various other minor cleanups. --- opal/datatype/opal_datatype_gpu.c | 190 ++++++++++-------------------- 1 file changed, 61 insertions(+), 129 deletions(-) diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c index c136a55ea71..ef7a8f41d27 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -24,6 +24,7 @@ #include #include +#include "opal/mca/installdirs/installdirs.h" #include "opal/datatype/opal_convertor_internal.h" #include "opal/datatype/opal_datatype_internal.h" @@ -37,54 +38,55 @@ #include "opal/datatype/opal_datatype_gpu.h" -static void *opal_datatype_cuda_handle = NULL; +static void *opal_datatype_cuda_handle = NULL; +static char *opal_datatype_cuda_lib = NULL; void (*opal_datatype_cuda_init_p)(void) = NULL; void (*opal_datatype_cuda_fini_p)(void) = NULL; int32_t (*opal_generic_simple_pack_function_cuda_p)( opal_convertor_t* pConvertor, - struct iovec* iov, + struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; int32_t (*opal_generic_simple_unpack_function_cuda_p)( opal_convertor_t* pConvertor, - struct iovec* iov, + struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; - + int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, - struct iovec* iov, + struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; - + int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, - struct iovec* iov, + struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; - + int32_t (*opal_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, - struct iovec* iov, + struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; int32_t (*opal_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, - struct iovec* iov, + struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; - + void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, size_t* SPACE ) = NULL; - + void (*unpack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, size_t* SPACE ) = NULL; - + void (*pack_predefined_data_cuda_p)( 
dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, @@ -99,126 +101,50 @@ void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id) = NULL; void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id) = NULL; +#define OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN(handle, fname) \ + do { \ + char* _error; \ + *(void **)(&(fname ## _p)) = dlsym((handle), # fname); \ + if(NULL != (_error = dlerror()) ) { \ + opal_output(0, "Finding %s error: %s\n", # fname, _error); \ + fname ## _p = NULL; \ + return OPAL_ERROR; \ + } \ + } while (0) + int32_t opal_datatype_gpu_init(void) { - char *error; - char *lib = "/home/wwu12/ompi/ompi-gpu/opal/datatype/cuda/opal_datatype_cuda.so"; - if (opal_datatype_cuda_handle == NULL) { - opal_datatype_cuda_handle = dlopen(lib, RTLD_LAZY); + + /* If the library name was initialized but the load failed, we have another chance to change it */ + if( NULL != opal_datatype_cuda_lib ) + free(opal_datatype_cuda_lib); + asprintf(&opal_datatype_cuda_lib, "%s/%s", opal_install_dirs.libdir, "opal_datatype_cuda.so"); + + opal_datatype_cuda_handle = dlopen(opal_datatype_cuda_lib , RTLD_LAZY); if (!opal_datatype_cuda_handle) { - fprintf(stderr, "%s\n", dlerror()); + opal_output( 0, "Failed to load %s library: error %s\n", opal_datatype_cuda_lib, dlerror()); opal_datatype_cuda_handle = NULL; return OPAL_ERROR; } - - *(void **)(&opal_datatype_cuda_init_p) = dlsym(opal_datatype_cuda_handle, "opal_datatype_cuda_init"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_datatype_cuda_init error: %s\n", error); - opal_datatype_cuda_init_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_datatype_cuda_fini_p) = dlsym(opal_datatype_cuda_handle, "opal_datatype_cuda_fini"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_datatype_cuda_fini error: %s\n", error); - opal_datatype_cuda_fini_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_generic_simple_pack_function_cuda_p) = dlsym(opal_datatype_cuda_handle, 
"opal_generic_simple_pack_function_cuda"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_generic_simple_pack_function_cuda error: %s\n", error); - opal_generic_simple_pack_function_cuda_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_generic_simple_unpack_function_cuda_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_unpack_function_cuda"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_generic_simple_unpack_function_cuda error: %s\n", error); - opal_generic_simple_unpack_function_cuda_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_generic_simple_pack_function_cuda_iov_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_pack_function_cuda_iov"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_generic_simple_pack_function_cuda_iov error: %s\n", error); - opal_generic_simple_pack_function_cuda_iov_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_generic_simple_unpack_function_cuda_iov_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_unpack_function_cuda_iov"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_generic_simple_unpack_function_cuda_iov error: %s\n", error); - opal_generic_simple_unpack_function_cuda_iov_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_generic_simple_pack_function_cuda_vector_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_pack_function_cuda_vector"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_generic_simple_pack_function_cuda_vector error: %s\n", error); - opal_generic_simple_pack_function_cuda_vector_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_generic_simple_unpack_function_cuda_vector_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_unpack_function_cuda_vector"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_generic_simple_unpack_function_cuda_vector error: %s\n", error); - opal_generic_simple_unpack_function_cuda_vector_p = NULL; - return OPAL_ERROR; - } - - 
*(void **)(&pack_contiguous_loop_cuda_p) = dlsym(opal_datatype_cuda_handle, "pack_contiguous_loop_cuda"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "pack_contiguous_loop_cuda error: %s\n", error); - pack_contiguous_loop_cuda_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&unpack_contiguous_loop_cuda_p) = dlsym(opal_datatype_cuda_handle, "unpack_contiguous_loop_cuda"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "unpack_contiguous_loop_cuda error: %s\n", error); - unpack_contiguous_loop_cuda_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&pack_predefined_data_cuda_p) = dlsym(opal_datatype_cuda_handle, "pack_predefined_data_cuda"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "pack_predefined_data_cuda error: %s\n", error); - pack_predefined_data_cuda_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_cuda_sync_device_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_sync_device"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_cuda_sync_device error: %s\n", error); - opal_cuda_sync_device_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_cuda_get_gpu_pack_buffer_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_get_gpu_pack_buffer"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_cuda_get_gpu_pack_buffer error: %s\n", error); - opal_cuda_get_gpu_pack_buffer_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_cuda_free_gpu_buffer_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_free_gpu_buffer"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_cuda_free_gpu_buffer error: %s\n", error); - opal_cuda_free_gpu_buffer_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_cuda_malloc_gpu_buffer_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_malloc_gpu_buffer"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_cuda_malloc_gpu_buffer error: %s\n", error); - opal_cuda_malloc_gpu_buffer_p = NULL; - return OPAL_ERROR; - } - + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( 
opal_datatype_cuda_handle, opal_datatype_cuda_init ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_datatype_cuda_fini ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_unpack_function_cuda ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda_iov ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_unpack_function_cuda_iov ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda_vector ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_unpack_function_cuda_vector ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, pack_contiguous_loop_cuda ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, unpack_contiguous_loop_cuda ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, pack_predefined_data_cuda ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_sync_device ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_get_gpu_pack_buffer ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_free_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_malloc_gpu_buffer ); + (*opal_datatype_cuda_init_p)(); - printf("cuda init done\n"); + printf("cuda init done\n"); } return OPAL_SUCCESS; } @@ -227,8 +153,7 @@ int32_t opal_datatype_gpu_fini(void) { if (opal_datatype_cuda_handle != NULL) { (*opal_datatype_cuda_fini_p)(); - dlclose(opal_datatype_cuda_handle); - opal_datatype_cuda_handle = NULL; + /* Reset all functions to NULL */ opal_datatype_cuda_init_p = NULL; 
opal_datatype_cuda_fini_p = NULL; opal_generic_simple_pack_function_cuda_p = NULL; @@ -244,6 +169,13 @@ int32_t opal_datatype_gpu_fini(void) opal_cuda_get_gpu_pack_buffer_p = NULL; opal_cuda_free_gpu_buffer_p = NULL; opal_cuda_malloc_gpu_buffer_p = NULL; + + dlclose(opal_datatype_cuda_handle); + opal_datatype_cuda_handle = NULL; + + if( NULL != opal_datatype_cuda_lib ) + free(opal_datatype_cuda_lib); + opal_datatype_cuda_lib = NULL; printf("cuda fini done\n"); } return OPAL_SUCCESS; @@ -261,4 +193,4 @@ unsigned char* opal_datatype_get_gpu_buffer(void) return NULL; #endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ -} \ No newline at end of file +} From bcd77f675fc2152cd4402359a608d60418daea6b Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Tue, 30 Jun 2015 17:28:34 -0400 Subject: [PATCH 012/190] Add a patch from Rolf fixing 2 issues: 1. free code did not work right because we were computing the amount we freed after merging the list 2. we need to store original malloc GPU buffer in extra place because the one in the convertor gets changed over time Conflicts: opal/datatype/cuda/opal_datatype_cuda.cu opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu --- opal/datatype/cuda/Makefile.in | 2 +- opal/datatype/cuda/opal_datatype_cuda.cu | 2 ++ opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu | 2 +- opal/mca/btl/smcuda/btl_smcuda.c | 2 ++ opal/mca/btl/smcuda/btl_smcuda.h | 1 + opal/mca/btl/smcuda/btl_smcuda_component.c | 2 +- opal/mca/common/cuda/common_cuda.c | 1 + 7 files changed, 9 insertions(+), 3 deletions(-) diff --git a/opal/datatype/cuda/Makefile.in b/opal/datatype/cuda/Makefile.in index f00ca4e030c..ded04f1ed3c 100644 --- a/opal/datatype/cuda/Makefile.in +++ b/opal/datatype/cuda/Makefile.in @@ -15,7 +15,7 @@ EXTLIB = -L$(top_builddir)/opal/datatype/.libs -ldatatype -L$(top_builddir)/ subdir = opal/datatype/cuda CC = nvcc -CFLAGS = -I$(top_builddir)/opal/include -I$(top_srcdir)/opal/include -gencode arch=compute_35,code=sm_35 --compiler-options '-fPIC 
@CFLAGS@' +CFLAGS = -I$(top_builddir)/opal/include -I$(top_srcdir)/opal/include -I$(top_builddir) -I$(top_srcdir) -gencode arch=compute_35,code=sm_35 --compiler-options '-fPIC @CFLAGS@' LDFLAGS = -shared --compiler-options '-fPIC @LDFLAGS@' SRC := \ diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 8451b143487..b81e5196a8f 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -434,6 +434,8 @@ void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) if (ptr == NULL) { DT_CUDA_DEBUG( opal_cuda_output( 0, "addr %p is not managed.\n", addr); ); } + cuda_list_item_merge_by_addr(&device->buffer_free, ptr); + device->buffer_free_size += ptr->size; DT_CUDA_DEBUG( opal_cuda_output( 0, "Free GPU buffer %p.\n", addr); ); } diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index b55c59a5c1e..87184277d9a 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -531,7 +531,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(2, "description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + DT_CUDA_DEBUG ( opal_cuda_output(-1, "description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h_current[current_block].src[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; cuda_iov_dist_h_current[current_block].dst[task_iteration] = 
destination; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 4814b6c996a..6041a8b64e8 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1338,6 +1338,7 @@ void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, int lindex) { endpoint->smcuda_dt_pack_clone[lindex].convertor = convertor; + endpoint->smcuda_dt_pack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; endpoint->smcuda_dt_pack_clone[lindex].endpoint = endpoint; endpoint->smcuda_dt_pack_clone[lindex].local_address = local_address; endpoint->smcuda_dt_pack_clone[lindex].local_handle = local_handle; @@ -1359,6 +1360,7 @@ void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, int lindex) { endpoint->smcuda_dt_unpack_clone[lindex].convertor = convertor; + endpoint->smcuda_dt_unpack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; endpoint->smcuda_dt_unpack_clone[lindex].endpoint = endpoint; endpoint->smcuda_dt_unpack_clone[lindex].local_address = local_address; endpoint->smcuda_dt_unpack_clone[lindex].local_handle = local_handle; diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 00765f0a276..c43fbe0b190 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -517,6 +517,7 @@ typedef struct { /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; + void *gpu_ptr; struct mca_btl_base_endpoint_t *endpoint; void *local_address; struct mca_btl_base_registration_handle_t *local_handle; diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index f035578bd5d..4633134bac5 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -907,7 +907,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, if (seq == -1) { mca_btl_smcuda_send_cuda_unpack_sig(btl, 
my_cuda_dt_clone->endpoint, lindex, -2); - opal_cuda_free_gpu_buffer_p(my_cuda_dt_clone->convertor->gpu_buffer_ptr, 0); + opal_cuda_free_gpu_buffer_p(my_cuda_dt_clone->gpu_ptr, 0); mca_btl_smcuda_free_cuda_dt_pack_clone(my_cuda_dt_clone->endpoint, lindex); } } diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index d37f6656d2c..f59d4365006 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -33,6 +33,7 @@ #include "opal/align.h" #include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_datatype_cuda.h" +#include "opal/datatype/opal_datatype_gpu.h" #include "opal/util/output.h" #include "opal/util/show_help.h" #include "opal/util/proc.h" From bdfe31b87e183394dbf9ccc5ebc97335e79e369f Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 19 Aug 2015 17:20:39 -0400 Subject: [PATCH 013/190] clean up code in pack and unpack Conflicts: ompi/mca/pml/ob1/pml_ob1_cuda.c opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 128 ++++++++++++------ opal/datatype/cuda/opal_datatype_cuda.cu | 10 +- .../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 5 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 38 ++---- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 3 + .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 29 ++-- opal/mca/btl/smcuda/btl_smcuda.c | 42 +++++- opal/mca/btl/smcuda/btl_smcuda.h | 5 + opal/mca/btl/smcuda/btl_smcuda_component.c | 9 +- opal/mca/common/cuda/common_cuda.h | 1 + test/datatype/Makefile.am | 2 +- test/datatype/ddt_test.c | 2 +- 13 files changed, 192 insertions(+), 86 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 2575228d019..f4d4907c336 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -106,53 +106,103 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, 
sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; mca_bml_base_btl_t* bml_endpoint_btl = mca_bml_base_btl_array_get_index(&(sendreq->req_endpoint->btl_send), 0); if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && CUDA_DDT_WITH_RDMA) { - printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + + int seq = 0; + int rc_dt = 0; + int rc_sig = 0; unsigned char *base; + struct iovec iov; + size_t pipeline_size = 0; + uint32_t iov_count = 1; + size_t max_data = 0; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); - base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); - convertor->gpu_buffer_ptr = base; - sendreq->req_send.req_bytes_packed = convertor->local_size; - printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); - if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( - sendreq->req_endpoint, - base, - sendreq->req_send.req_bytes_packed, - sendreq->req_rdma))) { + int lindex = mca_btl_smcuda_check_cuda_dt_pack_clone_exist(bml_btl->btl_endpoint, convertor); + if (lindex == -1) { + /* this is the first time for this convertor */ + printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); + convertor->gpu_buffer_ptr = base; + sendreq->req_send.req_bytes_packed = convertor->local_size; + printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); + if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( + sendreq->req_endpoint, + base, + sendreq->req_send.req_bytes_packed, + sendreq->req_rdma))) { - size_t pipeline_size = convertor->local_size; - struct iovec iov; - int rc_dt = 0; - uint32_t iov_count = 1; - iov.iov_base = base; - iov.iov_len = pipeline_size; - size_t max_data = 0; - int seq = 0; - /* the first pack here is used to get the correct size of pipeline_size */ - 
/* because pack may not use the whole pipeline size */ - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - pipeline_size = max_data; - int lindex = mca_btl_smcuda_alloc_cuda_dt_pack_clone(bml_btl->btl_endpoint); - assert(lindex >= 0); - mca_pml_ob1_rdma_cuda_btl_register_events(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, pipeline_size, lindex); - mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, pipeline_size, lindex); + pipeline_size = 1024*1024; + iov.iov_base = base; + iov.iov_len = pipeline_size; + max_data = 0; + /* the first pack here is used to get the correct size of pipeline_size */ + /* because pack may not use the whole pipeline size */ + rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + pipeline_size = max_data; + lindex = mca_btl_smcuda_alloc_cuda_dt_pack_clone(bml_btl->btl_endpoint); + assert(lindex >= 0); + mca_pml_ob1_rdma_cuda_btl_register_events(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, pipeline_size, lindex); + mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, pipeline_size, lindex); - rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, - sendreq->req_send.req_bytes_packed); + rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, + sendreq->req_send.req_bytes_packed); - mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); - while (rc_dt != 1) { - iov.iov_base += pipeline_size; - seq ++; - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); + rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); + if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { + mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); + return rc_sig; + } + while (rc_dt != 1) { + iov.iov_base += pipeline_size; + seq ++; + 
rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); + if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { + mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); + return rc_sig; + } + } + rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, -1); + if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { + mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, -1); + return rc_sig; + } + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_pml_ob1_free_rdma_resources(sendreq); + } + } else { + rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); } - mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, -1); - if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - mca_pml_ob1_free_rdma_resources(sendreq); + } else { /* RMDA has been started before, but no resource (frag) last time, so back to re-schedule */ + seq = mca_btl_smcuda_get_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex); + pipeline_size = mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(bml_btl->btl_endpoint, lindex); + printf("*****************I resent seq %d, pipeline %lu\n", seq, pipeline_size); + rc_dt = 0; + rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); + if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { + mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); + return rc_sig; + } + if (seq != -1) { + + while (rc_dt != 1) { + seq ++; + iov.iov_base = convertor->gpu_buffer_ptr + pipeline_size * seq; + iov.iov_len = pipeline_size; + rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &pipeline_size ); + rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); + if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { + mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); + return rc_sig; + } + } + rc_sig = 
mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, -1); + if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { + mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, -1); + return rc_sig; + } } - } else { - rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); } + } else { rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); } diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index b81e5196a8f..b6ed096b7d9 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -1,3 +1,6 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + #include "opal_datatype_cuda_internal.cuh" #include "opal_datatype_cuda.cuh" #include @@ -10,6 +13,7 @@ * NOTE: The order of this array *MUST* match what is listed in datatype.h * (use of designated initializers should relax this restrictions some) */ +/* OPAL_DECLSPEC const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PREDEFINED] = { OPAL_DATATYPE_LOOP_SIZE, OPAL_DATATYPE_END_LOOP_SIZE, @@ -19,12 +23,12 @@ OPAL_DECLSPEC const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PR OPAL_DATATYPE_INT2_SIZE, OPAL_DATATYPE_INT4_SIZE, OPAL_DATATYPE_INT8_SIZE, - OPAL_DATATYPE_INT16_SIZE, /* Yes, double-machine word integers are available */ + OPAL_DATATYPE_INT16_SIZE, OPAL_DATATYPE_UINT1_SIZE, OPAL_DATATYPE_UINT2_SIZE, OPAL_DATATYPE_UINT4_SIZE, OPAL_DATATYPE_UINT8_SIZE, - OPAL_DATATYPE_UINT16_SIZE, /* Yes, double-machine word integers are available */ + OPAL_DATATYPE_UINT16_SIZE, OPAL_DATATYPE_FLOAT2_SIZE, OPAL_DATATYPE_FLOAT4_SIZE, OPAL_DATATYPE_FLOAT8_SIZE, @@ -37,7 +41,7 @@ OPAL_DECLSPEC const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PR OPAL_DATATYPE_WCHAR_SIZE, OPAL_DATATYPE_UNAVAILABLE_SIZE, }; - +*/ /***** my variables ********/ diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh 
b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index e9359209c01..50e7cb18a68 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -5,7 +5,7 @@ #include #include -#include "opal_datatype_orig_internal.h" +//#include "opal_datatype_orig_internal.h" /* OPAL_CUDA */ @@ -13,7 +13,7 @@ #define OPAL_DATATYPE_CUDA_DEBUG //#define OPAL_DATATYPE_CUDA_KERNEL_TIME #define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 0 -#define OPAL_DATATYPE_CUDA_TIMING +//#define OPAL_DATATYPE_CUDA_TIMING #define IOV_ARRAY_SIZE 1 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 96bdc12d961..bb2cb63048e 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -1,4 +1,7 @@ - #include "opal_datatype_cuda_internal.cuh" +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + +#include "opal_datatype_cuda_internal.cuh" #include #include diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 87184277d9a..6c10f17d398 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -1,3 +1,6 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + #include "opal_datatype_cuda_internal.cuh" #include "opal_datatype_cuda.cuh" @@ -412,7 +415,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor GET_TIME(start_total); #endif - DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype packing using iovec\n"); ); + DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype PACKING using iovec\n"); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -422,11 +425,11 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor description 
= pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); - printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypesSize[pElem->elem.common.type]); +// printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); -// assert(opal_datatype_basicDatatypesSize[pElem->elem.common.type] != 0); +// assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); - printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); if ((iov[0].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[0].iov_base)) { if (iov[0].iov_len == 0) { buffer_size = DT_CUDA_BUFFER_SIZE; @@ -468,7 +471,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "PACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -481,17 +484,6 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor nb_blocks = 256; while (cuda_iov_count > 0) { - // void* temp_addr; - // size_t temp_size; - // for (i = 1; i < cuda_iov_count/2; i+=2) { - // temp_addr = cuda_iov[i].iov_base; - // temp_size = cuda_iov[i].iov_len; - // cuda_iov[i].iov_base = cuda_iov[cuda_iov_count-i].iov_base; - // cuda_iov[i].iov_len = cuda_iov[cuda_iov_count-i].iov_len; - // cuda_iov[cuda_iov_count-i].iov_base = 
temp_addr; - // cuda_iov[cuda_iov_count-i].iov_len = temp_size; - // // printf("swap %d, %d, len %d %d\n", i, cuda_iov_count-i, cuda_iov[i].iov_len, cuda_iov[cuda_iov_count-i].iov_len); - // } current_block = 0; task_iteration = 0; @@ -510,7 +502,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { - orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; length_per_iovec = buffer_size / orig_alignment * orig_alignment; buffer_isfull = 1; } @@ -531,7 +523,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(-1, "description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + DT_CUDA_DEBUG ( opal_cuda_output(10, "PACKING description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h_current[current_block].src[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; cuda_iov_dist_h_current[current_block].dst[task_iteration] = destination; @@ -543,7 +535,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = count_desc - j*thread_per_block; //(thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); } destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; - DT_CUDA_DEBUG( 
opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; if (current_block >= nb_blocks) { current_block = 0; @@ -554,14 +546,14 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor /* handle residue */ if (residue_desc != 0) { - orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; cuda_iov_dist_h_current[current_block].src[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; cuda_iov_dist_h_current[current_block].dst[task_iteration] = destination; cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], 
cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; if (current_block >= nb_blocks) { current_block = 0; @@ -598,7 +590,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #endif convertor_flags = pConvertor->flags; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "PACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -630,7 +622,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor iov[0].iov_len = total_packed; *max_data = total_packed; *out_size = 1; - DT_CUDA_DEBUG ( opal_cuda_output(0, "total packed %d\n", total_packed); ); + DT_CUDA_DEBUG ( opal_cuda_output(0, "PACKING total packed %d\n", total_packed); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 35a4ff73078..bbc18989e6e 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu 
@@ -1,3 +1,6 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + #include "opal_datatype_cuda_internal.cuh" #include #include diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index fd4fec00a73..13531b93d3e 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -1,3 +1,6 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + #include "opal_datatype_cuda_internal.cuh" #include "opal_datatype_cuda.cuh" @@ -298,8 +301,6 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); - printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypesSize[pElem->elem.common.type]); - DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype UNpacking using iovec\n"); ); // double *vtmp = (double *)iov[0].iov_base; @@ -347,8 +348,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); - + DT_CUDA_DEBUG ( opal_cuda_output(2, "UNPACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -377,7 +378,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* 
pConvert if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { - orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; length_per_iovec = buffer_size / orig_alignment * orig_alignment; buffer_isfull = 1; } @@ -398,7 +399,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(2, "description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + DT_CUDA_DEBUG ( opal_cuda_output(10, "UNPACKING description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h_current[current_block].dst[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; cuda_iov_dist_h_current[current_block].src[task_iteration] = source; @@ -410,7 +411,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); } source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; - DT_CUDA_DEBUG( opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "UNPACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", 
current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; if (current_block >= nb_blocks) { current_block = 0; @@ -421,14 +422,14 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert /* handle residue */ if (residue_desc != 0) { - orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; cuda_iov_dist_h_current[current_block].dst[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; cuda_iov_dist_h_current[current_block].src[task_iteration] = source; cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "UNPACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], 
cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; if (current_block >= nb_blocks) { current_block = 0; @@ -465,8 +466,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif convertor_flags = pConvertor->flags; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); -#if defined(OPAL_DATATYPE_CUDA_TIMING) + DT_CUDA_DEBUG ( opal_cuda_output(8, "UNPACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); @@ -478,9 +479,9 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert iov[0].iov_len = total_unpacked; *max_data = total_unpacked; *out_size = 1; - DT_CUDA_DEBUG ( opal_cuda_output(0, "total unpacked %d\n", total_unpacked); ); - -#if defined(OPAL_DATATYPE_CUDA_TIMING) + DT_CUDA_DEBUG ( opal_cuda_output(0, "UNPACKING total unpacked %d\n", total_unpacked); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); total_time = ELAPSED_TIME( start_total, end_total ); printf( "[Timing]: total unpacking in %ld microsec\n", total_time ); diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 6041a8b64e8..0c80a1d8b5b 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1114,7 +1114,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, offset = (size_t) ((intptr_t) remote_address - (intptr_t) reg_ptr->base.base); remote_memory_address = (unsigned char *)reg_ptr->base.alloc_base + offset; if (0 != 
offset) { - printf("!!!!!!offset %d, ra %p, base %p\n", offset, (void*)remote_address, (void*)reg_ptr->base.base); + printf("!!!!!!offset %lu, ra %p, base %p\n", offset, (void*)remote_address, (void*)reg_ptr->base.base); opal_output(-1, "OFFSET=%d", (int)offset); } @@ -1144,6 +1144,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, convertor->gpu_buffer_ptr = remote_memory_address; mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex); done = 0; + mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); } else { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, @@ -1259,6 +1260,7 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, /* allocate a fragment, giving up if we can't get one */ MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); if( OPAL_UNLIKELY(NULL == frag) ) { + printf("!!!!!!!!!! 
no frag \n"); return OPAL_ERR_OUT_OF_RESOURCE;; } @@ -1269,6 +1271,7 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); + printf("######## rank %d, send seq %d, endpoint %p\n", endpoint->my_smp_rank, seq, endpoint); return rc; } @@ -1295,6 +1298,41 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, return rc; } +int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor) +{ + int i; + for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { + if (endpoint->smcuda_dt_pack_clone[i].convertor == convertor) { + return i; + } + } + return -1; +} + +int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq) +{ + endpoint->smcuda_dt_pack_clone[lindex].seq = seq; + return 0; +} + +int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex) +{ + if (lindex >= SMCUDA_DT_CLONE_SIZE) { + return -9; + } else { + return endpoint->smcuda_dt_pack_clone[lindex].seq; + } +} + +int mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(struct mca_btl_base_endpoint_t *endpoint, int lindex) +{ + if (lindex >= SMCUDA_DT_CLONE_SIZE) { + return -9; + } else { + return endpoint->smcuda_dt_pack_clone[lindex].pipeline_size; + } +} + int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint) { int i; @@ -1347,6 +1385,7 @@ void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, endpoint->smcuda_dt_pack_clone[lindex].cbdata = cbdata; endpoint->smcuda_dt_pack_clone[lindex].pipeline_size = pipeline_size; endpoint->smcuda_dt_pack_clone[lindex].lindex = lindex; + endpoint->smcuda_dt_pack_clone[lindex].seq = -9; } void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, @@ 
-1369,6 +1408,7 @@ void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, endpoint->smcuda_dt_unpack_clone[lindex].cbdata = cbdata; endpoint->smcuda_dt_unpack_clone[lindex].pipeline_size = pipeline_size; endpoint->smcuda_dt_unpack_clone[lindex].lindex = lindex; + endpoint->smcuda_dt_unpack_clone[lindex].seq = -9; } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index c43fbe0b190..a1173502449 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -526,6 +526,7 @@ typedef struct { void *cbdata; size_t pipeline_size; int lindex; + int seq; } cuda_dt_clone_t; #define SMCUDA_DT_CLONE_SIZE 20 @@ -533,6 +534,10 @@ extern cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); +int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor); +int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq); +int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex); +int mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(struct mca_btl_base_endpoint_t *endpoint, int lindex); int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint); int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint); void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 4633134bac5..8a113ab5a01 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ 
b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -883,7 +883,7 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, iov.iov_len = my_cuda_dt_clone->pipeline_size; opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); } - + // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, @@ -910,6 +910,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, opal_cuda_free_gpu_buffer_p(my_cuda_dt_clone->gpu_ptr, 0); mca_btl_smcuda_free_cuda_dt_pack_clone(my_cuda_dt_clone->endpoint, lindex); } + // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } #endif /* OPAL_CUDA_SUPPORT */ @@ -1187,6 +1188,12 @@ int mca_btl_smcuda_component_progress(void) &frag->base, status?OPAL_ERROR:OPAL_SUCCESS); } if( btl_ownership ) { + if (frag->hdr->tag == MCA_BTL_TAG_SMCUDA_DATATYPE_PACK) { + printf("&&&&&&&&&&&&&&&&&&got PACK TAG\n"); + } + if (frag->hdr->tag == MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK) { + printf("&&&&&&&&&&&&&&&&&&got UNPACK TAG\n"); + } MCA_BTL_SMCUDA_FRAG_RETURN(frag); } OPAL_THREAD_ADD32(&mca_btl_smcuda_component.num_outstanding_frags, -1); diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index 0b5a724d9dc..20290dff7d8 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -41,6 +41,7 @@ struct mca_mpool_common_cuda_reg_data_t { // uint64_t pipeline_evtHandle[MAX_IPC_EVENT_HANDLE*EVTHANDLE_SIZE]; size_t pipeline_size; uint32_t lindex; + uint8_t pack_required; }; typedef struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index b9232a59893..b32f3a64713 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -29,7 +29,7 @@ unpack_ooo_LDADD = \ ddt_test_SOURCES = ddt_test.c ddt_lib.c ddt_lib.h ddt_test_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) -ddt_test_CFLAGS = -I/mnt/sw/cuda/include -g +ddt_test_CFLAGS = -I/mnt/sw/cuda/include -g -O0 
ddt_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/mnt/sw/cuda/lib64 -lcudart #ddt_test_old_SOURCES = ddt_test_old.c ddt_lib.c ddt_lib.h diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index 98aa6f1347a..459566eaa09 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -830,7 +830,7 @@ int main( int argc, char* argv[] ) if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { // local_copy_ddt_count(pdt, 1); - // local_copy_with_convertor(pdt, 1, 1024*1024*10, 4000); + local_copy_with_convertor(pdt, 1, 1024*1024*10, 4000); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From a670db4c9c38e1300a20d594ba807cc83925d0dd Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 21 Aug 2015 22:20:54 -0400 Subject: [PATCH 014/190] big changes, now pack is driven by receiver by active message --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 126 +++++---------------- opal/mca/btl/smcuda/btl_smcuda.c | 12 +- opal/mca/btl/smcuda/btl_smcuda.h | 5 +- opal/mca/btl/smcuda/btl_smcuda_component.c | 33 +++++- 4 files changed, 69 insertions(+), 107 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index f4d4907c336..338cff3ebc8 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -49,10 +49,10 @@ size_t mca_pml_ob1_rdma_cuda_btls( size_t size, mca_pml_ob1_com_btl_t* rdma_btls); -int mca_pml_ob1_rdma_cuda_btl_register_events( +int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - struct opal_convertor_t* convertor, size_t pipeline_size, int lindex); + size_t pipeline_size, int lindex, uint8_t pack_required); int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -106,102 +106,34 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; mca_bml_base_btl_t* 
bml_endpoint_btl = mca_bml_base_btl_array_get_index(&(sendreq->req_endpoint->btl_send), 0); if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && CUDA_DDT_WITH_RDMA) { - - int seq = 0; - int rc_dt = 0; - int rc_sig = 0; + printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); unsigned char *base; - struct iovec iov; - size_t pipeline_size = 0; - uint32_t iov_count = 1; - size_t max_data = 0; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); - int lindex = mca_btl_smcuda_check_cuda_dt_pack_clone_exist(bml_btl->btl_endpoint, convertor); - if (lindex == -1) { - /* this is the first time for this convertor */ - printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); - base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); - convertor->gpu_buffer_ptr = base; - sendreq->req_send.req_bytes_packed = convertor->local_size; - printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); - if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( - sendreq->req_endpoint, - base, - sendreq->req_send.req_bytes_packed, - sendreq->req_rdma))) { - - pipeline_size = 1024*1024; - iov.iov_base = base; - iov.iov_len = pipeline_size; - max_data = 0; - /* the first pack here is used to get the correct size of pipeline_size */ - /* because pack may not use the whole pipeline size */ - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - pipeline_size = max_data; - lindex = mca_btl_smcuda_alloc_cuda_dt_pack_clone(bml_btl->btl_endpoint); - assert(lindex >= 0); - mca_pml_ob1_rdma_cuda_btl_register_events(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, pipeline_size, lindex); - mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, pipeline_size, lindex); - - rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, - sendreq->req_send.req_bytes_packed); - - rc_sig = 
mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); - if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { - mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); - return rc_sig; - } - while (rc_dt != 1) { - iov.iov_base += pipeline_size; - seq ++; - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); - if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { - mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); - return rc_sig; - } - } - rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, -1); - if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { - mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, -1); - return rc_sig; - } - if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - mca_pml_ob1_free_rdma_resources(sendreq); - } - } else { - rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); - } - } else { /* RMDA has been started before, but no resource (frag) last time, so back to re-schedule */ - seq = mca_btl_smcuda_get_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex); - pipeline_size = mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(bml_btl->btl_endpoint, lindex); - printf("*****************I resent seq %d, pipeline %lu\n", seq, pipeline_size); - rc_dt = 0; - rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); - if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { - mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); - return rc_sig; - } - if (seq != -1) { - - while (rc_dt != 1) { - seq ++; - iov.iov_base = convertor->gpu_buffer_ptr + pipeline_size * seq; - iov.iov_len = pipeline_size; - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &pipeline_size ); - rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); - if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { - 
mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); - return rc_sig; - } - } - rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, -1); - if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { - mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, -1); - return rc_sig; - } + base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); + convertor->gpu_buffer_ptr = base; + sendreq->req_send.req_bytes_packed = convertor->local_size; + printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); + if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( + sendreq->req_endpoint, + base, + sendreq->req_send.req_bytes_packed, + sendreq->req_rdma))) { + + int lindex = mca_btl_smcuda_alloc_cuda_dt_pack_clone(bml_btl->btl_endpoint); + assert(lindex >= 0); + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, lindex, 1); + mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, 0, lindex); + + rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, + sendreq->req_send.req_bytes_packed); + + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_pml_ob1_free_rdma_resources(sendreq); } + } else { + rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); } + } else { rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); @@ -269,10 +201,10 @@ size_t mca_pml_ob1_rdma_cuda_btls( return num_btls_used; } -int mca_pml_ob1_rdma_cuda_btl_register_events( +int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - struct opal_convertor_t* convertor, size_t pipeline_size, int lindex) + size_t pipeline_size, int lindex, uint8_t pack_required) { uint32_t i, j; for (i = 0; i < num_btls_used; i++) { @@ -284,9 +216,9 @@ int mca_pml_ob1_rdma_cuda_btl_register_events( // mca_common_cuda_geteventhandle(&convertor->pipeline_event[j], j, 
(mca_mpool_base_registration_t *)cuda_reg); // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); // } - printf("i send pipeline %ld\n", pipeline_size); cuda_reg->data.pipeline_size = pipeline_size; cuda_reg->data.lindex = lindex; + cuda_reg->data.pack_required = pack_required; } return 0; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 0c80a1d8b5b..96ca945e0dc 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1140,9 +1140,11 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, struct opal_convertor_t *convertor = &(recvreq->req_recv.req_base.req_convertor); size_t pipeline_size = remote_handle->reg_data.pipeline_size; uint32_t lindex = remote_handle->reg_data.lindex; - printf("i receive pipeline %ld, lindex %d\n", pipeline_size, lindex); + uint8_t pack_required = remote_handle->reg_data.pack_required; + printf("i receive pipeline %ld, lindex %d, pack_required %d\n", pipeline_size, lindex, pack_required); convertor->gpu_buffer_ptr = remote_memory_address; mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex); + mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); done = 0; mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); } else { @@ -1251,7 +1253,8 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b } int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq) + struct mca_btl_base_endpoint_t* endpoint, + int lindex, int pipeline_size, int seq) { mca_btl_smcuda_frag_t* frag; int rc; @@ -1268,6 +1271,7 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; cuda_dt_hdr.seq = seq; cuda_dt_hdr.lindex = lindex; + cuda_dt_hdr.pipeline_size = pipeline_size; 
memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); @@ -1276,7 +1280,8 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, } int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq) + struct mca_btl_base_endpoint_t* endpoint, + int lindex, int pipeline_size, int seq) { mca_btl_smcuda_frag_t* frag; int rc; @@ -1292,6 +1297,7 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; cuda_dt_hdr.seq = seq; cuda_dt_hdr.lindex = lindex; + cuda_dt_hdr.pipeline_size = pipeline_size; memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_PACK); diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index a1173502449..a90ba5c0f19 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -512,6 +512,7 @@ enum ipcState { typedef struct { int seq; int lindex; + int pipeline_size; } cuda_dt_hdr_t; /* package save pack/unpack convertor and cbfunc */ @@ -532,8 +533,8 @@ typedef struct { #define SMCUDA_DT_CLONE_SIZE 20 extern cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; -int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); -int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); +int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int pipeline_size, int seq); +int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, 
struct mca_btl_base_endpoint_t* endpoint, int lindex, int pipeline_size, int seq); int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor); int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq); int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 8a113ab5a01..5fd845edf24 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -857,6 +857,7 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); int seq = cuda_dt_hdr.seq; int lindex = cuda_dt_hdr.lindex; + int pipeline_size = cuda_dt_hdr.pipeline_size; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_dt_clone_t *my_cuda_dt_clone; @@ -872,15 +873,15 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, cbfunc(btl, endpoint, my_cuda_dt_clone->local_address, my_cuda_dt_clone->local_handle, my_cuda_dt_clone->cbcontext, my_cuda_dt_clone->cbdata, OPAL_SUCCESS); mca_btl_smcuda_free_cuda_dt_unpack_clone(endpoint, lindex); } else if (seq == -1) { - mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, lindex, -1); + mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, lindex, pipeline_size, -1); } else { struct iovec iov; uint32_t iov_count = 1; size_t max_data; struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; - iov.iov_base = convertor->gpu_buffer_ptr + seq * my_cuda_dt_clone->pipeline_size; - max_data = my_cuda_dt_clone->pipeline_size; - iov.iov_len = my_cuda_dt_clone->pipeline_size; + iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; + max_data = pipeline_size; + iov.iov_len = pipeline_size; opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); } // 
MCA_BTL_SMCUDA_FRAG_RETURN(frag); @@ -906,9 +907,31 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); if (seq == -1) { - mca_btl_smcuda_send_cuda_unpack_sig(btl, my_cuda_dt_clone->endpoint, lindex, -2); + mca_btl_smcuda_send_cuda_unpack_sig(btl, my_cuda_dt_clone->endpoint, lindex, 0, -2); opal_cuda_free_gpu_buffer_p(my_cuda_dt_clone->gpu_ptr, 0); mca_btl_smcuda_free_cuda_dt_pack_clone(my_cuda_dt_clone->endpoint, lindex); + } else { + struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; + struct iovec iov; + int rc_dt = 0; + size_t pipeline_size = 1024*1024; + uint32_t iov_count = 1; + iov.iov_base = convertor->gpu_buffer_ptr; + iov.iov_len = pipeline_size; + size_t max_data = 0; + int seq = 0; + /* the first pack here is used to get the correct size of pipeline_size */ + /* because pack may not use the whole pipeline size */ + rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + pipeline_size = max_data; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, pipeline_size, seq); + while (rc_dt != 1) { + iov.iov_base += pipeline_size; + seq ++; + rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, pipeline_size, seq); + } + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, pipeline_size, -1); } // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } From 42ad920176fd6766ad3ad80e588d9e4d8154e717 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 31 Aug 2015 01:03:21 -0400 Subject: [PATCH 015/190] intel test working Conflicts: opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu opal/mca/btl/smcuda/btl_smcuda.c --- .../cuda/opal_datatype_cuda_internal.cuh | 2 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 14 +++++----- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 9 ++++--- 
.../cuda/opal_datatype_unpack_cuda_kernel.cu | 14 +++++----- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 6 +++-- opal/mca/btl/smcuda/btl_smcuda.c | 27 ++++++++++++++++++- opal/mca/btl/smcuda/btl_smcuda_component.c | 2 +- test/datatype/ddt_test.c | 6 ++--- 8 files changed, 55 insertions(+), 25 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 50e7cb18a68..3d8640bcbc2 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -13,7 +13,7 @@ #define OPAL_DATATYPE_CUDA_DEBUG //#define OPAL_DATATYPE_CUDA_KERNEL_TIME #define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 0 -//#define OPAL_DATATYPE_CUDA_TIMING +#define OPAL_DATATYPE_CUDA_TIMING #define IOV_ARRAY_SIZE 1 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index bb2cb63048e..42962316da3 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -536,15 +536,15 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, { uint32_t _i, tid, num_threads; uint32_t gap, nb_elements; - double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; + char *_source_tmp, *_destination_tmp, *_src_disp_tmp;; tid = threadIdx.x + blockIdx.x * blockDim.x; num_threads = gridDim.x * blockDim.x; - gap = (extent - size) / 8; - nb_elements = size / 8; - _src_disp_tmp = (double*)source; - _destination_tmp = (double*)destination; + gap = (extent - size) / 1; + nb_elements = size / 1; + _src_disp_tmp = (char*)source; + _destination_tmp = (char*)destination; _destination_tmp += tid; for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { @@ -623,9 +623,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c _destination_tmp = dst + threadIdx.x * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if 
(alignment == ALIGNMENT_DOUBLE) { - *((double *)_destination_tmp) = *((double *)_source_tmp); + *((long *)_destination_tmp) = *((long *)_source_tmp); } else if (alignment == ALIGNMENT_FLOAT) { - *((float *)_destination_tmp) = *((float *)_source_tmp); + *((int *)_destination_tmp) = *((int *)_source_tmp); } else { * _destination_tmp = *_source_tmp; } diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 6c10f17d398..608e56dcd67 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -465,7 +465,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor DT_CUDA_DEBUG ( opal_cuda_output(0, "Pack GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - cuda_iov_count = CUDA_NB_IOV; + cuda_iov_count = 1000;//CUDA_NB_IOV; total_packed = 0; total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; @@ -498,7 +498,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } for (i = 0; i < cuda_iov_count; i++) { - // pElem = &(description[pStack->index+i]); + // pElem = &(description[pStack->index+i]); if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { @@ -518,7 +518,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor alignment = ALIGNMENT_CHAR; } - alignment = ALIGNMENT_DOUBLE; + // alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; @@ -552,6 +552,9 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / 
alignment * alignment) / orig_alignment; + if (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] == 0) { + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = 1; + } destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index bbc18989e6e..6ff69eaba12 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -277,9 +277,9 @@ __global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* _destination_tmp = dst + threadIdx.x * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { - *((double *)_destination_tmp) = *((double *)_source_tmp); + *((long *)_destination_tmp) = *((long *)_source_tmp); } else if (alignment == ALIGNMENT_FLOAT) { - *((float *)_destination_tmp) = *((float *)_source_tmp); + *((int *)_destination_tmp) = *((int *)_source_tmp); } else { * _destination_tmp = *_source_tmp; } @@ -296,15 +296,15 @@ __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, { uint32_t _i, tid, num_threads; uint32_t gap, nb_elements; - double *_source_tmp, *_destination_tmp, *_dst_disp_tmp;; + char *_source_tmp, *_destination_tmp, *_dst_disp_tmp;; tid = threadIdx.x + blockIdx.x * blockDim.x; num_threads = gridDim.x * blockDim.x; - gap = (extent - size) / 8; - nb_elements = size / 8; - _dst_disp_tmp = (double*)destination; - 
_source_tmp = (double*)source; + gap = (extent - size) / 1; + nb_elements = size / 1; + _dst_disp_tmp = (char*)destination; + _source_tmp = (char*)source; _destination_tmp = _dst_disp_tmp + tid; _source_tmp += tid; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 13531b93d3e..24a0bfc034f 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -375,6 +375,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert } for (i = 0; i < cuda_iov_count; i++) { + // pElem = &(description[pStack->index+i]); if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { @@ -393,8 +394,6 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert } else { alignment = ALIGNMENT_CHAR; } - - alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; @@ -428,6 +427,9 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + if (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] == 0) { + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = 1; + } source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "UNPACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], 
cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 96ca945e0dc..221150d5ccc 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -400,6 +400,7 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, /* allocation will be for the fragment descriptor and payload buffer */ length = sizeof(mca_btl_smcuda_frag1_t); + printf("free list %d\n", mca_btl_smcuda_component.sm_free_list_num); length_payload = sizeof(mca_btl_smcuda_hdr_t) + mca_btl_smcuda_component.eager_limit; i = opal_free_list_init (&mca_btl_smcuda_component.sm_frags_eager, length, @@ -1044,6 +1045,28 @@ static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl, return OPAL_SUCCESS; } +int mca_btl_smcuda_notify_packing_done(void* send_value, int my_rank, int peer_rank) +{ + sm_fifo_t* fifo_send = &(mca_btl_smcuda_component.fifo[peer_rank][FIFO_MAP(my_rank)]); + if (fifo_send == NULL) { + return OPAL_ERROR; + } else { + // return sm_fifo_write(send_value, fifo_send); + int tail = fifo_send->tail; + int head = fifo_send->head; + if ((head + 1) & fifo_send->mask == tail) { + printf("fifo is full\n"); + return OPAL_ERR_OUT_OF_RESOURCE; + } else { + volatile void **q = (volatile void **) RELATIVE2VIRTUAL(fifo_send->queue); + tail = (tail - 1) & fifo_send->mask; + q[tail] = send_value; + printf("write to place %d tail %d head %d\n", tail, fifo_send->tail, fifo_send->head); + return OPAL_SUCCESS; + } + } +} + int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, @@ -1144,7 +1167,9 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, printf("i receive pipeline %ld, lindex %d, pack_required %d\n", 
pipeline_size, lindex, pack_required); convertor->gpu_buffer_ptr = remote_memory_address; mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex); - mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); + if (pack_required) { + mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); + } done = 0; mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); } else { diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 5fd845edf24..3962e12af7f 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -914,7 +914,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; struct iovec iov; int rc_dt = 0; - size_t pipeline_size = 1024*1024; + size_t pipeline_size = 1024*1024*200; uint32_t iov_count = 1; iov.iov_base = convertor->gpu_buffer_ptr; iov.iov_len = pipeline_size; diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index 459566eaa09..bb69643ee17 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -828,9 +828,9 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); pdt = upper_matrix(4000); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 1; i <= 1; i++) { + for (i = 1; i <= 4; i++) { // local_copy_ddt_count(pdt, 1); - local_copy_with_convertor(pdt, 1, 1024*1024*10, 4000); + local_copy_with_convertor(pdt, 1, 1024*1024*200, 4000); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -990,7 +990,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*2000, 4000, 256, 384 ); + // 
local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*2000, 4000, 256, 384 ); } } printf( ">>--------------------------------------------<<\n" ); From bab3559f78222561c3e5f6d48098acd603810c4f Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 31 Aug 2015 19:17:04 -0400 Subject: [PATCH 016/190] fix a bug when buffer is not big enough for whole ddt Conflicts: opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu --- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 18 ++++++++++----- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 23 ++++++++++++------- opal/mca/btl/smcuda/btl_smcuda_component.c | 2 +- test/datatype/ddt_test.c | 2 ++ 4 files changed, 30 insertions(+), 15 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 608e56dcd67..b5443ffb3b9 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -402,6 +402,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor dt_elem_desc_t* pElem; dt_stack_t* pStack; uint8_t alignment, orig_alignment; + int32_t orig_stack_index; ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; @@ -470,6 +471,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; + orig_stack_index = pStack->index; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); DT_CUDA_DEBUG ( opal_cuda_output(2, "PACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); @@ -498,7 +500,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } for (i = 0; i < cuda_iov_count; i++) { - // pElem = &(description[pStack->index+i]); + 
pElem = &(description[orig_stack_index+i]); if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { @@ -510,9 +512,9 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor total_packed += length_per_iovec; /* check alignment */ - if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)destination % ALIGNMENT_DOUBLE == 0) { + if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)destination % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_FLOAT == 0 && (uintptr_t)destination % ALIGNMENT_FLOAT == 0) { + } else if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_FLOAT == 0 && (uintptr_t)destination % ALIGNMENT_FLOAT == 0 && length_per_iovec >= ALIGNMENT_FLOAT) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -534,6 +536,9 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } else { cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = count_desc - j*thread_per_block; //(thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; @@ -552,9 +557,9 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* 
pConvertor cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; - if (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] == 0) { - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = 1; - } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; @@ -592,6 +597,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor GET_TIME(start); #endif convertor_flags = pConvertor->flags; + orig_stack_index = pStack->index; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); DT_CUDA_DEBUG ( opal_cuda_output(2, "PACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 24a0bfc034f..555d41f9517 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -285,7 +285,8 @@ int32_t 
opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert dt_elem_desc_t* pElem; dt_stack_t* pStack; uint8_t alignment, orig_alignment; - + int32_t orig_stack_index; + ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; @@ -347,6 +348,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; + orig_stack_index = pStack->index; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); DT_CUDA_DEBUG ( opal_cuda_output(2, "UNPACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); @@ -375,7 +377,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert } for (i = 0; i < cuda_iov_count; i++) { - // pElem = &(description[pStack->index+i]); + pElem = &(description[orig_stack_index+i]); if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { @@ -387,9 +389,9 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert total_unpacked += length_per_iovec; /* check alignment */ - if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)source % ALIGNMENT_DOUBLE == 0) { + if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)source % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_FLOAT == 0 && (uintptr_t)source % ALIGNMENT_FLOAT == 0) { + } else if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_FLOAT == 0 && (uintptr_t)source % ALIGNMENT_FLOAT == 0 && length_per_iovec >= ALIGNMENT_FLOAT) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -409,6 +411,9 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( 
opal_convertor_t* pConvert } else { cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "UNPACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; @@ -427,9 +432,9 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; - if (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] == 0) { - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = 1; - } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "UNPACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], 
cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; @@ -465,8 +470,10 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); -#endif convertor_flags = pConvertor->flags; +#endif + convertor_flags = pConvertor->flags; + orig_stack_index = pStack->index; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); DT_CUDA_DEBUG ( opal_cuda_output(8, "UNPACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 3962e12af7f..cd6c7ce071b 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -914,7 +914,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; struct iovec iov; int rc_dt = 0; - size_t pipeline_size = 1024*1024*200; + size_t pipeline_size = 1024*1024*20; uint32_t iov_count = 1; iov.iov_base = convertor->gpu_buffer_ptr; iov.iov_len = pipeline_size; diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index bb69643ee17..ae72785b86c 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -644,6 +644,8 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk dt_length = compute_buffer_length(pdt, count); printf("length %lu\n", dt_length); + cudaSetDevice(1); + #if defined (DDT_TEST_CUDA) cudaError_t error = cudaMalloc((void **)&psrc, dt_length); if ( error != cudaSuccess) { From 29c90a0a326d1b7ffbe27d1bee5119636c9247cd Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 2 Sep 2015 17:33:48 -0400 Subject: [PATCH 017/190] if data in different gpu, instead of copy direct from one to 
the other, we do a D2D copy Conflicts: opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu test/datatype/Makefile.am --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 15 ++++++-- opal/datatype/cuda/opal_datatype_cuda.cu | 8 +++- .../cuda/opal_datatype_pack_cuda_kernel.cu | 10 ++--- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 38 ++++++++++++------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 10 ++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 38 ++++++++++++------- opal/mca/btl/smcuda/btl_smcuda.c | 34 +++++++++++++---- opal/mca/btl/smcuda/btl_smcuda.h | 12 ++++-- opal/mca/btl/smcuda/btl_smcuda_component.c | 27 ++++++++++--- opal/mca/common/cuda/common_cuda.c | 13 +++++++ opal/mca/common/cuda/common_cuda.h | 2 + test/datatype/Makefile.am | 7 +++- 12 files changed, 156 insertions(+), 58 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 338cff3ebc8..05556c14a90 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -52,7 +52,7 @@ size_t mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - size_t pipeline_size, int lindex, uint8_t pack_required); + size_t pipeline_size, int lindex, uint8_t pack_required, uint8_t gpu_device); int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -109,6 +109,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); unsigned char *base; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); + int local_device = 0; base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); convertor->gpu_buffer_ptr = base; sendreq->req_send.req_bytes_packed = convertor->local_size; @@ -121,8 +122,13 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, int lindex = 
mca_btl_smcuda_alloc_cuda_dt_pack_clone(bml_btl->btl_endpoint); assert(lindex >= 0); - mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, lindex, 1); - mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, 0, lindex); + rc = mca_common_cuda_get_device(&local_device); + if (rc != 0) { + opal_output_verbose(0, "Failed to get the GPU device ID, rc=%d", rc); + return rc; + } + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, lindex, 1, local_device); + mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, NULL, 0, lindex, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); @@ -204,7 +210,7 @@ size_t mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - size_t pipeline_size, int lindex, uint8_t pack_required) + size_t pipeline_size, int lindex, uint8_t pack_required, uint8_t gpu_device) { uint32_t i, j; for (i = 0; i < num_btls_used; i++) { @@ -219,6 +225,7 @@ int mca_pml_ob1_rdma_cuda_btl_register_data( cuda_reg->data.pipeline_size = pipeline_size; cuda_reg->data.lindex = lindex; cuda_reg->data.pack_required = pack_required; + cuda_reg->data.gpu_device = gpu_device; } return 0; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index b6ed096b7d9..b94679358a0 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -213,6 +213,7 @@ void opal_datatype_cuda_init(void) opal_cuda_output(0, "Cannot retrieve the device being used. 
Drop CUDA support!\n"); return; } + printf("current device %d\n", device); cuda_free_list = init_cuda_free_list(); @@ -367,6 +368,9 @@ unsigned char* opal_cuda_get_gpu_pack_buffer() void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) { + int dev_id; + cudaGetDevice(&dev_id); + printf("malloc gpu buffer in dev %d\n", dev_id); ddt_cuda_device_t *device = &cuda_device[gpu_id]; if (device->buffer_free_size < size) { return NULL; @@ -402,7 +406,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) cuda_list_push_head(&device->buffer_used, p); device->buffer_used_size += size; device->buffer_free_size -= size; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc GPU buffer %p.\n", addr); ); + DT_CUDA_DEBUG( opal_cuda_output( 1, "Malloc GPU buffer %p.\n", addr); ); return addr; } } @@ -440,7 +444,7 @@ void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) } cuda_list_item_merge_by_addr(&device->buffer_free, ptr); device->buffer_free_size += ptr->size; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Free GPU buffer %p.\n", addr); ); + DT_CUDA_DEBUG( opal_cuda_output( 1, "Free GPU buffer %p.\n", addr); ); } void opal_dump_cuda_list(ddt_cuda_list_t *list) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 42962316da3..9bf130630f9 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -536,15 +536,15 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, { uint32_t _i, tid, num_threads; uint32_t gap, nb_elements; - char *_source_tmp, *_destination_tmp, *_src_disp_tmp;; + double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; tid = threadIdx.x + blockIdx.x * blockDim.x; num_threads = gridDim.x * blockDim.x; - gap = (extent - size) / 1; - nb_elements = size / 1; - _src_disp_tmp = (char*)source; - _destination_tmp = (char*)destination; + gap = (extent - size) / 8; + nb_elements = size / 8; + 
_src_disp_tmp = (double*)source; + _destination_tmp = (double*)destination; _destination_tmp += tid; for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index b5443ffb3b9..56a85e3709d 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -195,11 +195,10 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack( %p:%p, {%p, %lu}, %d )\n", + DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", (void*)pConvertor, (void*)pConvertor->pBaseBuf, - iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); + iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ); - printf("I am in simple pack vector, max_data %lu, iov_len %lu\n", *max_data, iov[0].iov_len); description = pConvertor->use_desc->desc; /* For the first step we have to add both displacement to the source. 
After in the @@ -214,7 +213,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert pConvertor->stack_pos--; pElem = &(description[pos_desc]); - DT_CUDA_DEBUG( opal_cuda_output( 0, "pack start pos_desc %d count_desc %d disp %ld\n" + DT_CUDA_DEBUG( opal_cuda_output( 1, "pack start pos_desc %d count_desc %d disp %ld\n" "stack_pos %d pos_desc %d count_desc %d disp %ld\n", pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); @@ -247,7 +246,6 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert free_required = 1; iov_ptr = pConvertor->gpu_buffer_ptr; } - printf("original local %d\n", iov_len_local); while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ @@ -260,7 +258,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - DT_CUDA_DEBUG( opal_cuda_output( 1, "pack end_loop count %d stack_pos %d" + DT_CUDA_DEBUG( opal_cuda_output( 2, "pack end_loop count %d stack_pos %d" " pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); @@ -286,7 +284,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - DT_CUDA_DEBUG( opal_cuda_output( 1, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); } @@ -314,7 +312,7 @@ int32_t 
opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert complete_loop: iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_packed += iov[iov_count].iov_len; - printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); + // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -324,7 +322,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: DtoH memcpy in %ld microsec\n", total_time ); + printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); #endif } *max_data = total_packed; @@ -332,7 +330,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert *out_size = iov_count; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - printf("total packed %lu\n", pConvertor->bConverted); + DT_CUDA_DEBUG( opal_cuda_output( 1, "total packed %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required) { opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; @@ -359,8 +357,13 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t _copy_loops = *(COUNT); uint32_t num_blocks, tasks_per_block; unsigned char* _destination = *(DESTINATION); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif - printf("I am in pack_contiguous_loop_cuda\n"); + DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); @@ -369,7 +372,10 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // _source = pBaseBuf_GPU; 
// _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; #endif - + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); @@ -382,6 +388,12 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif cudaDeviceSynchronize(); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: vector packing in %ld microsec\n", total_time ); +#endif } int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, @@ -619,7 +631,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: DtoH memcpy in %ld microsec\n", total_time ); + printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); #endif // float *vtmp = (float *)iov[0].iov_base; // DT_CUDA_DEBUG ( opal_cuda_output(0, "packed iov buffer, total packed %d\n", total_packed); ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 6ff69eaba12..3303e6fe9f5 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -296,15 +296,15 @@ __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, { uint32_t _i, tid, num_threads; uint32_t gap, nb_elements; - char *_source_tmp, *_destination_tmp, *_dst_disp_tmp;; + double *_source_tmp, *_destination_tmp, *_dst_disp_tmp;; tid = threadIdx.x + blockIdx.x * blockDim.x; num_threads = gridDim.x * blockDim.x; - gap = (extent - size) / 1; - nb_elements = size / 1; - 
_dst_disp_tmp = (char*)destination; - _source_tmp = (char*)source; + gap = (extent - size) / 8; + nb_elements = size / 8; + _dst_disp_tmp = (double*)destination; + _source_tmp = (double*)source; _destination_tmp = _dst_disp_tmp + tid; _source_tmp += tid; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 555d41f9517..36316ae877f 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -131,10 +131,9 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv TIMER_DATA_TYPE start, end, start_total, end_total; long total_time; #endif - - printf("i am in simple unpack vector, max_data %lu, iov len %lu\n", *max_data, iov[0].iov_len); - DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u )\n", - (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ) + + DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", + (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ) description = pConvertor->use_desc->desc; @@ -150,7 +149,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv pConvertor->stack_pos--; pElem = &(description[pos_desc]); - DT_CUDA_DEBUG( opal_cuda_output( 0, "unpack start pos_desc %d count_desc %d disp %ld\n" + DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack start pos_desc %d count_desc %d disp %ld\n" "stack_pos %d pos_desc %d count_desc %d disp %ld\n", pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); @@ -173,7 +172,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( 
"[Timing]: HtoD memcpy in %ld microsec\n", total_time ); + printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); #endif iov_len_local = iov[iov_count].iov_len; if( 0 != pConvertor->partial_length ) { @@ -191,7 +190,7 @@ goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - DT_CUDA_DEBUG( opal_cuda_output( 0, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); if( --(pStack->count) == 0 ) { /* end of loop */ @@ -216,7 +215,7 @@ } conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - DT_CUDA_DEBUG( opal_cuda_output( 0, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); } @@ -251,7 +250,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv *out_size = iov_count; if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - printf("total unpacked %lu\n", pConvertor->bConverted); + DT_CUDA_DEBUG( opal_cuda_output( 1, "total unpacked %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required) { opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; @@ -261,7 +260,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( 
opal_convertor_t* pConv /* Save the global position for the next round */ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, conv_ptr - pConvertor->pBaseBuf ); - DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } @@ -335,7 +334,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: HtoD memcpy in %ld microsec\n", total_time ); + printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); #endif @@ -520,14 +519,22 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t num_blocks, tasks_per_block; unsigned char* _source = *(SOURCE); - printf("I am in unpack_contiguous_loop_cuda\n"); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in unpack_contiguous_loop_cuda\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); // _destination = pBaseBuf_GPU; // _source = (unsigned char*)cuda_desc_h->iov[0].iov_base; - + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); @@ -540,4 +547,9 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif cudaDeviceSynchronize(); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = 
ELAPSED_TIME( start, end ); + printf( "[Timing]: vector unpacking in %ld microsec\n", total_time ); +#endif } diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 221150d5ccc..cc10683752f 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -74,6 +74,7 @@ #include "ompi/mca/pml/ob1/pml_ob1_recvreq.h" #include "ompi/mca/pml/ob1/pml_ob1_rdmafrag.h" + #if OPAL_CUDA_SUPPORT static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base, @@ -1164,9 +1165,20 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, size_t pipeline_size = remote_handle->reg_data.pipeline_size; uint32_t lindex = remote_handle->reg_data.lindex; uint8_t pack_required = remote_handle->reg_data.pack_required; - printf("i receive pipeline %ld, lindex %d, pack_required %d\n", pipeline_size, lindex, pack_required); - convertor->gpu_buffer_ptr = remote_memory_address; - mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex); + uint8_t remote_device = remote_handle->reg_data.gpu_device; + uint8_t local_device = 0; + rc = mca_common_cuda_get_device(&local_device); + printf("i receive pipeline %ld, lindex %d, pack_required %d, remote_device %d, local_device %d\n", pipeline_size, lindex, pack_required, remote_device, local_device); + if (rc != 0) { + opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); + return rc; + } + if (remote_device != local_device && !OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { + convertor->gpu_buffer_ptr = NULL; + } else { + convertor->gpu_buffer_ptr = remote_memory_address; + } + mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, remote_memory_address, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex, remote_device, local_device); if 
(pack_required) { mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); } @@ -1400,46 +1412,54 @@ void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, void *local_address, struct mca_btl_base_registration_handle_t *local_handle, + void *remote_gpu_address, mca_btl_base_completion_fn_t cbfunc, void *cbcontext, void *cbdata, size_t pipeline_size, - int lindex) + int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_dt_pack_clone[lindex].convertor = convertor; - endpoint->smcuda_dt_pack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; + // endpoint->smcuda_dt_pack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; endpoint->smcuda_dt_pack_clone[lindex].endpoint = endpoint; endpoint->smcuda_dt_pack_clone[lindex].local_address = local_address; endpoint->smcuda_dt_pack_clone[lindex].local_handle = local_handle; + endpoint->smcuda_dt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; endpoint->smcuda_dt_pack_clone[lindex].cbfunc = cbfunc; endpoint->smcuda_dt_pack_clone[lindex].cbcontext = cbcontext; endpoint->smcuda_dt_pack_clone[lindex].cbdata = cbdata; endpoint->smcuda_dt_pack_clone[lindex].pipeline_size = pipeline_size; endpoint->smcuda_dt_pack_clone[lindex].lindex = lindex; endpoint->smcuda_dt_pack_clone[lindex].seq = -9; + endpoint->smcuda_dt_pack_clone[lindex].remote_device = remote_device; + endpoint->smcuda_dt_pack_clone[lindex].local_device = local_device; } void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, void *local_address, struct mca_btl_base_registration_handle_t *local_handle, + void *remote_gpu_address, mca_btl_base_completion_fn_t cbfunc, void *cbcontext, void *cbdata, size_t pipeline_size, - int lindex) + int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_dt_unpack_clone[lindex].convertor = convertor; - endpoint->smcuda_dt_unpack_clone[lindex].gpu_ptr = 
convertor->gpu_buffer_ptr; +// endpoint->smcuda_dt_unpack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; endpoint->smcuda_dt_unpack_clone[lindex].endpoint = endpoint; endpoint->smcuda_dt_unpack_clone[lindex].local_address = local_address; endpoint->smcuda_dt_unpack_clone[lindex].local_handle = local_handle; + endpoint->smcuda_dt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; endpoint->smcuda_dt_unpack_clone[lindex].cbfunc = cbfunc; endpoint->smcuda_dt_unpack_clone[lindex].cbcontext = cbcontext; endpoint->smcuda_dt_unpack_clone[lindex].cbdata = cbdata; endpoint->smcuda_dt_unpack_clone[lindex].pipeline_size = pipeline_size; endpoint->smcuda_dt_unpack_clone[lindex].lindex = lindex; endpoint->smcuda_dt_unpack_clone[lindex].seq = -9; + endpoint->smcuda_dt_unpack_clone[lindex].remote_device = remote_device; + endpoint->smcuda_dt_unpack_clone[lindex].local_device = local_device; } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index a90ba5c0f19..d562be32904 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -41,6 +41,8 @@ #include "opal/mca/btl/btl.h" #include "opal/mca/common/sm/common_sm.h" +#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 1 + BEGIN_C_DECLS /* @@ -518,16 +520,18 @@ typedef struct { /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; - void *gpu_ptr; struct mca_btl_base_endpoint_t *endpoint; void *local_address; struct mca_btl_base_registration_handle_t *local_handle; + void *remote_gpu_address; mca_btl_base_completion_fn_t cbfunc; void *cbcontext; void *cbdata; size_t pipeline_size; int lindex; int seq; + uint8_t remote_device; + uint8_t local_device; } cuda_dt_clone_t; #define SMCUDA_DT_CLONE_SIZE 20 @@ -547,20 +551,22 @@ void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, void *local_address, struct 
mca_btl_base_registration_handle_t *local_handle, + void *remote_gpu_address, mca_btl_base_completion_fn_t cbfunc, void *cbcontext, void *cbdata, size_t pipeline_size, - int lindex); + int lindex, uint8_t remote_device, uint8_t local_device); void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, void *local_address, struct mca_btl_base_registration_handle_t *local_handle, + void *remote_gpu_address, mca_btl_base_completion_fn_t cbfunc, void *cbcontext, void *cbdata, size_t pipeline_size, - int lindex); + int lindex, uint8_t remote_device, uint8_t local_device); #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index cd6c7ce071b..78568ab952e 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -878,11 +878,26 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, struct iovec iov; uint32_t iov_count = 1; size_t max_data; - struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; - iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; + struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; + if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { + convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(pipeline_size, 0); + mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, pipeline_size); + iov.iov_base = convertor->gpu_buffer_ptr; + printf("start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, pipeline_size); + + } else { + iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; + } max_data = pipeline_size; iov.iov_len = pipeline_size; opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + if 
(!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { + if (convertor->gpu_buffer_ptr != NULL) { + opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + convertor->gpu_buffer_ptr = NULL; + } + + } } // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } @@ -905,13 +920,15 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, my_cuda_dt_clone = &endpoint->smcuda_dt_pack_clone[lindex]; printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); - + struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; if (seq == -1) { mca_btl_smcuda_send_cuda_unpack_sig(btl, my_cuda_dt_clone->endpoint, lindex, 0, -2); - opal_cuda_free_gpu_buffer_p(my_cuda_dt_clone->gpu_ptr, 0); + if (convertor->gpu_buffer_ptr != NULL) { + opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + convertor->gpu_buffer_ptr = NULL; + } mca_btl_smcuda_free_cuda_dt_pack_clone(my_cuda_dt_clone->endpoint, lindex); } else { - struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; struct iovec iov; int rc_dt = 0; size_t pipeline_size = 1024*1024*20; diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index f59d4365006..2554c445302 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -2080,6 +2080,19 @@ int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base) return 0; } +int mca_common_cuda_memp2pcpy(void *dest, const void *src, size_t size) +{ + CUresult result; + + result = cuFunc.cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-mpi-common-cuda.txt", "cuMemcpy failed", + true, OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + #if OPAL_CUDA_GDR_SUPPORT /* Check to see if the memory was freed between the time it was stored in * the registration cache and now. 
Return true if the memory was previously diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index 20290dff7d8..d5220052d63 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -42,6 +42,7 @@ struct mca_mpool_common_cuda_reg_data_t { size_t pipeline_size; uint32_t lindex; uint8_t pack_required; + uint8_t gpu_device; }; typedef struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; @@ -99,6 +100,7 @@ OPAL_DECLSPEC int mca_common_cuda_create_event(uint64_t **event); OPAL_DECLSPEC int mca_common_cuda_record_event(uint64_t *event); OPAL_DECLSPEC int mca_common_cuda_query_event(uint64_t *event); OPAL_DECLSPEC int mca_common_cuda_openeventhandle(uint64_t **event, int n, mca_mpool_common_cuda_reg_data_t *handle); +OPAL_DECLSPEC int mca_common_cuda_memp2pcpy(void *dest, const void *src, size_t size); #if OPAL_CUDA_GDR_SUPPORT OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg); OPAL_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg); diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index b32f3a64713..186fdd1c1bb 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -14,7 +14,7 @@ # if PROJECT_OMPI - MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw unpack_ooo ddt_pack + MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw unpack_ooo ddt_pack ddt_benchmark MPI_CHECKS = to_self ddt_pack endif TESTS = opal_datatype_test $(MPI_TESTS) @@ -32,6 +32,11 @@ ddt_test_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) ddt_test_CFLAGS = -I/mnt/sw/cuda/include -g -O0 ddt_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/mnt/sw/cuda/lib64 -lcudart +ddt_benchmark_SOURCES = ddt_benchmark.c ddt_lib.c ddt_lib.h +ddt_benchmark_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) +ddt_benchmark_CFLAGS = -I/mnt/sw/cuda/include -g -O0 
+ddt_benchmark_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/mnt/sw/cuda/lib64 -lcudart + #ddt_test_old_SOURCES = ddt_test_old.c ddt_lib.c ddt_lib.h #ddt_test_old_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) #ddt_test_old_LDADD = $(top_builddir)/ompi/libmpi.la From 44a1550d04bf041440d84221078be934936b7b4e Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 8 Sep 2015 00:42:11 -0400 Subject: [PATCH 018/190] now we can use cudamemcpy2d Conflicts: opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu --- opal/datatype/cuda/opal_datatype_cuda.cuh | 12 +++ .../cuda/opal_datatype_cuda_internal.cuh | 1 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 73 +++++++++++++++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 75 ++++++++++++++++--- opal/mca/btl/smcuda/btl_smcuda.h | 2 +- opal/mca/btl/smcuda/btl_smcuda_component.c | 4 +- 6 files changed, 143 insertions(+), 24 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 04dd5f88a26..6e86640b5e6 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -44,11 +44,23 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, unsigned char** DESTINATION, size_t* SPACE ); +void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE, uint8_t* transfer_required ); + void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, size_t* SPACE ); + +void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE, uint8_t* free_required ); void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 
3d8640bcbc2..98d787ac650 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -14,6 +14,7 @@ //#define OPAL_DATATYPE_CUDA_KERNEL_TIME #define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 0 #define OPAL_DATATYPE_CUDA_TIMING +#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D 0 #define IOV_ARRAY_SIZE 1 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 56a85e3709d..9a589501ae4 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -230,13 +230,13 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert if (iov[iov_count].iov_base == NULL) { iov[iov_count].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); iov_ptr = (unsigned char *)iov[iov_count].iov_base; + pConvertor->gpu_buffer_ptr = iov_ptr; free_required = 1; } else { iov_ptr = (unsigned char *)iov[iov_count].iov_base; free_required = 0; } transfer_required = 0; - pConvertor->gpu_buffer_ptr = iov_ptr; } else { iov_len_local = iov[iov_count].iov_len; if (pConvertor->gpu_buffer_ptr == NULL) { @@ -291,7 +291,12 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + if (transfer_required && OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local, &transfer_required); + } else { + pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto 
update_loop_description; @@ -330,8 +335,8 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert *out_size = iov_count; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - DT_CUDA_DEBUG( opal_cuda_output( 1, "total packed %lu\n", pConvertor->bConverted); ); - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); + if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } @@ -376,9 +381,10 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; - num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); +// pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -396,6 +402,52 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif } +void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE, uint8_t* transfer_required ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _source = 
(*SOURCE) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _destination = *(DESTINATION); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda_memcpy2d\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + +// cudaDeviceSynchronize(); + *transfer_required = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: vector packing with memcpy2d in %ld microsec\n", total_time ); +#endif +} + int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -453,13 +505,13 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor if (iov[0].iov_base == NULL) { iov[0].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(buffer_size, 0); destination = (unsigned char *)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; free_required = 1; } else { destination = (unsigned char *)iov[0].iov_base; free_required = 0; } transfer_required = 0; - pConvertor->gpu_buffer_ptr = destination; } else { buffer_size = iov[0].iov_len; if (pConvertor->gpu_buffer_ptr == NULL) { @@ -620,7 +672,10 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } - 
cudaDeviceSynchronize(); + // cudaDeviceSynchronize(); + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 36316ae877f..484f22cf785 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -161,13 +161,16 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { iov_ptr = (unsigned char*)iov[iov_count].iov_base; free_required = 0; - } else { + } else if (!OPAL_DATATYPE_VECTOR_USE_MEMCPY2D){ if (pConvertor->gpu_buffer_ptr == NULL) { pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); } iov_ptr = pConvertor->gpu_buffer_ptr; cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); free_required = 1; + } else { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + free_required = 255; } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -222,7 +225,11 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + if (free_required == 255 && OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { + unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local, &free_required); + } else { + unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -250,8 
+257,8 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv *out_size = iov_count; if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - DT_CUDA_DEBUG( opal_cuda_output( 1, "total packed %lu\n", pConvertor->bConverted); ); - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + DT_CUDA_DEBUG( opal_cuda_output( 0, "Total unpacked %lu\n", pConvertor->bConverted); ); + if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } @@ -482,8 +489,10 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif } - cudaDeviceSynchronize(); - + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + iov[0].iov_len = total_unpacked; *max_data = total_unpacked; *out_size = 1; @@ -529,15 +538,13 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); - // _destination = pBaseBuf_GPU; - // _source = (unsigned char*)cuda_desc_h->iov[0].iov_base; - #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; - num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +// tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; +// num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; +// unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = 
_destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -553,3 +560,47 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, printf( "[Timing]: vector unpacking in %ld microsec\n", total_time ); #endif } + +void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE, uint8_t* free_required ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _destination = (*DESTINATION) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _source = *(SOURCE); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in unpack_contiguous_loop_cuda_memcpy2d\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(SOURCE) = *(SOURCE) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + +// *free_required = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: vector unpacking with memcpy2d in %ld microsec\n", total_time ); +#endif +} diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index d562be32904..20465decc10 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -41,7 +41,7 @@ #include 
"opal/mca/btl/btl.h" #include "opal/mca/common/sm/common_sm.h" -#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 1 +#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 0 BEGIN_C_DECLS diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 78568ab952e..2382fef5d94 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -142,7 +142,7 @@ static int mca_btl_smcuda_component_verify(void) { static int smcuda_register(void) { /* register SM component parameters */ - mca_btl_smcuda_param_register_int("free_list_num", 8, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_num); + mca_btl_smcuda_param_register_int("free_list_num", 16, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_num); mca_btl_smcuda_param_register_int("free_list_max", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_max); mca_btl_smcuda_param_register_int("free_list_inc", 64, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_inc); mca_btl_smcuda_param_register_int("max_procs", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_max_procs); @@ -931,7 +931,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } else { struct iovec iov; int rc_dt = 0; - size_t pipeline_size = 1024*1024*20; + size_t pipeline_size = 1024*1024*200; uint32_t iov_count = 1; iov.iov_base = convertor->gpu_buffer_ptr; iov.iov_len = pipeline_size; From a67c842be9a360511b3852a3c64a8304e91bed3e Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 8 Sep 2015 18:34:57 -0400 Subject: [PATCH 019/190] enable zero copy + fix GPU buffer bug Conflicts: opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu --- opal/datatype/cuda/opal_datatype_cuda.cu | 10 +- opal/datatype/cuda/opal_datatype_cuda.cuh | 22 ++- .../cuda/opal_datatype_cuda_internal.cuh | 10 + .../cuda/opal_datatype_pack_cuda_kernel.cu | 14 ++ .../cuda/opal_datatype_pack_cuda_wrapper.cu | 172 ++++++++++++++++-- .../cuda/opal_datatype_unpack_cuda_wrapper.cu 
| 87 +++++++-- opal/mca/btl/smcuda/btl_smcuda_component.c | 2 +- 7 files changed, 280 insertions(+), 37 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index b94679358a0..9791e40fef1 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -370,9 +370,9 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) { int dev_id; cudaGetDevice(&dev_id); - printf("malloc gpu buffer in dev %d\n", dev_id); ddt_cuda_device_t *device = &cuda_device[gpu_id]; if (device->buffer_free_size < size) { + DT_CUDA_DEBUG( opal_cuda_output( 0, "No GPU buffer at dev_id %d.\n", dev_id); ); return NULL; } ddt_cuda_buffer_t *ptr = NULL; @@ -406,7 +406,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) cuda_list_push_head(&device->buffer_used, p); device->buffer_used_size += size; device->buffer_free_size -= size; - DT_CUDA_DEBUG( opal_cuda_output( 1, "Malloc GPU buffer %p.\n", addr); ); + DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc GPU buffer %p, dev_id %d.\n", addr, dev_id); ); return addr; } } @@ -442,9 +442,11 @@ void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) if (ptr == NULL) { DT_CUDA_DEBUG( opal_cuda_output( 0, "addr %p is not managed.\n", addr); ); } + size_t size = ptr->size; cuda_list_item_merge_by_addr(&device->buffer_free, ptr); - device->buffer_free_size += ptr->size; - DT_CUDA_DEBUG( opal_cuda_output( 1, "Free GPU buffer %p.\n", addr); ); + device->buffer_free_size += size; + device->buffer_used_size -= size; + DT_CUDA_DEBUG( opal_cuda_output( 0, "Free GPU buffer %p.\n", addr); ); } void opal_dump_cuda_list(ddt_cuda_list_t *list) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 6e86640b5e6..b770f136969 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -48,7 +48,19 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, 
uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, - size_t* SPACE, uint8_t* transfer_required ); + size_t* SPACE ); + +void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE, unsigned char* gpu_buffer ); void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -60,7 +72,13 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, - size_t* SPACE, uint8_t* free_required ); + size_t* SPACE ); + +void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE); void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 98d787ac650..c0cfda8ea90 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -15,6 +15,8 @@ #define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 0 #define OPAL_DATATYPE_CUDA_TIMING #define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D 0 +#define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 +#define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 #define IOV_ARRAY_SIZE 1 @@ -160,6 +162,14 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c __global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist); +__global__ void opal_empty_kernel(uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination); + +__global__ void opal_empty_kernel_noargs(); + void opal_cuda_output(int output_id, const char *format, ...); #if 
defined (OPAL_DATATYPE_CUDA_DEBUG) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 9bf130630f9..79281adf6cb 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -633,3 +633,17 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c } } } + +__global__ void opal_empty_kernel(uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination) +{ + +} + +__global__ void opal_empty_kernel_noargs() +{ + +} diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 9a589501ae4..01fc947043c 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -238,13 +238,29 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } transfer_required = 0; } else { - iov_len_local = iov[iov_count].iov_len; - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + iov_len_local = iov[iov_count].iov_len; + } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE){ + iov_len_local = iov[iov_count].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + } + transfer_required = 0; + free_required = 1; + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + } else { + iov_len_local = iov[iov_count].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned 
char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + } + transfer_required = 1; + free_required = 1; + iov_ptr = pConvertor->gpu_buffer_ptr; } - transfer_required = 1; - free_required = 1; - iov_ptr = pConvertor->gpu_buffer_ptr; } while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { @@ -291,9 +307,12 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - if (transfer_required && OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { - iov_ptr = (unsigned char*)iov[iov_count].iov_base; - pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local, &transfer_required); + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { + pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE) { + pack_contiguous_loop_cuda_pipeline(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local, pConvertor->gpu_buffer_ptr); } else { pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); } @@ -337,6 +356,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert pConvertor->flags |= CONVERTOR_COMPLETED; DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { + printf("free\n"); opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } @@ -383,8 +403,84 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - 
cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); -// pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + // cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); +// pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +// int i; +// for (i = 0; i < 4; i++) { +// opal_empty_kernel<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +// } + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaDeviceSynchronize(); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: vector packing in %ld microsec\n", total_time ); +#endif +} + +void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE, unsigned char* gpu_buffer ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _source = (*SOURCE) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _destination_host = *(DESTINATION); + unsigned char* _destination_dev = gpu_buffer; + int i, pipeline_blocks; + uint32_t _copy_loops_per_pipeline; + +#if 
defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda_pipeline\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + // _source = pBaseBuf_GPU; + // _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; +// cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); + pipeline_blocks = 4; + cuda_streams->current_stream_id = 0; + _copy_loops_per_pipeline = (_copy_loops + pipeline_blocks -1 )/ pipeline_blocks; + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops_per_pipeline, _end_loop->size, _loop->extent, _source, _destination_dev); + for (i = 1; i <= pipeline_blocks; i++) { + cudaMemcpyAsync(_destination_host, _destination_dev, _end_loop->size * _copy_loops_per_pipeline, cudaMemcpyDeviceToHost, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + _source += _loop->extent * _copy_loops_per_pipeline; + _destination_dev += _end_loop->size * _copy_loops_per_pipeline; + _destination_host += _end_loop->size * _copy_loops_per_pipeline; + if (i == pipeline_blocks) { + _copy_loops_per_pipeline = _copy_loops - _copy_loops_per_pipeline * (pipeline_blocks - 1); + } + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops_per_pipeline, 
_end_loop->size, _loop->extent, _source, _destination_dev); + } + cudaMemcpyAsync(_destination_host, _destination_dev, _end_loop->size * _copy_loops_per_pipeline, cudaMemcpyDeviceToHost, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -406,7 +502,7 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, - size_t* SPACE, uint8_t* transfer_required ) + size_t* SPACE ) { ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); @@ -439,7 +535,6 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, #endif // cudaDeviceSynchronize(); - *transfer_required = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -448,6 +543,57 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, #endif } +void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _source = (*SOURCE) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _destination = *(DESTINATION); + unsigned char* _destination_dev; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda_zerocopy\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + // 
num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + // cudaHostRegister(_destination, _copy_loops*_end_loop->size, cudaHostRegisterMapped); + cudaHostGetDevicePointer((void **)&_destination_dev, (void *) _destination, 0); + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaDeviceSynchronize(); + // cudaHostUnregister(_destination); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: vector packing in %ld microsec\n", total_time ); +#endif +} + int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 484f22cf785..e48c0340bd8 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -161,18 +161,21 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { iov_ptr = (unsigned char*)iov[iov_count].iov_base; free_required = 0; - } else if (!OPAL_DATATYPE_VECTOR_USE_MEMCPY2D){ - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); - } - iov_ptr = pConvertor->gpu_buffer_ptr; - cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); - free_required = 1; } else { - iov_ptr = (unsigned char*)iov[iov_count].iov_base; - free_required = 255; - } -#if 
defined(OPAL_DATATYPE_CUDA_TIMING) + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); + } + iov_ptr = pConvertor->gpu_buffer_ptr; + cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); @@ -225,8 +228,10 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - if (free_required == 255 && OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { - unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local, &free_required); + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { + unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + unpack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); } else { unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); } @@ -543,8 +548,8 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; -// unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); - cudaMemcpy2D(_destination, _loop->extent, _source, 
_end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +// cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -565,7 +570,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, - size_t* SPACE, uint8_t* free_required ) + size_t* SPACE ) { ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); @@ -596,7 +601,6 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif -// *free_required = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -604,3 +608,52 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, printf( "[Timing]: vector unpacking with memcpy2d in %ld microsec\n", total_time ); #endif } + +void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _destination = (*DESTINATION) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _source = *(SOURCE); + unsigned char* _source_dev; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in unpack_contiguous_loop_cuda_zerocopy\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = 
(uint32_t)(*(SPACE) / _end_loop->size); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif +// tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; +// num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; +// cudaHostRegister(_source, _copy_loops*_end_loop->size, cudaHostRegisterMapped); + cudaHostGetDevicePointer((void **)&_source_dev, (void *) _source, 0); + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(SOURCE) = *(SOURCE) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaDeviceSynchronize(); + // cudaHostUnregister(_source); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: vector unpacking in %ld microsec\n", total_time ); +#endif +} diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 2382fef5d94..a9b08f3efdc 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -931,7 +931,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } else { struct iovec iov; int rc_dt = 0; - size_t pipeline_size = 1024*1024*200; + size_t pipeline_size = 1024*1024*10; uint32_t iov_count = 1; iov.iov_base = convertor->gpu_buffer_ptr; iov.iov_len = pipeline_size; From 7bd8151ef211b9d255f407eb91148551ff311e08 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 14 Sep 2015 17:22:43 -0400 Subject: [PATCH 020/190] put pipeline size into mca --- .../cuda/opal_datatype_cuda_internal.cuh | 2 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 15 ++++++++----- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 2 +- opal/datatype/opal_datatype_pack.c | 3 ++- 
opal/datatype/opal_datatype_unpack.c | 3 ++- opal/mca/btl/smcuda/btl_smcuda.c | 22 ------------------- opal/mca/btl/smcuda/btl_smcuda.h | 1 + opal/mca/btl/smcuda/btl_smcuda_component.c | 4 +++- 8 files changed, 19 insertions(+), 33 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index c0cfda8ea90..938c1b5f8a1 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -30,7 +30,7 @@ #define CUDA_NB_IOV 4096 #define CUDA_IOV_LEN 1024*1204 #define CUDA_MAX_NB_BLOCKS 1024 -#define CUDA_IOV_MAX_TASK_PER_BLOCK 200 +#define CUDA_IOV_MAX_TASK_PER_BLOCK 10 #define ALIGNMENT_DOUBLE 8 #define ALIGNMENT_FLOAT 4 #define ALIGNMENT_CHAR 1 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 01fc947043c..e45a0b7df15 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -601,7 +601,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor { uint32_t i, j; uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, residue_desc; - uint32_t nb_blocks, thread_per_block; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec, dst_offset; unsigned char *destination, *destination_tmp; size_t total_packed, total_converted; @@ -692,8 +692,9 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #endif dst_offset = 0; - thread_per_block = CUDA_WARP_SIZE * 5; + thread_per_block = CUDA_WARP_SIZE * 4; nb_blocks = 256; + nb_blocks_used = 0; while (cuda_iov_count > 0) { @@ -752,6 +753,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; DT_CUDA_DEBUG( 
opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; + nb_blocks_used ++; if (current_block >= nb_blocks) { current_block = 0; task_iteration ++; @@ -773,6 +775,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; + nb_blocks_used ++; if (current_block >= nb_blocks) { current_block = 0; task_iteration ++; @@ -788,7 +791,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d\n", destination_tmp, total_time, cuda_streams->current_stream_id); + printf( "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_tmp, total_time, cuda_streams->current_stream_id, nb_blocks_used); #endif cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); @@ -818,10 +821,10 @@ int32_t 
opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } - // cudaDeviceSynchronize(); - for (i = 0; i < NB_STREAMS; i++) { + cudaDeviceSynchronize(); + /* for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + }*/ #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index e48c0340bd8..2f281bdb494 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -370,7 +370,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif dst_offset = 0; - thread_per_block = CUDA_WARP_SIZE * 5; + thread_per_block = CUDA_WARP_SIZE * 4; nb_blocks = 256; while (cuda_iov_count > 0) { diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 7ddefdd1728..54a28b93c5b 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -424,7 +424,8 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { if (opal_generic_simple_pack_function_cuda_vector_p != NULL) { - return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + // return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); } } else { if (opal_generic_simple_pack_function_cuda_iov_p != NULL) { diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index ff8dae77971..5fe4003063d 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -611,7 +611,8 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == 
pElem->elem.common.type ) { if (opal_generic_simple_unpack_function_cuda_vector_p != NULL) { - return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + // return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); } } else { if (opal_generic_simple_unpack_function_cuda_iov_p != NULL) { diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index cc10683752f..da403ad937d 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1046,28 +1046,6 @@ static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl, return OPAL_SUCCESS; } -int mca_btl_smcuda_notify_packing_done(void* send_value, int my_rank, int peer_rank) -{ - sm_fifo_t* fifo_send = &(mca_btl_smcuda_component.fifo[peer_rank][FIFO_MAP(my_rank)]); - if (fifo_send == NULL) { - return OPAL_ERROR; - } else { - // return sm_fifo_write(send_value, fifo_send); - int tail = fifo_send->tail; - int head = fifo_send->head; - if ((head + 1) & fifo_send->mask == tail) { - printf("fifo is full\n"); - return OPAL_ERR_OUT_OF_RESOURCE; - } else { - volatile void **q = (volatile void **) RELATIVE2VIRTUAL(fifo_send->queue); - tail = (tail - 1) & fifo_send->mask; - q[tail] = send_value; - printf("write to place %d tail %d head %d\n", tail, fifo_send->tail, fifo_send->head); - return OPAL_SUCCESS; - } - } -} - int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 20465decc10..478dd184d24 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -207,6 +207,7 @@ struct mca_btl_smcuda_component_t { int cuda_ipc_output; 
int use_cuda_ipc; int use_cuda_ipc_same_gpu; + int cuda_dt_pipeline_size; #endif /* OPAL_CUDA_SUPPORT */ }; typedef struct mca_btl_smcuda_component_t mca_btl_smcuda_component_t; diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index a9b08f3efdc..870301b5f9c 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -167,6 +167,7 @@ static int smcuda_register(void) mca_btl_smcuda_param_register_int("use_cuda_ipc", 1, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.use_cuda_ipc); mca_btl_smcuda_param_register_int("use_cuda_ipc_same_gpu", 1, OPAL_INFO_LVL_4,&mca_btl_smcuda_component.use_cuda_ipc_same_gpu); mca_btl_smcuda_param_register_int("cuda_ipc_verbose", 0, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ipc_verbose); + mca_btl_smcuda_param_register_int("cuda_dt_pipeline_size", 1024*1024*400, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_dt_pipeline_size); mca_btl_smcuda_component.cuda_ipc_output = opal_output_open(NULL); opal_output_set_verbosity(mca_btl_smcuda_component.cuda_ipc_output, mca_btl_smcuda_component.cuda_ipc_verbose); #else /* OPAL_CUDA_SUPPORT */ @@ -931,7 +932,8 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } else { struct iovec iov; int rc_dt = 0; - size_t pipeline_size = 1024*1024*10; + size_t pipeline_size = mca_btl_smcuda_component.cuda_dt_pipeline_size; + printf("Pipeline_size %ld\n", pipeline_size); uint32_t iov_count = 1; iov.iov_base = convertor->gpu_buffer_ptr; iov.iov_len = pipeline_size; From 9d103578bd1cf3d4a59f11ec28564b1f41f99d71 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Tue, 15 Sep 2015 14:16:16 -0400 Subject: [PATCH 021/190] Upon datatype commit create a list of iovec representing a single iteration of the datatype based on a NULL pointer. This list will then contain the displacement and the length of each fragment of the datatype memory layout and can be used for any packing/unpacking purpose. 
--- opal/datatype/opal_convertor.h | 6 +++++- opal/datatype/opal_convertor_raw.c | 29 ++++++++++++++++++++++++++ opal/datatype/opal_datatype.h | 6 +++++- opal/datatype/opal_datatype_optimize.c | 6 ++++++ 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 1ee0c010e63..ace5cf4b1e4 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -283,7 +283,11 @@ opal_convertor_raw( opal_convertor_t* convertor, /* [IN/OUT] */ struct iovec* iov, /* [IN/OUT] */ uint32_t* iov_count, /* [IN/OUT] */ size_t* length ); /* [OUT] */ - +OPAL_DECLSPEC void +opal_convertor_to_iov(struct opal_convertor_t *convertor, + struct iovec **iov, + uint32_t *iov_count, + size_t *max_data); /* * Upper level does not need to call the _nocheck function directly. */ diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index b57d5aa1ded..441ee9ee0fc 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -211,3 +211,32 @@ opal_convertor_raw( opal_convertor_t* pConvertor, pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } + +#define IOVEC_INITIAL_SIZE 64 + +void +opal_convertor_to_iov(struct opal_convertor_t *convertor, + struct iovec **iov, + uint32_t *iov_count, + size_t *max_data) +{ + uint32_t temp_count = IOVEC_INITIAL_SIZE; + struct iovec *iovec; + size_t temp_data; + + *iov_count = 0; + *max_data = 0; + + *iov = iovec = (struct iovec*) malloc(temp_count * sizeof(struct iovec)); + while(1) { + int ret = opal_convertor_raw(convertor, iovec, &temp_count, &temp_data); + *iov_count += temp_count; + *max_data += temp_data; + if(ret) + break; + + *iov = (struct iovec*)realloc(*iov, (*iov_count + IOVEC_INITIAL_SIZE) * sizeof(struct iovec)); + temp_count = IOVEC_INITIAL_SIZE; + iovec = &((*iov)[*iov_count]); + } +} diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h 
index cf00a690c56..bec40665d15 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -128,7 +128,11 @@ struct opal_datatype_t { Reason being is that Fortran is not at the OPAL layer. */ /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */ - /* size: 352, cachelines: 6, members: 15 */ + struct iovec* iov; + int iov_count; + size_t max_data; + /* size: 372, cachelines: 6, members: 18 */ + /* last cacheline: 28-32 bytes */ }; diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index b52719bcfc3..951b240764b 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -304,5 +304,11 @@ int32_t opal_datatype_commit( opal_datatype_t * pData ) pLast->first_elem_disp = first_elem_disp; pLast->size = pData->size; } + + /* save a compressed datatype description as a iovec list */ + opal_convertor_t* conv = opal_convertor_create( opal_local_arch, 0 /* unused */); + opal_convertor_prepare_for_send( conv, pData, 1, (void*)0 ); + opal_convertor_to_iov(conv, &pData->iov, &pData->iov_count, &pData->max_data); + OBJ_RELEASE(conv); return OPAL_SUCCESS; } From 756b2af5abea2d7b4a87ead6e2910cd8417e580b Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 17 Sep 2015 01:52:23 -0400 Subject: [PATCH 022/190] contiguous vs non-contiguous is working Conflicts: opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu opal/datatype/opal_datatype_unpack.c --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 9 ++- opal/datatype/cuda/opal_datatype_cuda.cu | 2 + .../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 50 +++++++++------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 32 ++++++---- opal/datatype/opal_datatype_optimize.c | 8 +-- opal/datatype/opal_datatype_unpack.c | 4 +- opal/mca/btl/smcuda/btl_smcuda.c | 59 +++++++++++++------ opal/mca/btl/smcuda/btl_smcuda_component.c | 42 ++++++------- 9 files changed, 129 insertions(+), 81 deletions(-) 
diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 05556c14a90..e0248bbc69e 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -67,6 +67,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_btl, size_t size) { int rc; + int local_device = 0; #if OPAL_CUDA_SUPPORT_41 #if OPAL_CUDA_GDR_SUPPORT /* With some BTLs, switch to RNDV from RGET at large messages */ @@ -87,6 +88,13 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, base, sendreq->req_send.req_bytes_packed, sendreq->req_rdma))) { + + rc = mca_common_cuda_get_device(&local_device); + if (rc != 0) { + opal_output_verbose(0, "Failed to get the GPU device ID, rc=%d", rc); + return rc; + } + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, -1, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { @@ -109,7 +117,6 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); unsigned char *base; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); - int local_device = 0; base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); convertor->gpu_buffer_ptr = base; sendreq->req_send.req_bytes_packed = convertor->local_size; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 9791e40fef1..29ade337b69 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -293,6 +293,8 @@ void opal_datatype_cuda_init(void) // ALIGNMENT_DOUBLE = sizeof(double); // ALIGNMENT_FLOAT = sizeof(float); // ALIGNMENT_CHAR = sizeof(char); + + cudaDeviceSynchronize(); } void opal_datatype_cuda_fini(void) diff --git 
a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 938c1b5f8a1..2102edb6a9c 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -26,8 +26,8 @@ #define THREAD_PER_BLOCK 32 #define CUDA_WARP_SIZE 32 #define TASK_PER_THREAD 2 -#define NB_STREAMS 4 -#define CUDA_NB_IOV 4096 +#define NB_STREAMS 8 +#define CUDA_NB_IOV 1024*20 #define CUDA_IOV_LEN 1024*1204 #define CUDA_MAX_NB_BLOCKS 1024 #define CUDA_IOV_MAX_TASK_PER_BLOCK 10 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index e45a0b7df15..250e3e253e3 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -619,19 +619,10 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time; -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); + long total_time, move_time; #endif DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype PACKING using iovec\n"); ); - - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); -#endif description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; @@ -659,17 +650,24 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } transfer_required = 0; } else { - buffer_size = iov[0].iov_len; - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(buffer_size, 0); - } - transfer_required = 1; - free_required = 1; + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); + } else { + buffer_size = 
iov[0].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(buffer_size, 0); + } + transfer_required = 1; + free_required = 1; #if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - destination = (unsigned char*)iov[0].iov_base; + destination = (unsigned char*)iov[0].iov_base; #else - destination = pConvertor->gpu_buffer_ptr; + destination = pConvertor->gpu_buffer_ptr; #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + } } destination_tmp = destination; @@ -682,6 +680,14 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; orig_stack_index = pStack->index; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); DT_CUDA_DEBUG ( opal_cuda_output(2, "PACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); @@ -692,7 +698,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #endif dst_offset = 0; - thread_per_block = CUDA_WARP_SIZE * 4; + thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; nb_blocks_used = 0; @@ -834,8 +840,8 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); + move_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); #endif // float *vtmp = (float *)iov[0].iov_base; // DT_CUDA_DEBUG ( opal_cuda_output(0, "packed iov buffer, total packed %d\n", total_packed); ); @@ 
-852,7 +858,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); total_time = ELAPSED_TIME( start_total, end_total ); - printf( "[Timing]: total packing in %ld microsec\n", total_time ); + printf( "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); #endif if( pConvertor->bConverted == pConvertor->local_size ) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 2f281bdb494..893f280c68f 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -303,7 +303,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time; + long total_time, move_time; #endif #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -327,17 +327,23 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert if (opal_cuda_is_gpu_buffer(iov[0].iov_base)) { source = (unsigned char*)iov[0].iov_base; free_required = 0; - } else { + } else { + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { #if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - source = (unsigned char*)iov[0].iov_base; + source = (unsigned char*)iov[0].iov_base; #else - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + } + source = pConvertor->gpu_buffer_ptr; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + cudaMemcpy(source, iov[0].iov_base, 
iov[0].iov_len, cudaMemcpyHostToDevice); + free_required = 1; } - source = pConvertor->gpu_buffer_ptr; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); - free_required = 1; } source_tmp = source; @@ -345,8 +351,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert DT_CUDA_DEBUG ( opal_cuda_output(0, "UNpack GPU base %p, unpack from buffer %p, total size %ld\n", pConvertor->pBaseBuf, source, iov[0].iov_len); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); + move_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); #endif @@ -370,7 +376,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif dst_offset = 0; - thread_per_block = CUDA_WARP_SIZE * 4; + thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; while (cuda_iov_count > 0) { @@ -506,7 +512,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); total_time = ELAPSED_TIME( start_total, end_total ); - printf( "[Timing]: total unpacking in %ld microsec\n", total_time ); + printf( "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); #endif if( pConvertor->bConverted == pConvertor->local_size ) { diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index 951b240764b..5ccea9ba1d3 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -306,9 +306,9 @@ int32_t opal_datatype_commit( opal_datatype_t * pData ) } /* save a compressed datatype description as a iovec list */ - opal_convertor_t* conv = opal_convertor_create( 
opal_local_arch, 0 /* unused */); - opal_convertor_prepare_for_send( conv, pData, 1, (void*)0 ); - opal_convertor_to_iov(conv, &pData->iov, &pData->iov_count, &pData->max_data); - OBJ_RELEASE(conv); +// opal_convertor_t* conv = opal_convertor_create( opal_local_arch, 0 /* unused */); +// opal_convertor_prepare_for_send( conv, pData, 1, (void*)0 ); +// opal_convertor_to_iov(conv, &pData->iov, &pData->iov_count, &pData->max_data); +// OBJ_RELEASE(conv); return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index 5fe4003063d..fd269de6764 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -611,8 +611,8 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { if (opal_generic_simple_unpack_function_cuda_vector_p != NULL) { - // return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); - return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); + return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + // return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); } } else { if (opal_generic_simple_unpack_function_cuda_iov_p != NULL) { diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index da403ad937d..ca314d30ebf 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -55,6 +55,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" +#include "opal/datatype/opal_datatype_gpu.h" #endif /* OPAL_CUDA_SUPPORT */ #include "opal/mca/mpool/base/base.h" #include "opal/mca/mpool/sm/mpool_sm.h" @@ -1135,18 +1136,19 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) && (bml_btl->btl_flags 
& MCA_BTL_FLAGS_CUDA_GET)) { recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; + uint8_t pack_required = remote_handle->reg_data.pack_required; + uint32_t lindex = remote_handle->reg_data.lindex; + uint8_t remote_device = remote_handle->reg_data.gpu_device; + uint8_t local_device = 0; if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; - printf("RECEIVE REGT!!!!!!!!!!!\n"); + printf("RECEIVE REGT UNPACK, size %ld!!!!!!!!!!!\n", size); struct opal_convertor_t *convertor = &(recvreq->req_recv.req_base.req_convertor); size_t pipeline_size = remote_handle->reg_data.pipeline_size; - uint32_t lindex = remote_handle->reg_data.lindex; - uint8_t pack_required = remote_handle->reg_data.pack_required; - uint8_t remote_device = remote_handle->reg_data.gpu_device; - uint8_t local_device = 0; - rc = mca_common_cuda_get_device(&local_device); printf("i receive pipeline %ld, lindex %d, pack_required %d, remote_device %d, local_device %d\n", pipeline_size, lindex, pack_required, remote_device, local_device); + + rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); return rc; @@ -1156,23 +1158,46 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, } else { convertor->gpu_buffer_ptr = remote_memory_address; } - mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, remote_memory_address, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex, remote_device, local_device); if (pack_required) { + mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, remote_memory_address, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex, remote_device, local_device); mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); + done = 0; + mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); + 
} else { + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data; + if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && remote_device != local_device) { + convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(size, 0); + mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, remote_memory_address, size); + iov.iov_base = convertor->gpu_buffer_ptr; + printf("start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, convertor->gpu_buffer_ptr, size); + } else { + iov.iov_base = convertor->gpu_buffer_ptr; + } + iov.iov_len = size; + max_data = size; + opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + done = 1; } - done = 0; - mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); } else { + printf("RECEIVE REGT CONTIGUOUS, size %ld !!!!!!!!!!!\n", size); recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; - rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, - "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag, - &done); - if (OPAL_SUCCESS != rc) { - /* Out of resources can be handled by upper layers. */ - if (OPAL_ERR_OUT_OF_RESOURCE != rc) { - opal_output(0, "Failed to cuMemcpy GPU memory, rc=%d", rc); + if (pack_required) { + mca_btl_smcuda_cuda_dt_unpack_clone(NULL, ep, local_address, local_handle, remote_memory_address, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, 0, lindex, 0, 0); + mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); + done = 0; + } else { + rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, + "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag, + &done); + if (OPAL_SUCCESS != rc) { + /* Out of resources can be handled by upper layers. 
*/ + if (OPAL_ERR_OUT_OF_RESOURCE != rc) { + opal_output(0, "Failed to cuMemcpy GPU memory, rc=%d", rc); + } + return rc; } - return rc; } } } diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 870301b5f9c..da2fc6bf6b3 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -879,25 +879,27 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, struct iovec iov; uint32_t iov_count = 1; size_t max_data; - struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; - if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { - convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(pipeline_size, 0); - mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, pipeline_size); - iov.iov_base = convertor->gpu_buffer_ptr; - printf("start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, pipeline_size); - - } else { - iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; - } - max_data = pipeline_size; - iov.iov_len = pipeline_size; - opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); - if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { - if (convertor->gpu_buffer_ptr != NULL) { - opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); - convertor->gpu_buffer_ptr = NULL; + struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; + if (convertor == NULL) { /* do not unpack */ + mca_common_cuda_memp2pcpy(my_cuda_dt_clone->local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, pipeline_size); + } else { /* unpack */ + if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { + 
convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(pipeline_size, 0); + mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, pipeline_size); + iov.iov_base = convertor->gpu_buffer_ptr; + printf("start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, pipeline_size); + } else { + iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; + } + max_data = pipeline_size; + iov.iov_len = pipeline_size; + opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { + if (convertor->gpu_buffer_ptr != NULL) { + opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + convertor->gpu_buffer_ptr = NULL; + } } - } } // MCA_BTL_SMCUDA_FRAG_RETURN(frag); @@ -923,12 +925,12 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; if (seq == -1) { - mca_btl_smcuda_send_cuda_unpack_sig(btl, my_cuda_dt_clone->endpoint, lindex, 0, -2); + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, 0, -2); if (convertor->gpu_buffer_ptr != NULL) { opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); convertor->gpu_buffer_ptr = NULL; } - mca_btl_smcuda_free_cuda_dt_pack_clone(my_cuda_dt_clone->endpoint, lindex); + mca_btl_smcuda_free_cuda_dt_pack_clone(endpoint, lindex); } else { struct iovec iov; int rc_dt = 0; From 3a6bdd9daa85bd966d857db6bbed6d0885ef82c9 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 17 Sep 2015 16:20:08 -0400 Subject: [PATCH 023/190] Fix pipeline bug --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 2 +- opal/mca/btl/smcuda/btl_smcuda.c | 43 +++++++-------------- opal/mca/btl/smcuda/btl_smcuda.h | 24 
+++--------- opal/mca/btl/smcuda/btl_smcuda_component.c | 44 +++++++++++++--------- 4 files changed, 48 insertions(+), 65 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index e0248bbc69e..05339d4f9d4 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -135,7 +135,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, return rc; } mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, lindex, 1, local_device); - mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, NULL, 0, lindex, 0, local_device); + mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, 0, lindex, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index ca314d30ebf..da940fafcf2 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1145,8 +1145,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, printf("RECEIVE REGT UNPACK, size %ld!!!!!!!!!!!\n", size); struct opal_convertor_t *convertor = &(recvreq->req_recv.req_base.req_convertor); - size_t pipeline_size = remote_handle->reg_data.pipeline_size; - printf("i receive pipeline %ld, lindex %d, pack_required %d, remote_device %d, local_device %d\n", pipeline_size, lindex, pack_required, remote_device, local_device); + // size_t pipeline_size = remote_handle->reg_data.pipeline_size; + printf("i receive lindex %d, pack_required %d, remote_device %d, local_device %d\n", lindex, pack_required, remote_device, local_device); rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { @@ -1159,10 +1159,10 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, convertor->gpu_buffer_ptr = remote_memory_address; } if (pack_required) { - 
mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, remote_memory_address, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex, remote_device, local_device); + mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + 0, lindex, remote_device, local_device); mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); done = 0; - mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); } else { struct iovec iov; uint32_t iov_count = 1; @@ -1184,7 +1184,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, printf("RECEIVE REGT CONTIGUOUS, size %ld !!!!!!!!!!!\n", size); recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; if (pack_required) { - mca_btl_smcuda_cuda_dt_unpack_clone(NULL, ep, local_address, local_handle, remote_memory_address, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, 0, lindex, 0, 0); + mca_btl_smcuda_cuda_dt_unpack_clone(NULL, ep, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + 0, lindex, 0, 0); mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); done = 0; } else { @@ -1294,7 +1295,7 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - int lindex, int pipeline_size, int seq) + int lindex, int packed_size, int seq) { mca_btl_smcuda_frag_t* frag; int rc; @@ -1311,7 +1312,7 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; cuda_dt_hdr.seq = seq; cuda_dt_hdr.lindex = lindex; - cuda_dt_hdr.pipeline_size = pipeline_size; + cuda_dt_hdr.packed_size = packed_size; memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); @@ -1321,7 
+1322,7 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - int lindex, int pipeline_size, int seq) + int lindex, int packed_size, int seq) { mca_btl_smcuda_frag_t* frag; int rc; @@ -1337,7 +1338,7 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; cuda_dt_hdr.seq = seq; cuda_dt_hdr.lindex = lindex; - cuda_dt_hdr.pipeline_size = pipeline_size; + cuda_dt_hdr.packed_size = packed_size; memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_PACK); @@ -1413,56 +1414,40 @@ void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *en void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, - struct mca_btl_base_registration_handle_t *local_handle, void *remote_gpu_address, - mca_btl_base_completion_fn_t cbfunc, - void *cbcontext, - void *cbdata, + mca_btl_base_descriptor_t *frag, size_t pipeline_size, int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_dt_pack_clone[lindex].convertor = convertor; // endpoint->smcuda_dt_pack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; endpoint->smcuda_dt_pack_clone[lindex].endpoint = endpoint; - endpoint->smcuda_dt_pack_clone[lindex].local_address = local_address; - endpoint->smcuda_dt_pack_clone[lindex].local_handle = local_handle; endpoint->smcuda_dt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_dt_pack_clone[lindex].cbfunc = cbfunc; - endpoint->smcuda_dt_pack_clone[lindex].cbcontext = cbcontext; - endpoint->smcuda_dt_pack_clone[lindex].cbdata = cbdata; endpoint->smcuda_dt_pack_clone[lindex].pipeline_size = pipeline_size; 
endpoint->smcuda_dt_pack_clone[lindex].lindex = lindex; endpoint->smcuda_dt_pack_clone[lindex].seq = -9; endpoint->smcuda_dt_pack_clone[lindex].remote_device = remote_device; endpoint->smcuda_dt_pack_clone[lindex].local_device = local_device; + endpoint->smcuda_dt_pack_clone[lindex].frag = frag; } void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, - struct mca_btl_base_registration_handle_t *local_handle, void *remote_gpu_address, - mca_btl_base_completion_fn_t cbfunc, - void *cbcontext, - void *cbdata, + mca_btl_base_descriptor_t *frag, size_t pipeline_size, int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_dt_unpack_clone[lindex].convertor = convertor; // endpoint->smcuda_dt_unpack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; endpoint->smcuda_dt_unpack_clone[lindex].endpoint = endpoint; - endpoint->smcuda_dt_unpack_clone[lindex].local_address = local_address; - endpoint->smcuda_dt_unpack_clone[lindex].local_handle = local_handle; endpoint->smcuda_dt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_dt_unpack_clone[lindex].cbfunc = cbfunc; - endpoint->smcuda_dt_unpack_clone[lindex].cbcontext = cbcontext; - endpoint->smcuda_dt_unpack_clone[lindex].cbdata = cbdata; endpoint->smcuda_dt_unpack_clone[lindex].pipeline_size = pipeline_size; endpoint->smcuda_dt_unpack_clone[lindex].lindex = lindex; endpoint->smcuda_dt_unpack_clone[lindex].seq = -9; endpoint->smcuda_dt_unpack_clone[lindex].remote_device = remote_device; endpoint->smcuda_dt_unpack_clone[lindex].local_device = local_device; + endpoint->smcuda_dt_unpack_clone[lindex].frag = frag; } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 478dd184d24..d8ef5ed29f6 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -515,31 +515,27 @@ enum ipcState { typedef struct 
{ int seq; int lindex; - int pipeline_size; + int packed_size; } cuda_dt_hdr_t; /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; struct mca_btl_base_endpoint_t *endpoint; - void *local_address; - struct mca_btl_base_registration_handle_t *local_handle; void *remote_gpu_address; - mca_btl_base_completion_fn_t cbfunc; - void *cbcontext; - void *cbdata; size_t pipeline_size; int lindex; int seq; uint8_t remote_device; uint8_t local_device; + mca_btl_base_descriptor_t *frag; } cuda_dt_clone_t; #define SMCUDA_DT_CLONE_SIZE 20 extern cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; -int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int pipeline_size, int seq); -int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int pipeline_size, int seq); +int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int packed_size, int seq); +int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int packed_size, int seq); int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor); int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq); int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex); @@ -550,22 +546,14 @@ void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endp void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, - struct mca_btl_base_registration_handle_t 
*local_handle, void *remote_gpu_address, - mca_btl_base_completion_fn_t cbfunc, - void *cbcontext, - void *cbdata, + mca_btl_base_descriptor_t *frag, size_t pipeline_size, int lindex, uint8_t remote_device, uint8_t local_device); void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, - struct mca_btl_base_registration_handle_t *local_handle, void *remote_gpu_address, - mca_btl_base_completion_fn_t cbfunc, - void *cbcontext, - void *cbdata, + mca_btl_base_descriptor_t *frag, size_t pipeline_size, int lindex, uint8_t remote_device, uint8_t local_device); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index da2fc6bf6b3..e4e1c280857 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -858,7 +858,7 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); int seq = cuda_dt_hdr.seq; int lindex = cuda_dt_hdr.lindex; - int pipeline_size = cuda_dt_hdr.pipeline_size; + size_t packed_size = cuda_dt_hdr.packed_size; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_dt_clone_t *my_cuda_dt_clone; @@ -870,29 +870,38 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); if (seq == -2) { - mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t)my_cuda_dt_clone->cbfunc; - cbfunc(btl, endpoint, my_cuda_dt_clone->local_address, my_cuda_dt_clone->local_handle, my_cuda_dt_clone->cbcontext, my_cuda_dt_clone->cbdata, OPAL_SUCCESS); + mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; + mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t) frag_recv->base.des_cbfunc; + cbfunc 
(btl, endpoint, frag_recv->segment.seg_addr.pval, frag_recv->local_handle, frag_recv->base.des_context, frag_recv->base.des_cbdata, OPAL_SUCCESS); + mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag_recv); mca_btl_smcuda_free_cuda_dt_unpack_clone(endpoint, lindex); } else if (seq == -1) { - mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, lindex, pipeline_size, -1); + mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, lindex, 0, -1); } else { struct iovec iov; uint32_t iov_count = 1; size_t max_data; struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; + if (my_cuda_dt_clone->pipeline_size == 0) { + my_cuda_dt_clone->pipeline_size = packed_size; + } + size_t pipeline_size = my_cuda_dt_clone->pipeline_size; if (convertor == NULL) { /* do not unpack */ - mca_common_cuda_memp2pcpy(my_cuda_dt_clone->local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, pipeline_size); + mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; + unsigned char *local_address = (unsigned char*)frag_recv->segment.seg_addr.pval; + printf("D2D local %p, remote %p, size %ld\n", local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address+seq*pipeline_size, packed_size); + mca_common_cuda_memp2pcpy(local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); } else { /* unpack */ if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(pipeline_size, 0); - mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, pipeline_size); + mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); iov.iov_base = convertor->gpu_buffer_ptr; - printf("start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + 
seq*pipeline_size, convertor->gpu_buffer_ptr, pipeline_size); + printf("start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, packed_size); } else { iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; } - max_data = pipeline_size; - iov.iov_len = pipeline_size; + max_data = packed_size; + iov.iov_len = packed_size; opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { if (convertor->gpu_buffer_ptr != NULL) { @@ -934,25 +943,26 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } else { struct iovec iov; int rc_dt = 0; - size_t pipeline_size = mca_btl_smcuda_component.cuda_dt_pipeline_size; - printf("Pipeline_size %ld\n", pipeline_size); + size_t packed_size = mca_btl_smcuda_component.cuda_dt_pipeline_size; + printf("Pipeline_size %ld\n", packed_size); uint32_t iov_count = 1; iov.iov_base = convertor->gpu_buffer_ptr; - iov.iov_len = pipeline_size; + iov.iov_len = packed_size; size_t max_data = 0; int seq = 0; /* the first pack here is used to get the correct size of pipeline_size */ /* because pack may not use the whole pipeline size */ rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - pipeline_size = max_data; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, pipeline_size, seq); + packed_size = max_data; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, packed_size, seq); while (rc_dt != 1) { - iov.iov_base += pipeline_size; + iov.iov_base += packed_size; seq ++; rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, pipeline_size, seq); + packed_size = max_data; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, packed_size, seq); } - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, pipeline_size, -1); + 
mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, 0, -1); } // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } From f86c81eb228a4d494f776cc504f82d3d4a8604f9 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 18 Sep 2015 00:39:46 -0400 Subject: [PATCH 024/190] now we are able to pack directly to remote buffer if receiver is contiguous --- opal/mca/btl/smcuda/btl_smcuda.c | 54 +++++++++++++------- opal/mca/btl/smcuda/btl_smcuda.h | 18 +++++-- opal/mca/btl/smcuda/btl_smcuda_component.c | 59 ++++++++++++++++++---- 3 files changed, 101 insertions(+), 30 deletions(-) diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index da940fafcf2..7dd56f6e612 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1140,6 +1140,11 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, uint32_t lindex = remote_handle->reg_data.lindex; uint8_t remote_device = remote_handle->reg_data.gpu_device; uint8_t local_device = 0; + rc = mca_common_cuda_get_device(&local_device); + if (rc != 0) { + opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); + return rc; + } if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; printf("RECEIVE REGT UNPACK, size %ld!!!!!!!!!!!\n", size); @@ -1148,11 +1153,6 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, // size_t pipeline_size = remote_handle->reg_data.pipeline_size; printf("i receive lindex %d, pack_required %d, remote_device %d, local_device %d\n", lindex, pack_required, remote_device, local_device); - rc = mca_common_cuda_get_device(&local_device); - if (rc != 0) { - opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); - return rc; - } if (remote_device != local_device && !OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { convertor->gpu_buffer_ptr = NULL; } else { @@ -1161,7 +1161,12 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, if 
(pack_required) { mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, remote_memory_address, (mca_btl_base_descriptor_t *)frag, 0, lindex, remote_device, local_device); - mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); + cuda_dt_hdr_t send_msg; + send_msg.lindex = lindex; + send_msg.packed_size = 0; + send_msg.seq = 0; + send_msg.msg_type = CUDA_PACK_TO_LOCAL; + mca_btl_smcuda_send_cuda_pack_sig(btl, ep, &send_msg); done = 0; } else { struct iovec iov; @@ -1184,9 +1189,28 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, printf("RECEIVE REGT CONTIGUOUS, size %ld !!!!!!!!!!!\n", size); recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; if (pack_required) { + cuda_dt_hdr_t send_msg; + send_msg.lindex = lindex; + send_msg.packed_size = 0; + if (remote_device == local_device && OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { + /* now we are able to let sender pack directly to my memory */ + mca_mpool_common_cuda_reg_t loc_reg; + mca_mpool_common_cuda_reg_t *loc_reg_ptr = &loc_reg; + cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL); + memcpy(send_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); + send_msg.seq = -9; + send_msg.msg_type = CUDA_PACK_TO_REMOTE; + send_msg.remote_address = local_address; + send_msg.remote_base = loc_reg.base.base; + mca_common_wait_stream_synchronize(&loc_reg); + printf("send r_addr %p, r_base %p\n", local_address, loc_reg.base.base); + } else { + send_msg.seq = 0; + send_msg.msg_type = CUDA_PACK_TO_LOCAL; + } mca_btl_smcuda_cuda_dt_unpack_clone(NULL, ep, remote_memory_address, (mca_btl_base_descriptor_t *)frag, 0, lindex, 0, 0); - mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); + mca_btl_smcuda_send_cuda_pack_sig(btl, ep, &send_msg); done = 0; } else { rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, @@ -1295,7 +1319,7 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b int 
mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - int lindex, int packed_size, int seq) + cuda_dt_hdr_t *send_msg) { mca_btl_smcuda_frag_t* frag; int rc; @@ -1310,19 +1334,16 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, /* Fill in fragment fields. */ frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; - cuda_dt_hdr.seq = seq; - cuda_dt_hdr.lindex = lindex; - cuda_dt_hdr.packed_size = packed_size; - memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); + memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_dt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); - printf("######## rank %d, send seq %d, endpoint %p\n", endpoint->my_smp_rank, seq, endpoint); + printf("######## rank %d, send seq %d, endpoint %p\n", endpoint->my_smp_rank, send_msg->seq, endpoint); return rc; } int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - int lindex, int packed_size, int seq) + cuda_dt_hdr_t *send_msg) { mca_btl_smcuda_frag_t* frag; int rc; @@ -1336,10 +1357,7 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, /* Fill in fragment fields. 
*/ frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; - cuda_dt_hdr.seq = seq; - cuda_dt_hdr.lindex = lindex; - cuda_dt_hdr.packed_size = packed_size; - memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); + memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_dt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_PACK); return rc; diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index d8ef5ed29f6..7616e16c720 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -41,7 +41,7 @@ #include "opal/mca/btl/btl.h" #include "opal/mca/common/sm/common_sm.h" -#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 0 +#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 1 BEGIN_C_DECLS @@ -514,10 +514,22 @@ enum ipcState { /* cuda datatype control message */ typedef struct { int seq; + int msg_type; int lindex; int packed_size; + void *remote_address; + void *remote_base; + uint64_t mem_handle[8]; } cuda_dt_hdr_t; +#define CUDA_UNPACK_FROM_REMOTE 0 +#define CUDA_PACK_COMPLETE 1 +#define CUDA_PACK_COMPLETE_ACK 2 +#define CUDA_PACK_CLEANUP 3 +#define CUDA_PACK_TO_LOCAL 4 +#define CUDA_PACK_TO_REMOTE 5 +#define CUDA_UNPACK_NO 6 + /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; @@ -534,8 +546,8 @@ typedef struct { #define SMCUDA_DT_CLONE_SIZE 20 extern cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; -int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int packed_size, int seq); -int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int packed_size, int seq); +int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_dt_hdr_t *send_msg); +int 
mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_dt_hdr_t *send_msg); int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor); int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq); int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index e4e1c280857..0243822d1d9 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -859,6 +859,7 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, int seq = cuda_dt_hdr.seq; int lindex = cuda_dt_hdr.lindex; size_t packed_size = cuda_dt_hdr.packed_size; + int msg_type = cuda_dt_hdr.msg_type; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_dt_clone_t *my_cuda_dt_clone; @@ -869,15 +870,20 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); - if (seq == -2) { + if (msg_type == CUDA_PACK_CLEANUP) { mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t) frag_recv->base.des_cbfunc; cbfunc (btl, endpoint, frag_recv->segment.seg_addr.pval, frag_recv->local_handle, frag_recv->base.des_context, frag_recv->base.des_cbdata, OPAL_SUCCESS); mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag_recv); mca_btl_smcuda_free_cuda_dt_unpack_clone(endpoint, lindex); - } else if (seq == -1) { - mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, lindex, 0, -1); - } else { + } else if (msg_type == CUDA_PACK_COMPLETE) { + cuda_dt_hdr_t send_msg; + send_msg.lindex = lindex; + 
send_msg.packed_size = 0; + send_msg.seq = -1; + send_msg.msg_type = CUDA_PACK_COMPLETE_ACK; + mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); + } else if (msg_type == CUDA_UNPACK_FROM_REMOTE){ struct iovec iov; uint32_t iov_count = 1; size_t max_data; @@ -924,8 +930,10 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); int seq = cuda_dt_hdr.seq; int lindex = cuda_dt_hdr.lindex; + int msg_type = cuda_dt_hdr.msg_type; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_dt_clone_t *my_cuda_dt_clone; + cuda_dt_hdr_t send_msg; /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; @@ -933,14 +941,35 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; - if (seq == -1) { - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, 0, -2); + send_msg.lindex = lindex; + if (msg_type == CUDA_PACK_COMPLETE_ACK) { + send_msg.packed_size = 0; + send_msg.seq = -2; + send_msg.msg_type = CUDA_PACK_CLEANUP; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); if (convertor->gpu_buffer_ptr != NULL) { opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); convertor->gpu_buffer_ptr = NULL; } mca_btl_smcuda_free_cuda_dt_pack_clone(endpoint, lindex); } else { + mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; + if (msg_type == CUDA_PACK_TO_REMOTE) { /* receiver is contiguous, and ask me to pack directly to his gpu memory */ + opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + mca_mpool_common_cuda_reg_t rget_reg; + rget_reg_ptr= &rget_reg; + memset(&rget_reg, 0, sizeof(rget_reg)); + memcpy(rget_reg.data.memHandle, cuda_dt_hdr.mem_handle, 
sizeof(cuda_dt_hdr.mem_handle)); + cuda_openmemhandle(NULL, 0, (mca_mpool_base_registration_t *)&rget_reg, NULL); + mca_common_wait_stream_synchronize(&rget_reg); + size_t offset = (size_t) ((intptr_t) cuda_dt_hdr.remote_address - (intptr_t) cuda_dt_hdr.remote_base); + unsigned char *remote_memory_address = (unsigned char *)rget_reg_ptr->base.alloc_base + offset; + convertor->gpu_buffer_ptr = remote_memory_address; + printf("remote_memory_address $$$$$$ %p, r_addr %p, r_base %p\n", remote_memory_address, cuda_dt_hdr.remote_address, cuda_dt_hdr.remote_base); + send_msg.msg_type = CUDA_UNPACK_NO; + } else { + send_msg.msg_type = CUDA_UNPACK_FROM_REMOTE; + } struct iovec iov; int rc_dt = 0; size_t packed_size = mca_btl_smcuda_component.cuda_dt_pipeline_size; @@ -954,15 +983,27 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, /* because pack may not use the whole pipeline size */ rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); packed_size = max_data; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, packed_size, seq); + send_msg.packed_size = packed_size; + send_msg.seq = seq; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); while (rc_dt != 1) { iov.iov_base += packed_size; seq ++; rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); packed_size = max_data; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, packed_size, seq); + send_msg.packed_size = packed_size; + send_msg.seq = seq; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + } + + send_msg.packed_size = 0; + send_msg.seq = -1; + send_msg.msg_type = CUDA_PACK_COMPLETE; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + + if (rget_reg_ptr != NULL) { /* close memhandle */ + cuda_closememhandle(NULL, (mca_mpool_base_registration_t *)rget_reg_ptr); } - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, 0, -1); } // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } From 
6ae39b2d8748f4f8980b7de685e93c2d9228057f Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 29 Sep 2015 17:12:40 -0400 Subject: [PATCH 025/190] add ddt_benchmark --- test/datatype/ddt_benchmark.c | 1184 +++++++++++++++++++++++++++++++++ 1 file changed, 1184 insertions(+) create mode 100644 test/datatype/ddt_benchmark.c diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c new file mode 100644 index 00000000000..860e9b87c94 --- /dev/null +++ b/test/datatype/ddt_benchmark.c @@ -0,0 +1,1184 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2006 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sun Microsystems Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "ddt_lib.h" +#include "opal/runtime/opal.h" +#include "opal/datatype/opal_convertor.h" +#include +#include +#ifdef HAVE_SYS_TIME_H +#include +#endif +#include +#include + +#define DDT_TEST_CUDA +#define CUDA_MEMCPY_2D_D2H + + +#include +#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/runtime/opal_params.h" +#define CONVERTOR_CUDA 0x00400000 + + +/* Compile with: +mpicc -DHAVE_CONFIG_H -I. -I../../include -I../../../ompi-trunk/include -I../.. 
-I../../include -I../../../ompi-trunk/opal -I../../../ompi-trunk/orte -I../../../ompi-trunk/ompi -g ddt_test.c -o ddt_test +*/ + +#define TIMER_DATA_TYPE struct timeval +#define GET_TIME(TV) gettimeofday( &(TV), NULL ) +#define ELAPSED_TIME(TSTART, TEND) (((TEND).tv_sec - (TSTART).tv_sec) * 1000000 + ((TEND).tv_usec - (TSTART).tv_usec)) + +#define DUMP_DATA_AFTER_COMMIT 0x00000001 +#define CHECK_PACK_UNPACK 0x00000002 + +uint32_t remote_arch = 0xffffffff; + +static int test_upper( unsigned int length ) +{ + double *mat1, *mat2, *inbuf; + ompi_datatype_t *pdt; + opal_convertor_t * pConv; + char *ptr; + int rc; + unsigned int i, j, iov_count, split_chunk, total_length; + size_t max_data; + struct iovec a; + TIMER_DATA_TYPE start, end; + long total_time; + + printf( "test upper matrix\n" ); + pdt = upper_matrix( length ); + /*dt_dump( pdt );*/ + + mat1 = malloc( length * length * sizeof(double) ); + init_random_upper_matrix( length, mat1 ); + mat2 = calloc( length * length, sizeof(double) ); + + total_length = length * (length + 1) * ( sizeof(double) / 2); + inbuf = (double*)malloc( total_length ); + ptr = (char*)inbuf; + /* copy upper matrix in the array simulating the input buffer */ + for( i = 0; i < length; i++ ) { + uint32_t pos = i * length + i; + for( j = i; j < length; j++, pos++ ) { + *inbuf = mat1[pos]; + inbuf++; + } + } + inbuf = (double*)ptr; + pConv = opal_convertor_create( remote_arch, 0 ); + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( pConv, &(pdt->super), 1, mat2 ) ) { + printf( "Cannot attach the datatype to a convertor\n" ); + return OMPI_ERROR; + } + + cudaDeviceSynchronize(); + + GET_TIME( start ); + split_chunk = (length + 1) * sizeof(double); + /* split_chunk = (total_length + 1) * sizeof(double); */ + for( i = total_length; i > 0; ) { + if( i <= split_chunk ) { /* equal test just to be able to set a breakpoint */ + split_chunk = i; + } + a.iov_base = ptr; + a.iov_len = split_chunk; + iov_count = 1; + max_data = split_chunk; + 
opal_convertor_unpack( pConv, &a, &iov_count, &max_data ); + ptr += max_data; + i -= max_data; + if( mat2[0] != inbuf[0] ) assert(0); + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "complete unpacking in %ld microsec\n", total_time ); + free( inbuf ); + rc = check_diag_matrix( length, mat1, mat2 ); + free( mat1 ); + free( mat2 ); + + /* test the automatic destruction pf the data */ + ompi_datatype_destroy( &pdt ); assert( pdt == NULL ); + + OBJ_RELEASE( pConv ); + return rc; +} + +/** + * Computing the correct buffer length for moving a multiple of a datatype + * is not an easy task. Define a function to centralize the complexity in a + * single location. + */ +static size_t compute_buffer_length(ompi_datatype_t* pdt, int count) +{ + MPI_Aint extent, lb, true_extent, true_lb; + size_t length; + + ompi_datatype_get_extent(pdt, &lb, &extent); + ompi_datatype_get_true_extent(pdt, &true_lb, &true_extent); (void)true_lb; + length = true_lb + true_extent + (count - 1) * extent; + + return length; +} + +/** + * Conversion function. They deal with data-types in 3 ways, always making local copies. + * In order to allow performance testings, there are 3 functions: + * - one copying directly from one memory location to another one using the + * data-type copy function. + * - one which use a 2 convertors created with the same data-type + * - and one using 2 convertors created from different data-types. 
+ * + */ +static int local_copy_ddt_count( ompi_datatype_t* pdt, int count ) +{ + void *pdst, *psrc; + TIMER_DATA_TYPE start, end; + long total_time; + size_t length; + + length = compute_buffer_length(pdt, count); + + pdst = malloc(length); + psrc = malloc(length); + + for( size_t i = 0; i < length; i++ ) + ((char*)psrc)[i] = i % 128 + 32; + memset(pdst, 0, length); + + cache_trash(); /* make sure the cache is useless */ + + GET_TIME( start ); + if( OMPI_SUCCESS != ompi_datatype_copy_content_same_ddt( pdt, count, pdst, psrc ) ) { + printf( "Unable to copy the datatype in the function local_copy_ddt_count." + " Is the datatype committed ?\n" ); + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "direct local copy in %ld microsec\n", total_time ); + free(pdst); + free(psrc); + + return OMPI_SUCCESS; +} + +static void fill_vectors(double* vp, int itera, int contig, int gap) +{ + int i, j; + for (i = 0; i < itera-1; i++ ){ + for (j = i*gap; j < (i+1)*gap; j++) { + if (j >= i*gap && j < i*gap+contig) { + vp[j] = 1.1; + } else { + vp[j] = -1.0; + } + } + } + for (i = (itera-1)*gap; i < (itera-1)*gap+contig; i++) { + vp[i] = 1.1; + } + + // printf("vector generated:\n"); + // for (i = 0; i < (itera-1)*gap+contig; i++) { + // printf("%1.f ", vp[i]); + // } + // printf("\n"); +} + +static void verify_vectors(double *vp, int itera, int contig, int gap) +{ + int i, j; + int error = 0; + for (i = 0; i < itera-1; i++) { + for (j = i*gap; j < (i+1)*gap; j++) { + if (j >= i*gap && j < i*gap+contig) { + if (vp[j] != 1.1) { + error ++; + } + } + } + } + for (i = (itera-1)*gap; i < (itera-1)*gap+contig; i++) { + if (vp[i] != 1.1) { + error ++; + } + } + // printf("vector received:\n"); + // for (i = 0; i < (itera-1)*gap+contig; i++) { + // printf("%1.f ", vp[i]); + // } + if (error != 0) { + printf("%d error is found\n", error); + } else { + printf("no error is found\n"); + } +} + +static int +vector_ddt( ompi_datatype_t* send_type, int send_count, + 
ompi_datatype_t* recv_type, int recv_count, + int chunk, int itera, int contig, int gap ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *psrc_host = NULL, *pdst_host = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0, push_time = 0, pop_time = 0, pack_time = 0; + size_t slength, rlength; + + rlength = compute_buffer_length(recv_type, recv_count); + slength = compute_buffer_length(send_type, send_count); + + cudaSetDevice(0); + + cudaError_t error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + printf("cudamalloc pdst %p\n", pdst); + + // error = cudaHostAlloc((void **)&ptemp, chunk, cudaHostAllocMapped); + error = cudaMallocHost((void **)&ptemp, chunk); + //ptemp = malloc(chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + + error = cudaMallocHost((void **)&psrc_host, slength); + error = cudaMallocHost((void **)&pdst_host, rlength); + // psrc_host = malloc(slength); + // pdst_host = malloc(rlength); + printf("cudamallochost phost \n"); + + memset(psrc_host, 0, slength); + memset(pdst_host, 0, rlength); + if (itera > 0) { + fill_vectors((double *)psrc_host, itera, contig, gap); + } + cudaMemcpy(psrc, psrc_host, slength, cudaMemcpyHostToDevice); + + + send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= 
CONVERTOR_CUDA; + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } +#else + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc_host ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } +#endif + recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } +#else + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst_host ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } +#endif + + cache_trash(); /* make sure the cache is useless */ + cudaDeviceSynchronize(); + + GET_TIME( start ); +#if !defined (DDT_TEST_CUDA) + GET_TIME( unpack_start ); + cudaMemcpy(psrc_host, psrc, slength, cudaMemcpyDeviceToHost); + GET_TIME( unpack_end ); + push_time = ELAPSED_TIME( unpack_start, unpack_end ); +#endif + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor_2datatypes\n", + (done1 ? "finish" : "not finish"), + (done2 ? 
"finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + // done1 = 1; + } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } +#if !defined (DDT_TEST_CUDA) + GET_TIME( unpack_start ); + cudaMemcpy(pdst, pdst_host, rlength, cudaMemcpyHostToDevice); + GET_TIME( unpack_end ); + pop_time = ELAPSED_TIME( unpack_start, unpack_end ); +#endif + + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + pack_time = total_time - unpack_time - push_time - pop_time; + printf( "copying different data-types using convertors in %ld microsec, p&up in %ld \n", total_time, pack_time+unpack_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec], push in %ld microsec, pop in %ld microsec\n", unpack_time, + pack_time, push_time, pop_time); + + memset(pdst_host, 0, slength); + cudaMemcpy(pdst_host, pdst, rlength, cudaMemcpyDeviceToHost); + if (itera > 0) { + verify_vectors((double *)pdst_host, itera, contig, gap); + } + + clean_and_return: + if( send_convertor != NULL ) { + OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); + } + if( recv_convertor != NULL ) { + OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); + } + + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != psrc_host ) cudaFreeHost( psrc_host ); + if( NULL != pdst_host ) cudaFreeHost( pdst_host ); + + return OMPI_SUCCESS; +} + +static int +vector_ddt_2d( ompi_datatype_t* send_type, int send_count, + ompi_datatype_t* recv_type, int recv_count, + int chunk, int itera, int contig, int gap ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, 
*psrc_host = NULL, *pdst_host = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0, push_time = 0, pop_time = 0, pack_time = 0; + size_t slength, rlength; + + rlength = compute_buffer_length(recv_type, recv_count); + slength = compute_buffer_length(send_type, send_count); + + cudaSetDevice(2); + + cudaError_t error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + + error = cudaMallocHost((void **)&psrc_host, slength); + error = cudaMallocHost((void **)&pdst_host, rlength); + printf("cudamallochost phost \n"); + + memset(psrc_host, 0, slength); + memset(pdst_host, 0, rlength); + if (itera > 0) { + fill_vectors((double *)psrc_host, itera, contig, gap); + } + cudaMemcpy(psrc, psrc_host, slength, cudaMemcpyHostToDevice); + + + GET_TIME( start ); + //cudaMemcpy2D(pdst, contig*sizeof(double), psrc, gap*sizeof(double), contig*sizeof(double), itera, cudaMemcpyDeviceToDevice); + cudaMemcpy2D(psrc_host, contig*sizeof(double), psrc, gap*sizeof(double), contig*sizeof(double), itera, cudaMemcpyDeviceToHost); + GET_TIME( end ); + pop_time = ELAPSED_TIME( start, end ); + + GET_TIME( start ); + cudaMemcpy2D(pdst, gap*sizeof(double), psrc_host, 
contig*sizeof(double), contig*sizeof(double), itera, cudaMemcpyHostToDevice); + GET_TIME( end ); + push_time = ELAPSED_TIME( start, end ); + + printf( "MEMCPY2D D2H %ld microseconds, H2D %ld microseconds, size %ld\n", pop_time, push_time, contig*sizeof(double)*itera); + + memset(pdst_host, 0, slength); + cudaMemcpy(pdst_host, pdst, rlength, cudaMemcpyDeviceToHost); + if (itera > 0) { + verify_vectors((double *)pdst_host, itera, contig, gap); + } + /* D2D D2H */ + if (itera > 0) { + fill_vectors((double *)psrc_host, itera, contig, gap); + } + cudaMemcpy(psrc, psrc_host, slength, cudaMemcpyHostToDevice); + + + GET_TIME( start ); + cudaMemcpy2D(pdst, contig*sizeof(double), psrc, gap*sizeof(double), contig*sizeof(double), itera, cudaMemcpyDeviceToDevice); + GET_TIME( end ); + pack_time = ELAPSED_TIME( start, end ); + + GET_TIME( start ); + cudaMemcpy(psrc_host, pdst, contig*sizeof(double)*itera, cudaMemcpyDeviceToHost); + GET_TIME( end ); + pop_time = ELAPSED_TIME( start, end ); + + GET_TIME( start ); + cudaMemcpy(psrc, psrc_host, contig*sizeof(double)*itera, cudaMemcpyHostToDevice); + GET_TIME( end ); + push_time = ELAPSED_TIME( start, end ); + + GET_TIME( start ); + cudaMemcpy2D(pdst, gap*sizeof(double), psrc, contig*sizeof(double), contig*sizeof(double), itera, cudaMemcpyDeviceToDevice); + GET_TIME( end ); + unpack_time = ELAPSED_TIME( start, end ); + + printf( "MEMCPY2D D2H %ld microseconds, H2D %ld microseconds, pack in %ld, unpack in %ld, size %lu \n", pop_time, push_time, pack_time, unpack_time, contig*sizeof(double)*itera); + + memset(pdst_host, 0, slength); + cudaMemcpy(pdst_host, pdst, rlength, cudaMemcpyDeviceToHost); + if (itera > 0) { + verify_vectors((double *)pdst_host, itera, contig, gap); + } + + + clean_and_return: + if( send_convertor != NULL ) { + OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); + } + if( recv_convertor != NULL ) { + OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); + } + + if( NULL != pdst ) cudaFree( 
pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != psrc_host ) cudaFreeHost( psrc_host ); + if( NULL != pdst_host ) cudaFreeHost( pdst_host ); + + return OMPI_SUCCESS; +} + + +static int +local_copy_with_convertor_2datatypes_struct( ompi_datatype_t* send_type, int send_count, + ompi_datatype_t* recv_type, int recv_count, + int chunk, int count) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0; + size_t slength, rlength; + + rlength = compute_buffer_length(recv_type, recv_count); + slength = compute_buffer_length(send_type, send_count); + +#if defined (DDT_TEST_CUDA) + cudaSetDevice(0); +#endif + +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, slength); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc( rlength ); + psrc = malloc( slength ); + ptemp = malloc( chunk ); + + /* initialize 
the buffers to prevent valgrind from complaining */ + for( size_t i = 0; i < slength; i++ ) + ((char*)psrc)[i] = i % 128 + 32; + memset(pdst, 0, rlength); +#endif + +#if defined (DDT_TEST_CUDA) + + cudaMemcpy(psrc, phost, slength, cudaMemcpyHostToDevice); +#else + +#endif + + send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + cache_trash(); /* make sure the cache is useless */ + + GET_TIME( start ); + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor_2datatypes\n", + (done1 ? "finish" : "not finish"), + (done2 ? 
"finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "copying different data-types using convertors in %ld microsec\n", total_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, + total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, slength); + cudaMemcpy(phost, pdst, rlength, cudaMemcpyDeviceToHost); + +#else + +#endif + clean_and_return: + if( send_convertor != NULL ) { + OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); + } + if( recv_convertor != NULL ) { + OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); + } +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else + if( NULL != pdst ) free( pdst ); + if( NULL != psrc ) free( psrc ); + if( NULL != ptemp ) free( ptemp ); +#endif + return OMPI_SUCCESS; +} + + +static void fill_upper_matrix(void *matt, int msize) +{ + int i, j, start, end; + int *blklens, *displs; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + blklens = (int *)malloc(sizeof(int)*msize); + displs = (int *)malloc(sizeof(int)*msize); + for (i = 0; i < msize; i++) { + blklens[i] = msize - i; + displs[i] = i*msize + i; + } + for (i = 0; i < msize; i++) { + start = displs[i]; + 
end = start + blklens[i]; + for (j = start; j < end; j++) { +#if defined (TEST_CHAR) + mat[j] = 'a'; +#else + mat[j] = 0.0 + i; +#endif + } + } + free(blklens); + free(displs); + + // printf("matrix generate\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } +} + +static void verify_mat_result(void *matt, int msize) +{ + int *blklens, *displs; + int i, j, error = 0; + int start, end; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + blklens = (int *)malloc(sizeof(int)*msize); + displs = (int *)malloc(sizeof(int)*msize); + for (i = 0; i < msize; i++) { + blklens[i] = msize - i; + displs[i] = i*msize + i; + } + for (i = 0; i < msize; i++) { + start = displs[i]; + end = start + blklens[i]; + for (j = start; j < end; j++) { +#if defined (TEST_CHAR) + if (mat[j] != 'a') { +#else + if (mat[j] != (0.0+i)) { +#endif + error ++; + } + } + } + free(blklens); + free(displs); + + // printf("matrix received\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } + + if (error != 0) { + printf("error is found %d\n", error); + } else { + printf("no error is found\n"); + } +} + +static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk, int msize ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data, dt_length; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0; + + dt_length = compute_buffer_length(pdt, count); + printf("length %lu\n", dt_length); + +#if defined (DDT_TEST_CUDA) + 
cudaSetDevice(0); +#endif + +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, dt_length); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, dt_length); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, dt_length); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc(dt_length); + psrc = malloc(dt_length); + ptemp = malloc(chunk); + + for( int i = 0; i < length; ((char*)psrc)[i] = i % 128 + 32, i++ ); + memset( pdst, 0, length ); +#endif + +#if defined (DDT_TEST_CUDA) + if (msize > 0) { + fill_upper_matrix(phost, msize); + } + cudaMemcpy(psrc, phost, dt_length, cudaMemcpyHostToDevice); +#else + if (msize > 0) { + fill_upper_matrix(psrc, msize); + } +#endif + + send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(pdt->super), count, psrc ) ) { + printf( "Unable to create the send convertor. 
Is the datatype committed ?\n" ); + goto clean_and_return; + } + + recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(pdt->super), count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + cache_trash(); /* make sure the cache is useless */ + cudaDeviceSynchronize(); + + GET_TIME( start ); + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor\n", + (done1 ? "finish" : "not finish"), + (done2 ? "finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "copying same data-type using convertors in %ld microsec\n", total_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, + total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, dt_length); + cudaMemcpy(phost, pdst, dt_length, cudaMemcpyDeviceToHost); + if (msize > 0) { + verify_mat_result(phost, msize); + } +#else + if (msize > 0) { + verify_mat_result(pdst, msize); + } +#endif +clean_and_return: + if( NULL != send_convertor ) OBJ_RELEASE( send_convertor ); + if( NULL != recv_convertor ) OBJ_RELEASE( recv_convertor ); + +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) 
cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else + if( NULL != pdst ) free( pdst ); + if( NULL != psrc ) free( psrc ); + if( NULL != ptemp ) free( ptemp ); +#endif + return OMPI_SUCCESS; +} + +/** + * Main function. Call several tests and print-out the results. It try to stress the convertor + * using difficult data-type constructions as well as strange segment sizes for the conversion. + * Usually, it is able to detect most of the data-type and convertor problems. Any modifications + * on the data-type engine should first pass all the tests from this file, before going into other + * tests. + */ +int main( int argc, char* argv[] ) +{ + ompi_datatype_t *pdt, *pdt1, *pdt2, *pdt3; + int rc, length = 500, i; + +#if defined (DDT_TEST_CUDA) + opal_cuda_support = 1; +#endif + opal_init_util(&argc, &argv); +#if defined (DDT_TEST_CUDA) + // mca_common_cuda_stage_one_init(); +#endif + ompi_datatype_init(); + + /** + * By default simulate homogeneous architectures. 
+ */ + remote_arch = opal_local_arch; +/* printf( "\n\n#\n * TEST INVERSED VECTOR\n #\n\n" ); + pdt = create_inversed_vector( &ompi_mpi_int.dt, 10 ); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 100); + local_copy_with_convertor(pdt, 100, 956); + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( "\n\n#\n * TEST STRANGE DATATYPE\n #\n\n" ); + pdt = create_strange_dt(); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 1); + local_copy_with_convertor(pdt, 1, 956); + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); +*/ + printf("\n TEST STRUCT \n"); + pdt = create_struct_type(5); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 1; i <= 1; i++) { + // local_copy_with_convertor_2datatypes_struct(pdt, 1, pdt, 1, 1024*1024*100, 5); + } + } + + printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); + int mat_size = 500; + for (mat_size = 500; mat_size <= 6000; mat_size +=500) { + pdt = upper_matrix(mat_size); + printf("----matrix size %d-----\n", mat_size); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 1; i <= 1; i++) { + local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + + int packed_size = 256; + int blk_len = 4; + int blk_count; + + while (packed_size <= 8388608) { + blk_count = packed_size / blk_len / sizeof(double); + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type packed size %d, blk %d, count %d\n", packed_size, blk_len, blk_count ); + pdt = create_vector_type( MPI_DOUBLE, blk_count, blk_len, 128+blk_len ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*30, blk_count, blk_len, 128+blk_len ); + } + } + packed_size *= 2; + } + + packed_size = 256; + blk_len = 16; + while (packed_size <= 8388608) { + blk_count = packed_size / blk_len / sizeof(double); + printf( ">>--------------------------------------------<<\n" ); + 
printf( "Vector data-type packed size %d, blk %d, count %d\n", packed_size, blk_len, blk_count ); + pdt = create_vector_type( MPI_DOUBLE, blk_count, blk_len, 128+blk_len ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*30, blk_count, blk_len, 128+blk_len ); + } + } + packed_size *= 2; + } + + packed_size = 1024; + blk_len = 64; + while (packed_size <= 8388608) { + blk_count = packed_size / blk_len / sizeof(double); + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type packed size %d, blk %d, count %d\n", packed_size, blk_len, blk_count ); + pdt = create_vector_type( MPI_DOUBLE, blk_count, blk_len, 128+blk_len ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*30, blk_count, blk_len, 128+blk_len ); + // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*30, blk_count, blk_len, 128+blk_len ); + } + } + packed_size *= 2; + } + + + for (blk_len = 4; blk_len <= 64; blk_len += 2) { + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (1024 times %d double stride 512)\n", blk_len ); + pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len+128); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*20 , 1000, blk_len, blk_len+128); + // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + + + for (blk_len = 4; blk_len <= 64; blk_len += 2) { + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (60000 times %d double stride 512)\n", blk_len ); + pdt = create_vector_type( MPI_DOUBLE, 8000, blk_len, blk_len+128); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , 8000, blk_len, blk_len+128); + // vector_ddt_2d( pdt, 1, pdt, 1, 
1024*1024*100 , 8192, blk_len, blk_len+128); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + + /* + for (blk_len = 4; blk_len <= 32; blk_len += 1) { + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (4000 times %d double stride 512)\n", blk_len ); + pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len+64); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , 1000, blk_len, blk_len+64); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + */ + + printf( "Vector data-type (4000 times 256 double stride 384)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 4000, 256, 384 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*200, 4000, 256, 384 ); + } + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( "Vector data-type (4000 times 128 double stride 256)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 4000, 128, 256 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, 
pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*5 ); + } + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( "Vector data-type (2000 times 3 double stride 4)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 2000, 3, 4 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*4 ); + } + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + /* + printf( ">>--------------------------------------------<<\n" ); + pdt = test_struct_char_double(); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 4500); + local_copy_with_convertor( pdt, 4500, 12 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 12 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( ">>--------------------------------------------<<\n" ); + pdt = test_create_twice_two_doubles(); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 4500); + local_copy_with_convertor( pdt, 4500, 12 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 12 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( ">>--------------------------------------------<<\n" ); + pdt = 
test_create_blacs_type(); + if( outputFlags & CHECK_PACK_UNPACK ) { + ompi_datatype_dump( pdt ); + local_copy_ddt_count(pdt, 2); + local_copy_ddt_count(pdt, 4500); + local_copy_with_convertor( pdt, 4500, 956 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 956 ); + local_copy_with_convertor( pdt, 4500, 16*1024 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 16*1024 ); + local_copy_with_convertor( pdt, 4500, 64*1024 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 64*1024 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( ">>--------------------------------------------<<\n" ); + pdt1 = test_create_blacs_type1( &ompi_mpi_int.dt ); + pdt2 = test_create_blacs_type2( &ompi_mpi_int.dt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_with_convertor_2datatypes( pdt1, 1, pdt2, 1, 100 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt1 ); assert( pdt1 == NULL ); + OBJ_RELEASE( pdt2 ); assert( pdt2 == NULL ); +*/ + /* clean-ups all data allocations */ + ompi_datatype_finalize(); + + return OMPI_SUCCESS; +} From 25ead9bd909a3980a24f7706947f0a5c240f32ac Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 1 Oct 2015 23:00:08 -0400 Subject: [PATCH 026/190] modify for matrix transpose --- opal/datatype/cuda/opal_datatype_cuda.cuh | 6 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 227 ++++++++++++++-- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 236 ++++++++++++++++- opal/datatype/opal_datatype_pack.c | 3 +- opal/datatype/opal_datatype_unpack.c | 3 +- opal/mca/btl/smcuda/btl_smcuda_component.c | 2 - test/datatype/ddt_benchmark.c | 244 +++++++++++++++++- 7 files changed, 689 insertions(+), 32 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index b770f136969..436eaa9aec3 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ 
b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -85,6 +85,12 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, unsigned char** SOURCE, unsigned char** DESTINATION, size_t* SPACE ); + +void unpack_predefined_data_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); void opal_cuda_sync_device(void); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 250e3e253e3..1268280fab6 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -266,11 +266,13 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ /* should not go into here */ - pStack--; - pConvertor->stack_pos--; - pos_desc --; - pElem = &(description[pos_desc]); - count_desc = count_desc_tmp; + pack_predefined_data_cuda( pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local ); + if( 0 == count_desc ) { /* completed */ + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ @@ -327,8 +329,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert pStack->disp + local_disp); pos_desc++; update_loop_description: /* update the current state */ - // conv_ptr = pConvertor->pBaseBuf + pStack->disp; - count_desc_tmp = count_desc; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } @@ -349,6 +350,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert printf( "[Timing]: DtoH memcpy in %ld microsec, 
transfer required %d\n", total_time, transfer_required ); #endif } + cudaDeviceSynchronize(); *max_data = total_packed; pConvertor->bConverted += total_packed; /* update the already converted bytes */ *out_size = iov_count; @@ -370,6 +372,205 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert return 0; } +// int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, +// struct iovec* iov, +// uint32_t* out_size, +// size_t* max_data ) +// { +// dt_stack_t* pStack; /* pointer to the position on the stack */ +// uint32_t pos_desc; /* actual position in the description of the derived datatype */ +// uint32_t count_desc; /* the number of items already done in the actual pos_desc */ +// size_t total_packed = 0; /* total amount packed this time */ +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// const opal_datatype_t *pData = pConvertor->pDesc; +// unsigned char *conv_ptr, *iov_ptr; +// size_t iov_len_local; +// uint32_t iov_count; +// uint8_t transfer_required; +// uint8_t free_required; +// uint32_t count_desc_tmp; +// +// #if defined(OPAL_DATATYPE_CUDA_TIMING) +// TIMER_DATA_TYPE start, end, start_total, end_total; +// long total_time; +// #endif +// +// DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", +// (void*)pConvertor, (void*)pConvertor->pBaseBuf, +// iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ); +// +// description = pConvertor->use_desc->desc; +// +// /* For the first step we have to add both displacement to the source. After in the +// * main while loop we will set back the conv_ptr to the correct value. 
This is +// * due to the fact that the convertor can stop in the middle of a data with a count +// */ +// pStack = pConvertor->pStack + pConvertor->stack_pos; +// pos_desc = pStack->index; +// conv_ptr = pConvertor->pBaseBuf + pStack->disp; +// count_desc = (uint32_t)pStack->count; +// pStack--; +// pConvertor->stack_pos--; +// pElem = &(description[pos_desc]); +// +// DT_CUDA_DEBUG( opal_cuda_output( 1, "pack start pos_desc %d count_desc %d disp %ld\n" +// "stack_pos %d pos_desc %d count_desc %d disp %ld\n", +// pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), +// pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); +// +// +// for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { +// if ((iov[iov_count].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { +// if (iov[iov_count].iov_len == 0) { +// iov_len_local = DT_CUDA_BUFFER_SIZE; +// } else { +// iov_len_local = iov[iov_count].iov_len; +// } +// +// if (iov[iov_count].iov_base == NULL) { +// iov[iov_count].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); +// iov_ptr = (unsigned char *)iov[iov_count].iov_base; +// pConvertor->gpu_buffer_ptr = iov_ptr; +// free_required = 1; +// } else { +// iov_ptr = (unsigned char *)iov[iov_count].iov_base; +// free_required = 0; +// } +// transfer_required = 0; +// } else { +// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { +// pConvertor->gpu_buffer_ptr = NULL; +// transfer_required = 0; +// free_required = 0; +// iov_ptr = (unsigned char*)iov[iov_count].iov_base; +// iov_len_local = iov[iov_count].iov_len; +// } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE){ +// iov_len_local = iov[iov_count].iov_len; +// if (pConvertor->gpu_buffer_ptr == NULL) { +// pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); +// } +// transfer_required = 0; +// free_required = 1; +// iov_ptr = (unsigned char*)iov[iov_count].iov_base; 
+// } else { +// iov_len_local = iov[iov_count].iov_len; +// if (pConvertor->gpu_buffer_ptr == NULL) { +// pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); +// } +// transfer_required = 1; +// free_required = 1; +// iov_ptr = pConvertor->gpu_buffer_ptr; +// } +// } +// while( 1 ) { +// while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { +// /* now here we have a basic datatype */ +// /* should not go into here */ +// pStack--; +// pConvertor->stack_pos--; +// pos_desc --; +// pElem = &(description[pos_desc]); +// count_desc = count_desc_tmp; +// goto complete_loop; +// } +// if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ +// DT_CUDA_DEBUG( opal_cuda_output( 2, "pack end_loop count %d stack_pos %d" +// " pos_desc %d disp %ld space %lu\n", +// (int)pStack->count, pConvertor->stack_pos, +// pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); +// if( --(pStack->count) == 0 ) { /* end of loop */ +// if( 0 == pConvertor->stack_pos ) { +// /* we lie about the size of the next element in order to +// * make sure we exit the main loop. 
+// */ +// *out_size = iov_count; +// goto complete_loop; /* completed */ +// } +// pConvertor->stack_pos--; +// pStack--; +// pos_desc++; +// } else { +// pos_desc = pStack->index + 1; +// if( pStack->index == -1 ) { +// pStack->disp += (pData->ub - pData->lb); +// } else { +// assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); +// pStack->disp += description[pStack->index].loop.extent; +// } +// } +// conv_ptr = pConvertor->pBaseBuf + pStack->disp; +// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +// DT_CUDA_DEBUG( opal_cuda_output( 2, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", +// (int)pStack->count, pConvertor->stack_pos, pos_desc, +// count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); +// } +// if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { +// OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; +// if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { +// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { +// pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); +// } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { +// pack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); +// } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE) { +// pack_contiguous_loop_cuda_pipeline(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local, pConvertor->gpu_buffer_ptr); +// } else { +// pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); +// } +// if( 0 == count_desc ) { /* completed */ +// pos_desc += pElem->loop.items + 1; +// goto update_loop_description; +// } +// /* Save the stack with the correct last_count value. 
*/ +// } +// local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; +// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, +// pStack->disp + local_disp); +// pos_desc++; +// update_loop_description: /* update the current state */ +// // conv_ptr = pConvertor->pBaseBuf + pStack->disp; +// count_desc_tmp = count_desc; +// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +// continue; +// } +// } +// complete_loop: +// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ +// total_packed += iov[iov_count].iov_len; +// // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); +// #if defined(OPAL_DATATYPE_CUDA_TIMING) +// GET_TIME(start); +// #endif +// if (transfer_required) { +// cudaMemcpy(iov[iov_count].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); +// } +// #if defined(OPAL_DATATYPE_CUDA_TIMING) +// GET_TIME( end ); +// total_time = ELAPSED_TIME( start, end ); +// printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); +// #endif +// } +// *max_data = total_packed; +// pConvertor->bConverted += total_packed; /* update the already converted bytes */ +// *out_size = iov_count; +// if( pConvertor->bConverted == pConvertor->local_size ) { +// pConvertor->flags |= CONVERTOR_COMPLETED; +// DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); +// if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { +// printf("free\n"); +// opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); +// pConvertor->gpu_buffer_ptr = NULL; +// } +// return 1; +// } +// /* Save the global position for the next round */ +// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, +// conv_ptr - pConvertor->pBaseBuf ); +// DT_CUDA_DEBUG( opal_cuda_output( 2, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", +// 
pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); +// return 0; +// } + void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, @@ -892,10 +1093,6 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, if( 0 == _copy_count ) return; /* nothing to do */ } -#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) - _source = pBaseBuf_GPU + _elem->disp; - _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; -#endif if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE) { thread_per_block = CUDA_WARP_SIZE; @@ -904,13 +1101,13 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, } else if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE * 3) { thread_per_block = CUDA_WARP_SIZE * 3; } else { - thread_per_block = CUDA_WARP_SIZE * 4; + thread_per_block = CUDA_WARP_SIZE * 5; } tasks_per_block = thread_per_block * TASK_PER_THREAD; nb_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - DBGPRINT("num_blocks %d, thread %d\n", nb_blocks, tasks_per_block); - DBGPRINT( "GPU pack 1. memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); + // DBGPRINT("num_blocks %d, thread %d\n", nb_blocks, tasks_per_block); + // DBGPRINT( "GPU pack 1. 
memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); pack_contiguous_loop_cuda_kernel_global<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); cuda_streams->current_stream_id ++; @@ -924,7 +1121,5 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_count; #endif - cuda_desc_h->iov[0].iov_base = (unsigned char*)cuda_desc_h->iov[0].iov_base + _copy_blength; - // cudaDeviceSynchronize(); } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 893f280c68f..8f8af75274e 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -188,11 +188,17 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ /* should not go to here */ - pStack--; - pConvertor->stack_pos--; - pos_desc --; - pElem = &(description[pos_desc]); - count_desc = count_desc_tmp; + unpack_predefined_data_cuda( pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local ); + if( 0 == count_desc ) { /* completed */ + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); + if( 0 != iov_len_local ) { + assert(0); + } goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ @@ -246,8 +252,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv pStack->disp + local_disp); pos_desc++; update_loop_description: /* update the current state */ - // conv_ptr = pConvertor->pBaseBuf + pStack->disp; - count_desc_tmp 
= count_desc; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } @@ -257,6 +262,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv total_unpacked += iov[iov_count].iov_len; } complete_conversion: + cudaDeviceSynchronize(); *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; @@ -277,6 +283,173 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv return 0; } +// int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, +// struct iovec* iov, uint32_t* out_size, +// size_t* max_data ) +// { +// dt_stack_t* pStack; /* pointer to the position on the stack */ +// uint32_t pos_desc; /* actual position in the description of the derived datatype */ +// uint32_t count_desc; /* the number of items already done in the actual pos_desc */ +// size_t total_unpacked = 0; /* total size unpacked this time */ +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// const opal_datatype_t *pData = pConvertor->pDesc; +// unsigned char *conv_ptr, *iov_ptr; +// size_t iov_len_local; +// uint32_t iov_count; +// uint8_t free_required; +// uint32_t count_desc_tmp; +// +// #if defined(OPAL_DATATYPE_CUDA_TIMING) +// TIMER_DATA_TYPE start, end; +// long total_time; +// #endif +// +// DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", +// (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ) +// +// description = pConvertor->use_desc->desc; +// +// /* For the first step we have to add both displacement to the source. After in the +// * main while loop we will set back the source_base to the correct value. 
This is +// * due to the fact that the convertor can stop in the middle of a data with a count +// */ +// pStack = pConvertor->pStack + pConvertor->stack_pos; +// pos_desc = pStack->index; +// conv_ptr = pConvertor->pBaseBuf + pStack->disp; +// count_desc = (uint32_t)pStack->count; +// pStack--; +// pConvertor->stack_pos--; +// pElem = &(description[pos_desc]); +// +// DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack start pos_desc %d count_desc %d disp %ld\n" +// "stack_pos %d pos_desc %d count_desc %d disp %ld\n", +// pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), +// pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); +// +// for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { +// #if defined(OPAL_DATATYPE_CUDA_TIMING) +// GET_TIME(start); +// #endif +// if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { +// iov_ptr = (unsigned char*)iov[iov_count].iov_base; +// free_required = 0; +// } else { +// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { +// iov_ptr = (unsigned char*)iov[iov_count].iov_base; +// pConvertor->gpu_buffer_ptr = NULL; +// free_required = 0; +// } else { +// if (pConvertor->gpu_buffer_ptr == NULL) { +// pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); +// } +// iov_ptr = pConvertor->gpu_buffer_ptr; +// cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); +// free_required = 1; +// } +// } +// #if defined(OPAL_DATATYPE_CUDA_TIMING) +// GET_TIME( end ); +// total_time = ELAPSED_TIME( start, end ); +// printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); +// #endif +// iov_len_local = iov[iov_count].iov_len; +// if( 0 != pConvertor->partial_length ) { +// /* not support yet */ +// } +// while( 1 ) { +// while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { +// /* now here we have a basic datatype */ +// /* should not 
go to here */ +// pStack--; +// pConvertor->stack_pos--; +// pos_desc --; +// pElem = &(description[pos_desc]); +// count_desc = count_desc_tmp; +// goto complete_loop; +// } +// if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ +// DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", +// (int)pStack->count, pConvertor->stack_pos, pos_desc, +// (long)pStack->disp, (unsigned long)iov_len_local ); ); +// if( --(pStack->count) == 0 ) { /* end of loop */ +// if( 0 == pConvertor->stack_pos ) { +// /* Do the same thing as when the loop is completed */ +// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ +// total_unpacked += iov[iov_count].iov_len; +// iov_count++; /* go to the next */ +// goto complete_conversion; +// } +// pConvertor->stack_pos--; +// pStack--; +// pos_desc++; +// } else { +// pos_desc = pStack->index + 1; +// if( pStack->index == -1 ) { +// pStack->disp += (pData->ub - pData->lb); +// } else { +// assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); +// pStack->disp += description[pStack->index].loop.extent; +// } +// } +// conv_ptr = pConvertor->pBaseBuf + pStack->disp; +// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +// DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", +// (int)pStack->count, pConvertor->stack_pos, pos_desc, +// (long)pStack->disp, (unsigned long)iov_len_local ); ); +// } +// if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { +// OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; +// if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { +// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { +// unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); +// } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { +// unpack_contiguous_loop_cuda_zerocopy(pElem, 
&count_desc, &iov_ptr, &conv_ptr, &iov_len_local); +// } else { +// unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); +// } +// if( 0 == count_desc ) { /* completed */ +// pos_desc += pElem->loop.items + 1; +// goto update_loop_description; +// } +// /* Save the stack with the correct last_count value. */ +// } +// local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; +// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, +// pStack->disp + local_disp); +// pos_desc++; +// update_loop_description: /* update the current state */ +// // conv_ptr = pConvertor->pBaseBuf + pStack->disp; +// count_desc_tmp = count_desc; +// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +// continue; +// } +// } +// complete_loop: +// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ +// total_unpacked += iov[iov_count].iov_len; +// } +// complete_conversion: +// *max_data = total_unpacked; +// pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ +// *out_size = iov_count; +// if( pConvertor->bConverted == pConvertor->remote_size ) { +// pConvertor->flags |= CONVERTOR_COMPLETED; +// DT_CUDA_DEBUG( opal_cuda_output( 0, "Total unpacked %lu\n", pConvertor->bConverted); ); +// if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { +// opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); +// pConvertor->gpu_buffer_ptr = NULL; +// } +// return 1; +// } +// /* Save the global position for the next round */ +// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, +// conv_ptr - pConvertor->pBaseBuf ); +// DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", +// pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); +// return 0; +// } + int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* 
pConvertor, struct iovec* iov, uint32_t* out_size, @@ -663,3 +836,52 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, printf( "[Timing]: vector unpacking in %ld microsec\n", total_time ); #endif } + +void unpack_predefined_data_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + uint32_t _copy_count = *(COUNT); + size_t _copy_blength; + ddt_elem_desc_t* _elem = &((ELEM)->elem); + unsigned char* _source = (*SOURCE); + uint32_t nb_blocks, tasks_per_block, thread_per_block; + unsigned char* _destination = *(DESTINATION) + _elem->disp;; + + _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; + if( (_copy_count * _copy_blength) > *(SPACE) ) { + _copy_count = (uint32_t)(*(SPACE) / _copy_blength); + if( 0 == _copy_count ) return; /* nothing to do */ + } + + + if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE) { + thread_per_block = CUDA_WARP_SIZE; + } else if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE * 2) { + thread_per_block = CUDA_WARP_SIZE * 2; + } else if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE * 3) { + thread_per_block = CUDA_WARP_SIZE * 3; + } else { + thread_per_block = CUDA_WARP_SIZE * 5; + } + tasks_per_block = thread_per_block * TASK_PER_THREAD; + nb_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + + // DBGPRINT("num_blocks %d, thread %d\n", nb_blocks, tasks_per_block); + // DBGPRINT( "GPU pack 1. 
memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); + + unpack_contiguous_loop_cuda_kernel_global<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + _copy_blength *= _copy_count; + *(DESTINATION) = _destination + _elem->extent*_copy_count - _elem->disp; + *(SOURCE) += _copy_blength; + *(SPACE) -= _copy_blength; + *(COUNT) -= _copy_count; +#endif + +} diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 54a28b93c5b..372d5a1291a 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -421,7 +421,8 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, pStack = pConvertor->pStack + pConvertor->stack_pos; pos_desc = pStack->index; pElem = &(description[pos_desc]); - + + // return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { if (opal_generic_simple_pack_function_cuda_vector_p != NULL) { // return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index fd269de6764..d9d69683174 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -608,7 +608,8 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, pStack = pConvertor->pStack + pConvertor->stack_pos; pos_desc = pStack->index; pElem = &(description[pos_desc]); - + +// return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { if 
(opal_generic_simple_unpack_function_cuda_vector_p != NULL) { return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 0243822d1d9..3ffde4608fc 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -1284,10 +1284,8 @@ int mca_btl_smcuda_component_progress(void) } if( btl_ownership ) { if (frag->hdr->tag == MCA_BTL_TAG_SMCUDA_DATATYPE_PACK) { - printf("&&&&&&&&&&&&&&&&&&got PACK TAG\n"); } if (frag->hdr->tag == MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK) { - printf("&&&&&&&&&&&&&&&&&&got UNPACK TAG\n"); } MCA_BTL_SMCUDA_FRAG_RETURN(frag); } diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 860e9b87c94..228238002e4 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -925,6 +925,232 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk return OMPI_SUCCESS; } +static void fill_matrix(void *matt, int msize) +{ + int i, j; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + for (i = 0; i < msize*msize; i++) { + mat[i] = i; + } + + // printf("matrix generate\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } +} + +static void verify_mat(void *matt, int msize) +{ + int i, j, error = 0; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + for (i = 0; i < msize*msize; i++) { +#if defined (TEST_CHAR) + if (mat[i] != 'a') { +#else + if (mat[i] != (0.0+i)) { +#endif + error ++; + } + } + + // 
printf("matrix received\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } + + if (error != 0) { + printf("error is found %d\n", error); + } else { + printf("no error is found\n"); + } +} + +static int local_copy_with_convertor_mat( ompi_datatype_t* pdt, int count, int chunk, int msize ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data, dt_length; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0; + + dt_length = compute_buffer_length(pdt, count); + printf("length %lu\n", dt_length); + +#if defined (DDT_TEST_CUDA) + cudaSetDevice(0); +#endif + +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, dt_length); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, dt_length); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, dt_length); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc(dt_length); + psrc = malloc(dt_length); + ptemp = malloc(chunk); + + for( int i = 0; i < length; ((char*)psrc)[i] = i % 128 + 32, i++ ); 
+ memset( pdst, 0, length ); +#endif + +#if defined (DDT_TEST_CUDA) + if (msize > 0) { + fill_matrix(phost, msize); + } + cudaMemcpy(psrc, phost, dt_length, cudaMemcpyHostToDevice); +#else + if (msize > 0) { + // fill_upper_matrix(psrc, msize); + } +#endif + + send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(pdt->super), count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(pdt->super), count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + cache_trash(); /* make sure the cache is useless */ + cudaDeviceSynchronize(); + + GET_TIME( start ); + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor\n", + (done1 ? "finish" : "not finish"), + (done2 ? 
"finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } + + // int i,j = 0; + // printf("buffer received\n"); + // double *mat_temp = (double*)ptemp; + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat_temp[i*msize+j]); + // } + // printf("\n"); + // } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "copying same data-type using convertors in %ld microsec\n", total_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, + total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, dt_length); + cudaMemcpy(phost, pdst, dt_length, cudaMemcpyDeviceToHost); + if (msize > 0) { + verify_mat(phost, msize); + } +#else + if (msize > 0) { +// verify_mat_result(pdst, msize); + } +#endif +clean_and_return: + if( NULL != send_convertor ) OBJ_RELEASE( send_convertor ); + if( NULL != recv_convertor ) OBJ_RELEASE( recv_convertor ); + +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else + if( NULL != pdst ) free( pdst ); + if( NULL != psrc ) free( psrc ); + if( NULL != ptemp ) free( ptemp ); +#endif + return OMPI_SUCCESS; +} + /** * Main function. Call several tests and print-out the results. It try to stress the convertor * using difficult data-type constructions as well as strange segment sizes for the conversion. 
@@ -980,12 +1206,20 @@ int main( int argc, char* argv[] ) printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { - local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); + // local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); } + ompi_datatype_t *column, *matt; + mat_size = 500; + ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); + ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); + ompi_datatype_commit( &matt ); + // local_copy_with_convertor_mat(matt, 1, 1200000, mat_size); + + int packed_size = 256; int blk_len = 4; int blk_count; @@ -1035,13 +1269,13 @@ int main( int argc, char* argv[] ) } - for (blk_len = 4; blk_len <= 64; blk_len += 2) { + for (blk_len = 64; blk_len <= 64; blk_len += 2) { printf( ">>--------------------------------------------<<\n" ); printf( "Vector data-type (1024 times %d double stride 512)\n", blk_len ); pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len+128); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 4; i++) { - // vector_ddt( pdt, 1, pdt, 1, 1024*1024*20 , 1000, blk_len, blk_len+128); + vector_ddt( pdt, 1, pdt, 1, 1024*10 , 1000, blk_len, blk_len+128); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } @@ -1099,7 +1333,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, 4000, 128, 256 ); // ompi_datatype_dump( pdt ); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 0; i < 10; i++) { + for (i = 0; i < 1; i++) { // local_copy_ddt_count(pdt, 1); // local_copy_with_convertor( pdt, 1, 12 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); @@ -1108,7 +1342,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - // 
local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*5 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*5 ); } } printf( ">>--------------------------------------------<<\n" ); From 5e14fdd12690e990e3da643e6418f3cbf7161463 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 2 Oct 2015 16:32:16 -0400 Subject: [PATCH 027/190] enable vector --- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 398 +++++++++--------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 334 +++++++-------- test/datatype/ddt_benchmark.c | 8 +- 3 files changed, 370 insertions(+), 370 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 1268280fab6..c3b327c733e 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -171,7 +171,7 @@ int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, } -int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, +int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -372,204 +372,204 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert return 0; } -// int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, -// struct iovec* iov, -// uint32_t* out_size, -// size_t* max_data ) -// { -// dt_stack_t* pStack; /* pointer to the position on the stack */ -// uint32_t pos_desc; /* actual position in the description of the derived datatype */ -// uint32_t count_desc; /* the number of items already done in the actual pos_desc */ -// size_t total_packed = 0; /* total amount packed this time */ -// dt_elem_desc_t* description; -// dt_elem_desc_t* pElem; -// const opal_datatype_t *pData = pConvertor->pDesc; -// unsigned char *conv_ptr, *iov_ptr; -// size_t iov_len_local; -// uint32_t 
iov_count; -// uint8_t transfer_required; -// uint8_t free_required; -// uint32_t count_desc_tmp; -// -// #if defined(OPAL_DATATYPE_CUDA_TIMING) -// TIMER_DATA_TYPE start, end, start_total, end_total; -// long total_time; -// #endif -// -// DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", -// (void*)pConvertor, (void*)pConvertor->pBaseBuf, -// iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ); -// -// description = pConvertor->use_desc->desc; -// -// /* For the first step we have to add both displacement to the source. After in the -// * main while loop we will set back the conv_ptr to the correct value. This is -// * due to the fact that the convertor can stop in the middle of a data with a count -// */ -// pStack = pConvertor->pStack + pConvertor->stack_pos; -// pos_desc = pStack->index; -// conv_ptr = pConvertor->pBaseBuf + pStack->disp; -// count_desc = (uint32_t)pStack->count; -// pStack--; -// pConvertor->stack_pos--; -// pElem = &(description[pos_desc]); -// -// DT_CUDA_DEBUG( opal_cuda_output( 1, "pack start pos_desc %d count_desc %d disp %ld\n" -// "stack_pos %d pos_desc %d count_desc %d disp %ld\n", -// pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), -// pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); -// -// -// for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { -// if ((iov[iov_count].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { -// if (iov[iov_count].iov_len == 0) { -// iov_len_local = DT_CUDA_BUFFER_SIZE; -// } else { -// iov_len_local = iov[iov_count].iov_len; -// } -// -// if (iov[iov_count].iov_base == NULL) { -// iov[iov_count].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); -// iov_ptr = (unsigned char *)iov[iov_count].iov_base; -// pConvertor->gpu_buffer_ptr = iov_ptr; -// free_required = 1; -// } else { -// iov_ptr = (unsigned char 
*)iov[iov_count].iov_base; -// free_required = 0; -// } -// transfer_required = 0; -// } else { -// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { -// pConvertor->gpu_buffer_ptr = NULL; -// transfer_required = 0; -// free_required = 0; -// iov_ptr = (unsigned char*)iov[iov_count].iov_base; -// iov_len_local = iov[iov_count].iov_len; -// } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE){ -// iov_len_local = iov[iov_count].iov_len; -// if (pConvertor->gpu_buffer_ptr == NULL) { -// pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); -// } -// transfer_required = 0; -// free_required = 1; -// iov_ptr = (unsigned char*)iov[iov_count].iov_base; -// } else { -// iov_len_local = iov[iov_count].iov_len; -// if (pConvertor->gpu_buffer_ptr == NULL) { -// pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); -// } -// transfer_required = 1; -// free_required = 1; -// iov_ptr = pConvertor->gpu_buffer_ptr; -// } -// } -// while( 1 ) { -// while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { -// /* now here we have a basic datatype */ -// /* should not go into here */ -// pStack--; -// pConvertor->stack_pos--; -// pos_desc --; -// pElem = &(description[pos_desc]); -// count_desc = count_desc_tmp; -// goto complete_loop; -// } -// if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ -// DT_CUDA_DEBUG( opal_cuda_output( 2, "pack end_loop count %d stack_pos %d" -// " pos_desc %d disp %ld space %lu\n", -// (int)pStack->count, pConvertor->stack_pos, -// pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); -// if( --(pStack->count) == 0 ) { /* end of loop */ -// if( 0 == pConvertor->stack_pos ) { -// /* we lie about the size of the next element in order to -// * make sure we exit the main loop. 
-// */ -// *out_size = iov_count; -// goto complete_loop; /* completed */ -// } -// pConvertor->stack_pos--; -// pStack--; -// pos_desc++; -// } else { -// pos_desc = pStack->index + 1; -// if( pStack->index == -1 ) { -// pStack->disp += (pData->ub - pData->lb); -// } else { -// assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); -// pStack->disp += description[pStack->index].loop.extent; -// } -// } -// conv_ptr = pConvertor->pBaseBuf + pStack->disp; -// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -// DT_CUDA_DEBUG( opal_cuda_output( 2, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", -// (int)pStack->count, pConvertor->stack_pos, pos_desc, -// count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); -// } -// if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { -// OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; -// if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { -// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { -// pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); -// } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { -// pack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); -// } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE) { -// pack_contiguous_loop_cuda_pipeline(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local, pConvertor->gpu_buffer_ptr); -// } else { -// pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); -// } -// if( 0 == count_desc ) { /* completed */ -// pos_desc += pElem->loop.items + 1; -// goto update_loop_description; -// } -// /* Save the stack with the correct last_count value. 
*/ -// } -// local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; -// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, -// pStack->disp + local_disp); -// pos_desc++; -// update_loop_description: /* update the current state */ -// // conv_ptr = pConvertor->pBaseBuf + pStack->disp; -// count_desc_tmp = count_desc; -// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -// continue; -// } -// } -// complete_loop: -// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ -// total_packed += iov[iov_count].iov_len; -// // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); -// #if defined(OPAL_DATATYPE_CUDA_TIMING) -// GET_TIME(start); -// #endif -// if (transfer_required) { -// cudaMemcpy(iov[iov_count].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); -// } -// #if defined(OPAL_DATATYPE_CUDA_TIMING) -// GET_TIME( end ); -// total_time = ELAPSED_TIME( start, end ); -// printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); -// #endif -// } -// *max_data = total_packed; -// pConvertor->bConverted += total_packed; /* update the already converted bytes */ -// *out_size = iov_count; -// if( pConvertor->bConverted == pConvertor->local_size ) { -// pConvertor->flags |= CONVERTOR_COMPLETED; -// DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); -// if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { -// printf("free\n"); -// opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); -// pConvertor->gpu_buffer_ptr = NULL; -// } -// return 1; -// } -// /* Save the global position for the next round */ -// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, -// conv_ptr - pConvertor->pBaseBuf ); -// DT_CUDA_DEBUG( opal_cuda_output( 2, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", -// 
pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); -// return 0; -// } +int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_packed = 0; /* total amount packed this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + const opal_datatype_t *pData = pConvertor->pDesc; + unsigned char *conv_ptr, *iov_ptr; + size_t iov_len_local; + uint32_t iov_count; + uint8_t transfer_required; + uint8_t free_required; + uint32_t count_desc_tmp; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", + (void*)pConvertor, (void*)pConvertor->pBaseBuf, + iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ); + + description = pConvertor->use_desc->desc; + + /* For the first step we have to add both displacement to the source. After in the + * main while loop we will set back the conv_ptr to the correct value. 
This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + pStack--; + pConvertor->stack_pos--; + pElem = &(description[pos_desc]); + + DT_CUDA_DEBUG( opal_cuda_output( 1, "pack start pos_desc %d count_desc %d disp %ld\n" + "stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + + + for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { + if ((iov[iov_count].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + if (iov[iov_count].iov_len == 0) { + iov_len_local = DT_CUDA_BUFFER_SIZE; + } else { + iov_len_local = iov[iov_count].iov_len; + } + + if (iov[iov_count].iov_base == NULL) { + iov[iov_count].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + iov_ptr = (unsigned char *)iov[iov_count].iov_base; + pConvertor->gpu_buffer_ptr = iov_ptr; + free_required = 1; + } else { + iov_ptr = (unsigned char *)iov[iov_count].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + iov_len_local = iov[iov_count].iov_len; + } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE){ + iov_len_local = iov[iov_count].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + } + transfer_required = 0; + free_required = 1; + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + } else { + iov_len_local = iov[iov_count].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + 
pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + } + transfer_required = 1; + free_required = 1; + iov_ptr = pConvertor->gpu_buffer_ptr; + } + } + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + /* should not go into here */ + pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + DT_CUDA_DEBUG( opal_cuda_output( 2, "pack end_loop count %d stack_pos %d" + " pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, + pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + if( --(pStack->count) == 0 ) { /* end of loop */ + if( 0 == pConvertor->stack_pos ) { + /* we lie about the size of the next element in order to + * make sure we exit the main loop. + */ + *out_size = iov_count; + goto complete_loop; /* completed */ + } + pConvertor->stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + pStack->disp += (pData->ub - pData->lb); + } else { + assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { + 
pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE) { + pack_contiguous_loop_cuda_pipeline(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local, pConvertor->gpu_buffer_ptr); + } else { + pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. */ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + // conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc_tmp = count_desc; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + } + complete_loop: + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_packed += iov[iov_count].iov_len; + // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[iov_count].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); +#endif + } + *max_data = total_packed; + pConvertor->bConverted += total_packed; /* update the already converted bytes */ + *out_size = iov_count; + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + 
DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); + if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { + printf("free\n"); + opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + /* Save the global position for the next round */ + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, + conv_ptr - pConvertor->pBaseBuf ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + return 0; +} void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 8f8af75274e..5374e2d9fc8 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -110,7 +110,7 @@ int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ } -int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, +int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { @@ -283,172 +283,172 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv return 0; } -// int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, -// struct iovec* iov, uint32_t* out_size, -// size_t* max_data ) -// { -// dt_stack_t* pStack; /* pointer to the position on the stack */ -// uint32_t pos_desc; /* actual position in the description of the derived datatype */ -// uint32_t count_desc; /* the number of items already done in the actual pos_desc */ -// size_t total_unpacked = 0; /* total size unpacked this 
time */ -// dt_elem_desc_t* description; -// dt_elem_desc_t* pElem; -// const opal_datatype_t *pData = pConvertor->pDesc; -// unsigned char *conv_ptr, *iov_ptr; -// size_t iov_len_local; -// uint32_t iov_count; -// uint8_t free_required; -// uint32_t count_desc_tmp; -// -// #if defined(OPAL_DATATYPE_CUDA_TIMING) -// TIMER_DATA_TYPE start, end; -// long total_time; -// #endif -// -// DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", -// (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ) -// -// description = pConvertor->use_desc->desc; -// -// /* For the first step we have to add both displacement to the source. After in the -// * main while loop we will set back the source_base to the correct value. This is -// * due to the fact that the convertor can stop in the middle of a data with a count -// */ -// pStack = pConvertor->pStack + pConvertor->stack_pos; -// pos_desc = pStack->index; -// conv_ptr = pConvertor->pBaseBuf + pStack->disp; -// count_desc = (uint32_t)pStack->count; -// pStack--; -// pConvertor->stack_pos--; -// pElem = &(description[pos_desc]); -// -// DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack start pos_desc %d count_desc %d disp %ld\n" -// "stack_pos %d pos_desc %d count_desc %d disp %ld\n", -// pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), -// pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); -// -// for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { -// #if defined(OPAL_DATATYPE_CUDA_TIMING) -// GET_TIME(start); -// #endif -// if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { -// iov_ptr = (unsigned char*)iov[iov_count].iov_base; -// free_required = 0; -// } else { -// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { -// iov_ptr = (unsigned char*)iov[iov_count].iov_base; -// pConvertor->gpu_buffer_ptr = NULL; -// free_required = 0; -// } else { -// if 
(pConvertor->gpu_buffer_ptr == NULL) { -// pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); -// } -// iov_ptr = pConvertor->gpu_buffer_ptr; -// cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); -// free_required = 1; -// } -// } -// #if defined(OPAL_DATATYPE_CUDA_TIMING) -// GET_TIME( end ); -// total_time = ELAPSED_TIME( start, end ); -// printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); -// #endif -// iov_len_local = iov[iov_count].iov_len; -// if( 0 != pConvertor->partial_length ) { -// /* not support yet */ -// } -// while( 1 ) { -// while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { -// /* now here we have a basic datatype */ -// /* should not go to here */ -// pStack--; -// pConvertor->stack_pos--; -// pos_desc --; -// pElem = &(description[pos_desc]); -// count_desc = count_desc_tmp; -// goto complete_loop; -// } -// if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ -// DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", -// (int)pStack->count, pConvertor->stack_pos, pos_desc, -// (long)pStack->disp, (unsigned long)iov_len_local ); ); -// if( --(pStack->count) == 0 ) { /* end of loop */ -// if( 0 == pConvertor->stack_pos ) { -// /* Do the same thing as when the loop is completed */ -// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ -// total_unpacked += iov[iov_count].iov_len; -// iov_count++; /* go to the next */ -// goto complete_conversion; -// } -// pConvertor->stack_pos--; -// pStack--; -// pos_desc++; -// } else { -// pos_desc = pStack->index + 1; -// if( pStack->index == -1 ) { -// pStack->disp += (pData->ub - pData->lb); -// } else { -// assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); -// pStack->disp += description[pStack->index].loop.extent; 
-// } -// } -// conv_ptr = pConvertor->pBaseBuf + pStack->disp; -// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -// DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", -// (int)pStack->count, pConvertor->stack_pos, pos_desc, -// (long)pStack->disp, (unsigned long)iov_len_local ); ); -// } -// if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { -// OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; -// if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { -// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { -// unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); -// } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { -// unpack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); -// } else { -// unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); -// } -// if( 0 == count_desc ) { /* completed */ -// pos_desc += pElem->loop.items + 1; -// goto update_loop_description; -// } -// /* Save the stack with the correct last_count value. 
*/ -// } -// local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; -// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, -// pStack->disp + local_disp); -// pos_desc++; -// update_loop_description: /* update the current state */ -// // conv_ptr = pConvertor->pBaseBuf + pStack->disp; -// count_desc_tmp = count_desc; -// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -// continue; -// } -// } -// complete_loop: -// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ -// total_unpacked += iov[iov_count].iov_len; -// } -// complete_conversion: -// *max_data = total_unpacked; -// pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ -// *out_size = iov_count; -// if( pConvertor->bConverted == pConvertor->remote_size ) { -// pConvertor->flags |= CONVERTOR_COMPLETED; -// DT_CUDA_DEBUG( opal_cuda_output( 0, "Total unpacked %lu\n", pConvertor->bConverted); ); -// if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { -// opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); -// pConvertor->gpu_buffer_ptr = NULL; -// } -// return 1; -// } -// /* Save the global position for the next round */ -// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, -// conv_ptr - pConvertor->pBaseBuf ); -// DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", -// pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); -// return 0; -// } +int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_unpacked = 0; /* 
total size unpacked this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + const opal_datatype_t *pData = pConvertor->pDesc; + unsigned char *conv_ptr, *iov_ptr; + size_t iov_len_local; + uint32_t iov_count; + uint8_t free_required; + uint32_t count_desc_tmp; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", + (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ) + + description = pConvertor->use_desc->desc; + + /* For the first step we have to add both displacement to the source. After in the + * main while loop we will set back the source_base to the correct value. This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + pStack--; + pConvertor->stack_pos--; + pElem = &(description[pos_desc]); + + DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack start pos_desc %d count_desc %d disp %ld\n" + "stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); + + for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + free_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned 
char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); + } + iov_ptr = pConvertor->gpu_buffer_ptr; + cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); +#endif + iov_len_local = iov[iov_count].iov_len; + if( 0 != pConvertor->partial_length ) { + /* not support yet */ + } + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + /* should not go to here */ + pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + (long)pStack->disp, (unsigned long)iov_len_local ); ); + if( --(pStack->count) == 0 ) { /* end of loop */ + if( 0 == pConvertor->stack_pos ) { + /* Do the same thing as when the loop is completed */ + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_unpacked += iov[iov_count].iov_len; + iov_count++; /* go to the next */ + goto complete_conversion; + } + pConvertor->stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + pStack->disp += (pData->ub - pData->lb); + } else { + assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack 
new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { + unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + unpack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } else { + unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. */ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + // conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc_tmp = count_desc; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + } + complete_loop: + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_unpacked += iov[iov_count].iov_len; + } + complete_conversion: + *max_data = total_unpacked; + pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ + *out_size = iov_count; + if( pConvertor->bConverted == pConvertor->remote_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + DT_CUDA_DEBUG( opal_cuda_output( 0, "Total unpacked %lu\n", pConvertor->bConverted); ); + if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { + opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + 
pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + /* Save the global position for the next round */ + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, + conv_ptr - pConvertor->pBaseBuf ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + return 0; +} int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 228238002e4..36f0e7e8659 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -30,7 +30,7 @@ #include #include -#define DDT_TEST_CUDA +//#define DDT_TEST_CUDA #define CUDA_MEMCPY_2D_D2H @@ -1213,11 +1213,11 @@ int main( int argc, char* argv[] ) } ompi_datatype_t *column, *matt; - mat_size = 500; + mat_size = 1500; ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); ompi_datatype_commit( &matt ); - // local_copy_with_convertor_mat(matt, 1, 1200000, mat_size); + local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); int packed_size = 256; @@ -1275,7 +1275,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len+128); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 4; i++) { - vector_ddt( pdt, 1, pdt, 1, 1024*10 , 1000, blk_len, blk_len+128); + vector_ddt( pdt, 1, pdt, 1, 1024*10240 , 1000, blk_len, blk_len+128); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } From d03c6012e54b321cef90fc3096e90b496f898171 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 6 Oct 2015 00:32:52 -0400 Subject: [PATCH 028/190] receiver now will send msg back to sender for buffer reuse Conflicts: opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu --- 
ompi/mca/pml/ob1/pml_ob1_cuda.c | 1 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 5 +- opal/datatype/opal_convertor.h | 1 + opal/datatype/opal_datatype_gpu.h | 2 +- opal/mca/btl/smcuda/btl_smcuda.c | 6 +- opal/mca/btl/smcuda/btl_smcuda.h | 9 +-- opal/mca/btl/smcuda/btl_smcuda_component.c | 72 ++++++++++++++----- test/datatype/ddt_benchmark.c | 41 ++++++++--- 8 files changed, 101 insertions(+), 36 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 05339d4f9d4..79b739b8356 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -119,6 +119,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); convertor->gpu_buffer_ptr = base; + convertor->gpu_buffer_size = convertor->local_size; sendreq->req_send.req_bytes_packed = convertor->local_size; printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index c3b327c733e..a4d4b427a45 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -604,8 +604,9 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - // cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); -// pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +// printf("extent %ld, size %ld, count %ld\n", _loop->extent, 
_end_loop->size, _copy_loops); +// cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); // int i; // for (i = 0; i < 4; i++) { // opal_empty_kernel<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index ace5cf4b1e4..6b4746eaa9a 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -113,6 +113,7 @@ struct opal_convertor_t { void * stream; /**< CUstream for async copy */ unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ + size_t gpu_buffer_size; uint64_t * pipeline_event[MAX_IPC_EVENT_HANDLE]; /**< cuda event for pipeline */ #endif /* size: 248, cachelines: 4, members: 20 */ diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h index 8ae90cde92f..887c8a0918b 100644 --- a/opal/datatype/opal_datatype_gpu.h +++ b/opal/datatype/opal_datatype_gpu.h @@ -66,4 +66,4 @@ extern unsigned char* (*opal_cuda_get_gpu_pack_buffer_p)(void); extern void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); extern void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); -#endif /* OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED */ \ No newline at end of file +#endif /* OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 7dd56f6e612..dacc343ba84 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1165,7 +1165,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, send_msg.lindex = lindex; send_msg.packed_size = 0; send_msg.seq = 0; - send_msg.msg_type = CUDA_PACK_TO_LOCAL; + send_msg.msg_type = CUDA_PACK_TO_LOCAL_START; 
mca_btl_smcuda_send_cuda_pack_sig(btl, ep, &send_msg); done = 0; } else { @@ -1199,14 +1199,14 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL); memcpy(send_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); send_msg.seq = -9; - send_msg.msg_type = CUDA_PACK_TO_REMOTE; + send_msg.msg_type = CUDA_PACK_TO_REMOTE_START; send_msg.remote_address = local_address; send_msg.remote_base = loc_reg.base.base; mca_common_wait_stream_synchronize(&loc_reg); printf("send r_addr %p, r_base %p\n", local_address, loc_reg.base.base); } else { send_msg.seq = 0; - send_msg.msg_type = CUDA_PACK_TO_LOCAL; + send_msg.msg_type = CUDA_PACK_TO_LOCAL_START; } mca_btl_smcuda_cuda_dt_unpack_clone(NULL, ep, remote_memory_address, (mca_btl_base_descriptor_t *)frag, 0, lindex, 0, 0); diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 7616e16c720..a1d9e5166e1 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -522,13 +522,14 @@ typedef struct { uint64_t mem_handle[8]; } cuda_dt_hdr_t; -#define CUDA_UNPACK_FROM_REMOTE 0 +#define CUDA_UNPACK_FROM_SEQ 0 #define CUDA_PACK_COMPLETE 1 #define CUDA_PACK_COMPLETE_ACK 2 #define CUDA_PACK_CLEANUP 3 -#define CUDA_PACK_TO_LOCAL 4 -#define CUDA_PACK_TO_REMOTE 5 -#define CUDA_UNPACK_NO 6 +#define CUDA_PACK_TO_LOCAL_START 4 +#define CUDA_PACK_TO_REMOTE_START 5 +#define CUDA_PACK_TO_SEQ 6 +#define CUDA_UNPACK_NO 7 /* package save pack/unpack convertor and cbfunc */ typedef struct { diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 3ffde4608fc..de772340fa0 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -167,7 +167,7 @@ static int smcuda_register(void) mca_btl_smcuda_param_register_int("use_cuda_ipc", 1, OPAL_INFO_LVL_4, 
&mca_btl_smcuda_component.use_cuda_ipc); mca_btl_smcuda_param_register_int("use_cuda_ipc_same_gpu", 1, OPAL_INFO_LVL_4,&mca_btl_smcuda_component.use_cuda_ipc_same_gpu); mca_btl_smcuda_param_register_int("cuda_ipc_verbose", 0, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ipc_verbose); - mca_btl_smcuda_param_register_int("cuda_dt_pipeline_size", 1024*1024*400, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_dt_pipeline_size); + mca_btl_smcuda_param_register_int("cuda_ddt_pipeline_size", 1024*1024*400, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_dt_pipeline_size); mca_btl_smcuda_component.cuda_ipc_output = opal_output_open(NULL); opal_output_set_verbosity(mca_btl_smcuda_component.cuda_ipc_output, mca_btl_smcuda_component.cuda_ipc_verbose); #else /* OPAL_CUDA_SUPPORT */ @@ -869,6 +869,8 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, assert(my_cuda_dt_clone->lindex == lindex); printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); + cuda_dt_hdr_t send_msg; + send_msg.lindex = lindex; if (msg_type == CUDA_PACK_CLEANUP) { mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; @@ -877,13 +879,11 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag_recv); mca_btl_smcuda_free_cuda_dt_unpack_clone(endpoint, lindex); } else if (msg_type == CUDA_PACK_COMPLETE) { - cuda_dt_hdr_t send_msg; - send_msg.lindex = lindex; send_msg.packed_size = 0; send_msg.seq = -1; send_msg.msg_type = CUDA_PACK_COMPLETE_ACK; mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); - } else if (msg_type == CUDA_UNPACK_FROM_REMOTE){ + } else if (msg_type == CUDA_UNPACK_FROM_SEQ){ struct iovec iov; uint32_t iov_count = 1; size_t max_data; @@ -916,6 +916,10 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, } } } + send_msg.seq = seq; + send_msg.packed_size = packed_size; + 
send_msg.msg_type = CUDA_PACK_TO_SEQ; + mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); } // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } @@ -931,9 +935,14 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, int seq = cuda_dt_hdr.seq; int lindex = cuda_dt_hdr.lindex; int msg_type = cuda_dt_hdr.msg_type; + size_t packed_size = cuda_dt_hdr.packed_size; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_dt_clone_t *my_cuda_dt_clone; cuda_dt_hdr_t send_msg; + + uint32_t iov_count = 1; + int rc_dt = 0; + size_t max_data = 0; /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; @@ -952,9 +961,28 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, convertor->gpu_buffer_ptr = NULL; } mca_btl_smcuda_free_cuda_dt_pack_clone(endpoint, lindex); + } else if (msg_type == CUDA_PACK_TO_SEQ) { + printf("i receive a message pack to seq, packed %ld, pipeline_size %ld\n", convertor->bConverted, my_cuda_dt_clone->pipeline_size); + if (convertor->bConverted < convertor->local_size) { + struct iovec iov; + iov.iov_base = convertor->gpu_buffer_ptr + seq*my_cuda_dt_clone->pipeline_size; + iov.iov_len = packed_size; + rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + packed_size = max_data; + send_msg.packed_size = packed_size; + send_msg.seq = seq; + send_msg.msg_type = CUDA_UNPACK_FROM_SEQ; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + if (rc_dt == 1) { + send_msg.packed_size = 0; + send_msg.seq = -1; + send_msg.msg_type = CUDA_PACK_COMPLETE; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + } + } } else { mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; - if (msg_type == CUDA_PACK_TO_REMOTE) { /* receiver is contiguous, and ask me to pack directly to his gpu memory */ + if (msg_type == CUDA_PACK_TO_REMOTE_START) { /* receiver is contiguous, and ask me to pack directly to his gpu memory */ 
opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); mca_mpool_common_cuda_reg_t rget_reg; rget_reg_ptr= &rget_reg; @@ -967,39 +995,49 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, convertor->gpu_buffer_ptr = remote_memory_address; printf("remote_memory_address $$$$$$ %p, r_addr %p, r_base %p\n", remote_memory_address, cuda_dt_hdr.remote_address, cuda_dt_hdr.remote_base); send_msg.msg_type = CUDA_UNPACK_NO; + convertor->gpu_buffer_size = convertor->local_size; } else { - send_msg.msg_type = CUDA_UNPACK_FROM_REMOTE; + send_msg.msg_type = CUDA_UNPACK_FROM_SEQ; } struct iovec iov; - int rc_dt = 0; - size_t packed_size = mca_btl_smcuda_component.cuda_dt_pipeline_size; + packed_size = mca_btl_smcuda_component.cuda_dt_pipeline_size; printf("Pipeline_size %ld\n", packed_size); - uint32_t iov_count = 1; iov.iov_base = convertor->gpu_buffer_ptr; iov.iov_len = packed_size; - size_t max_data = 0; - int seq = 0; + max_data = 0; + seq = 0; /* the first pack here is used to get the correct size of pipeline_size */ /* because pack may not use the whole pipeline size */ rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); packed_size = max_data; + iov.iov_base += packed_size; + /* save pipeline size */ + my_cuda_dt_clone->pipeline_size = packed_size; + convertor->gpu_buffer_size -= packed_size; send_msg.packed_size = packed_size; send_msg.seq = seq; mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); - while (rc_dt != 1) { - iov.iov_base += packed_size; + while (rc_dt != 1 && convertor->gpu_buffer_size > 0) { + if (convertor->gpu_buffer_size < packed_size) { + packed_size = convertor->gpu_buffer_size; + } + iov.iov_len = packed_size; seq ++; rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); packed_size = max_data; + iov.iov_base += packed_size; + convertor->gpu_buffer_size -= packed_size; send_msg.packed_size = packed_size; send_msg.seq = seq; mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); 
} - send_msg.packed_size = 0; - send_msg.seq = -1; - send_msg.msg_type = CUDA_PACK_COMPLETE; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + if (rc_dt == 1) { + send_msg.packed_size = 0; + send_msg.seq = -1; + send_msg.msg_type = CUDA_PACK_COMPLETE; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + } if (rget_reg_ptr != NULL) { /* close memhandle */ cuda_closememhandle(NULL, (mca_mpool_base_registration_t *)rget_reg_ptr); diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 36f0e7e8659..2d25274ee9b 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -30,7 +30,7 @@ #include #include -//#define DDT_TEST_CUDA +#define DDT_TEST_CUDA #define CUDA_MEMCPY_2D_D2H @@ -191,7 +191,7 @@ static void fill_vectors(double* vp, int itera, int contig, int gap) if (j >= i*gap && j < i*gap+contig) { vp[j] = 1.1; } else { - vp[j] = -1.0; + vp[j] = 0; } } } @@ -203,7 +203,7 @@ static void fill_vectors(double* vp, int itera, int contig, int gap) // for (i = 0; i < (itera-1)*gap+contig; i++) { // printf("%1.f ", vp[i]); // } - // printf("\n"); + printf("\n"); } static void verify_vectors(double *vp, int itera, int contig, int gap) @@ -350,6 +350,16 @@ vector_ddt( ompi_datatype_t* send_type, int send_count, done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); // done1 = 1; } + + // int i,j = 0; + // printf("buffer received\n"); + // double *mat_temp = (double*)ptemp; + // for (i = 0; i < itera; i++) { + // for (j = 0; j < contig; j++) { + // printf(" %1.f ", mat_temp[i*itera+j]); + // } + // printf("\n"); + // } if( done2 == 0 ) { GET_TIME( unpack_start ); @@ -1213,11 +1223,11 @@ int main( int argc, char* argv[] ) } ompi_datatype_t *column, *matt; - mat_size = 1500; - ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); - ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); - ompi_datatype_commit( &matt ); - 
local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); + mat_size = 4000; +// ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); +// ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); +// ompi_datatype_commit( &matt ); +// local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); int packed_size = 256; @@ -1275,7 +1285,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len+128); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 4; i++) { - vector_ddt( pdt, 1, pdt, 1, 1024*10240 , 1000, blk_len, blk_len+128); + // vector_ddt( pdt, 1, pdt, 1, 1024*10240 , 1000, blk_len, blk_len+128); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } @@ -1296,6 +1306,19 @@ int main( int argc, char* argv[] ) OBJ_RELEASE( pdt ); assert( pdt == NULL ); } + for (blk_len = 2000; blk_len <= 2000; blk_len += 500) { + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (60000 times %d double stride 512)\n", blk_len ); + pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); + // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + /* for (blk_len = 4; blk_len <= 32; blk_len += 1) { printf( ">>--------------------------------------------<<\n" ); From c377c36c047e2fdcdaa3b7913aaf7a698cfd9f55 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Fri, 9 Oct 2015 16:46:41 -0700 Subject: [PATCH 029/190] fix zerocopy --- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 13 +++++++++---- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 13 +++++++++---- opal/mca/btl/openib/btl_openib_frag.h | 2 ++ opal/mca/btl/smcuda/btl_smcuda.h | 2 +- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git 
a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index a4d4b427a45..00c7812b605 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -605,8 +605,8 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; // printf("extent %ld, size %ld, count %ld\n", _loop->extent, _end_loop->size, _copy_loops); -// cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); +// pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); // int i; // for (i = 0; i < 4; i++) { // opal_empty_kernel<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); @@ -776,7 +776,12 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; // cudaHostRegister(_destination, _copy_loops*_end_loop->size, cudaHostRegisterMapped); - cudaHostGetDevicePointer((void **)&_destination_dev, (void *) _destination, 0); + cudaError_t reg_rv = cudaHostGetDevicePointer((void **)&_destination_dev, (void *) _destination, 0); + if (reg_rv != cudaSuccess) { + const char *cuda_err = cudaGetErrorString(reg_rv); + printf("can not get dev mem, %s\n", cuda_err); + } + //cudaMemcpy2D(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, 
cudaMemcpyDeviceToDevice); pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -852,13 +857,13 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } transfer_required = 0; } else { + buffer_size = iov[0].iov_len; if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { pConvertor->gpu_buffer_ptr = NULL; transfer_required = 0; free_required = 0; cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); } else { - buffer_size = iov[0].iov_len; if (pConvertor->gpu_buffer_ptr == NULL) { pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(buffer_size, 0); } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 5374e2d9fc8..c268fe2fb94 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -727,8 +727,8 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); -// cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); +// unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -818,8 +818,13 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, 
// tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; // cudaHostRegister(_source, _copy_loops*_end_loop->size, cudaHostRegisterMapped); - cudaHostGetDevicePointer((void **)&_source_dev, (void *) _source, 0); - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + cudaError_t reg_rv = cudaHostGetDevicePointer((void **)&_source_dev, (void *) _source, 0); + if (reg_rv != cudaSuccess) { + const char *cuda_err = cudaGetErrorString(reg_rv); + printf("can not get dev mem, %s\n", cuda_err); + } + //cudaMemcpy2D(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; diff --git a/opal/mca/btl/openib/btl_openib_frag.h b/opal/mca/btl/openib/btl_openib_frag.h index 7ca37142429..b73a817e1e6 100644 --- a/opal/mca/btl/openib/btl_openib_frag.h +++ b/opal/mca/btl/openib/btl_openib_frag.h @@ -25,6 +25,8 @@ #ifndef MCA_BTL_IB_FRAG_H #define MCA_BTL_IB_FRAG_H +#define OPAL_OPENIB_PAD_HDR 1 + #include "opal_config.h" #include "opal/align.h" #include "opal/mca/btl/btl.h" diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index a1d9e5166e1..abd043f9f10 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -41,7 +41,7 @@ #include "opal/mca/btl/btl.h" #include "opal/mca/common/sm/common_sm.h" -#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 1 +#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 0 BEGIN_C_DECLS From c4b5fcfb4408cba516935644d0d2733298fdde42 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 22 Oct 2015 17:36:31 -0400 Subject: [PATCH 030/190] 
offset instead of actual addess, and lots of clean up for unused functions Conflicts: opal/datatype/cuda/opal_datatype_cuda.cu opal/datatype/cuda/opal_datatype_cuda_internal.cuh opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu opal/datatype/opal_datatype_gpu.c --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 2 +- opal/datatype/cuda/opal_datatype_cuda.cu | 144 +---- opal/datatype/cuda/opal_datatype_cuda.cuh | 10 - .../cuda/opal_datatype_cuda_internal.cuh | 74 +-- .../cuda/opal_datatype_pack_cuda_kernel.cu | 539 +----------------- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 292 ++-------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 262 +-------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 243 ++------ opal/datatype/opal_datatype_gpu.c | 35 +- opal/datatype/opal_datatype_gpu.h | 13 - opal/mca/btl/smcuda/btl_smcuda.c | 72 ++- opal/mca/btl/smcuda/btl_smcuda.h | 15 +- opal/mca/btl/smcuda/btl_smcuda_component.c | 16 +- opal/mca/btl/smcuda/btl_smcuda_endpoint.h | 4 +- test/datatype/ddt_benchmark.c | 4 +- 15 files changed, 219 insertions(+), 1506 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 79b739b8356..34a56f3c18b 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -136,7 +136,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, return rc; } mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, lindex, 1, local_device); - mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, 0, lindex, 0, local_device); + mca_btl_smcuda_cuda_dt_pack_clone( bml_btl->btl_endpoint, convertor, NULL, NULL, 0, lindex, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); diff --git 
a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 29ade337b69..bce80b4a592 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -9,56 +9,14 @@ #include #include -/* - * NOTE: The order of this array *MUST* match what is listed in datatype.h - * (use of designated initializers should relax this restrictions some) - */ -/* -OPAL_DECLSPEC const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PREDEFINED] = { - OPAL_DATATYPE_LOOP_SIZE, - OPAL_DATATYPE_END_LOOP_SIZE, - OPAL_DATATYPE_LB_SIZE, - OPAL_DATATYPE_UB_SIZE, - OPAL_DATATYPE_INT1_SIZE, - OPAL_DATATYPE_INT2_SIZE, - OPAL_DATATYPE_INT4_SIZE, - OPAL_DATATYPE_INT8_SIZE, - OPAL_DATATYPE_INT16_SIZE, - OPAL_DATATYPE_UINT1_SIZE, - OPAL_DATATYPE_UINT2_SIZE, - OPAL_DATATYPE_UINT4_SIZE, - OPAL_DATATYPE_UINT8_SIZE, - OPAL_DATATYPE_UINT16_SIZE, - OPAL_DATATYPE_FLOAT2_SIZE, - OPAL_DATATYPE_FLOAT4_SIZE, - OPAL_DATATYPE_FLOAT8_SIZE, - OPAL_DATATYPE_FLOAT12_SIZE, - OPAL_DATATYPE_FLOAT16_SIZE, - OPAL_DATATYPE_FLOAT_COMPLEX_SIZE, - OPAL_DATATYPE_DOUBLE_COMPLEX_SIZE, - OPAL_DATATYPE_LONG_DOUBLE_COMPLEX_SIZE, - OPAL_DATATYPE_BOOL_SIZE, - OPAL_DATATYPE_WCHAR_SIZE, - OPAL_DATATYPE_UNAVAILABLE_SIZE, -}; -*/ -/***** my variables ********/ - ddt_cuda_list_t *cuda_free_list; ddt_cuda_device_t *cuda_device; -ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; -unsigned char *pBaseBuf_GPU, *gpu_src_const, *gpu_dest_const; -unsigned char *ddt_cuda_pack_buffer, *ddt_cuda_unpack_buffer; ddt_cuda_stream_t* cuda_streams; struct iovec cuda_iov[CUDA_NB_IOV]; uint32_t cuda_iov_count; -ddt_cuda_description_dist_t description_dist_h[CUDA_MAX_NB_BLOCKS]; -ddt_cuda_description_dist_t* description_dist_d; -ddt_cuda_iov_dist_t cuda_iov_dist_h[NB_STREAMS][CUDA_MAX_NB_BLOCKS]; +ddt_cuda_iov_dist_t* cuda_iov_dist_h[NB_STREAMS]; ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; -dt_elem_desc_t* description_d; -uint8_t opal_datatype_cuda_debug; //uint8_t 
ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; @@ -202,6 +160,17 @@ static inline void cuda_list_item_merge_by_addr(ddt_cuda_list_t *list) } } +void opal_cuda_output(int output_id, const char *format, ...) +{ + if (output_id >= 0 && output_id <= OPAL_DATATYPE_CUDA_DEBUG_LEVEL) { + va_list arglist; + fprintf( stderr, "[Debug %d]: ", output_id ); + va_start(arglist, format); + vfprintf(stderr, format, arglist); + va_end(arglist); + } +} + void opal_datatype_cuda_init(void) { uint32_t i; @@ -213,7 +182,6 @@ void opal_datatype_cuda_init(void) opal_cuda_output(0, "Cannot retrieve the device being used. Drop CUDA support!\n"); return; } - printf("current device %d\n", device); cuda_free_list = init_cuda_free_list(); @@ -224,6 +192,7 @@ void opal_datatype_cuda_init(void) if (cudaMalloc((void **)(&gpu_ptr), sizeof(char)*DT_CUDA_BUFFER_SIZE) != cudaSuccess) { DT_CUDA_DEBUG( opal_cuda_output( 0, "cudaMalloc is failed in GPU %d\n", i); ); } + DT_CUDA_DEBUG ( opal_cuda_output(2, "DDT engine cudaMalloc buffer %p in GPU %d\n", gpu_ptr, i);); cudaMemset(gpu_ptr, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); cuda_device[i].gpu_buffer = gpu_ptr; @@ -241,33 +210,6 @@ void opal_datatype_cuda_init(void) cuda_device[i].buffer_used.nb_elements = 0; } - cudaMalloc((void **)&cuda_desc_d, sizeof(ddt_cuda_desc_t)); - cudaMallocHost((void **)&cuda_desc_h, sizeof(ddt_cuda_desc_t)); - printf("size cuda_desc %d\n", sizeof(ddt_cuda_desc_t)); - - // printf("malloc iov\n"); - // for (i = 0; i < IOV_ARRAY_SIZE; i++) { - // void* iov_base; - // cudaMalloc( (void **)&iov_base, sizeof(char)*IOV_LEN); - // cuda_desc_h->iov[i].iov_base = iov_base; - // cuda_desc_h->iov[i].iov_len = IOV_LEN; - // } - - cudaMalloc((void **)(&ddt_cuda_pack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); - printf("malloc cuda packing buffer, %p\n", ddt_cuda_pack_buffer); - cudaMalloc((void **)(&ddt_cuda_unpack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); - printf("malloc cuda unpacking buffer, %p\n", ddt_cuda_unpack_buffer); - - 
cuda_desc_h->iov[0].iov_base = ddt_cuda_pack_buffer; - cuda_desc_h->iov[0].iov_len = DT_CUDA_BUFFER_SIZE; - - cudaMalloc((void **)(&pBaseBuf_GPU), sizeof(char)*DT_CUDA_BUFFER_SIZE); - gpu_src_const = pBaseBuf_GPU; - gpu_dest_const = (unsigned char*)cuda_desc_h->iov[0].iov_base; - - cuda_desc_h->description_max_count = 0; - cuda_desc_h->description_count = 0; - /* init cuda stream */ cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); for (i = 0; i < NB_STREAMS; i++) { @@ -278,17 +220,11 @@ void opal_datatype_cuda_init(void) /* init cuda_iov */ cuda_iov_count = CUDA_NB_IOV; - /* init description dist array */ - cudaMalloc((void **)(&description_dist_d), sizeof(ddt_cuda_description_dist_t)*CUDA_MAX_NB_BLOCKS); - cuda_desc_h->description_dist = description_dist_d; - /* only for iov version */ for (i = 0; i < NB_STREAMS; i++) { cudaMalloc((void **)(&cuda_iov_dist_d[i]), sizeof(ddt_cuda_iov_dist_t)*CUDA_MAX_NB_BLOCKS); } - opal_datatype_cuda_debug = 1; - // /* init size for double, float, char */ // ALIGNMENT_DOUBLE = sizeof(double); // ALIGNMENT_FLOAT = sizeof(float); @@ -301,29 +237,6 @@ void opal_datatype_cuda_fini(void) { uint32_t i; - if (cuda_desc_d != NULL) { - cudaFree(cuda_desc_d); - cuda_desc_d = NULL; - } - if (cuda_desc_h->description != NULL) { - cudaFree(cuda_desc_h->description); - cuda_desc_h->description = NULL; - } - if (cuda_desc_h->description_dist != NULL) { - cudaFree(cuda_desc_h->description_dist); - cuda_desc_h->description_dist = NULL; - } - printf("free iov\n"); - if (cuda_desc_h != NULL) { - for (i = 0; i < IOV_ARRAY_SIZE; i++) { - cudaFree(cuda_desc_h->iov[i].iov_base); - cuda_desc_h->iov[i].iov_base = NULL; - } - - cudaFreeHost(cuda_desc_h); - cuda_desc_h = NULL; - } - /* destory cuda stream */ for (i = 0; i < NB_STREAMS; i++) { cudaStreamDestroy(cuda_streams->opal_cuda_stream[i]); @@ -339,8 +252,6 @@ void opal_datatype_cuda_fini(void) void opal_cuda_sync_device(void) { cudaDeviceSynchronize(); - pBaseBuf_GPU = 
gpu_src_const; - cuda_desc_h->iov[0].iov_base = (void*)gpu_dest_const; } int32_t opal_cuda_is_gpu_buffer(const void *ptr) @@ -359,15 +270,6 @@ int32_t opal_cuda_is_gpu_buffer(const void *ptr) return (memType == CU_MEMORYTYPE_DEVICE) ? 1 : 0; } -unsigned char* opal_cuda_get_gpu_pack_buffer() -{ - if (ddt_cuda_pack_buffer != NULL) { - return ddt_cuda_pack_buffer; - } else { - return NULL; - } -} - void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) { int dev_id; @@ -408,7 +310,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) cuda_list_push_head(&device->buffer_used, p); device->buffer_used_size += size; device->buffer_free_size -= size; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc GPU buffer %p, dev_id %d.\n", addr, dev_id); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc GPU buffer %p, dev_id %d.\n", addr, dev_id); ); return addr; } } @@ -448,28 +350,16 @@ void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) cuda_list_item_merge_by_addr(&device->buffer_free, ptr); device->buffer_free_size += size; device->buffer_used_size -= size; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Free GPU buffer %p.\n", addr); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Free GPU buffer %p.\n", addr); ); } void opal_dump_cuda_list(ddt_cuda_list_t *list) { ddt_cuda_buffer_t *ptr = NULL; ptr = list->head; - DT_CUDA_DEBUG( opal_cuda_output( 0, "DUMP cuda list %p, nb_elements %d\n", list, list->nb_elements); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "DUMP cuda list %p, nb_elements %d\n", list, list->nb_elements); ); while (ptr != NULL) { - DT_CUDA_DEBUG( opal_cuda_output( 0, "\titem addr %p, size %ld.\n", ptr->gpu_addr, ptr->size); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "\titem addr %p, size %ld.\n", ptr->gpu_addr, ptr->size); ); ptr = ptr->next; } } - -/* from internal.h*/ -void opal_cuda_output(int output_id, const char *format, ...) 
-{ - if (output_id >= 0 && output_id <= OPAL_DATATYPE_CUDA_DEBUG_LEVEL) { - va_list arglist; - fprintf( stderr, "[Debug %d]: ", output_id ); - va_start(arglist, format); - vfprintf(stderr, format, arglist); - va_end(arglist); - } -} diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 436eaa9aec3..94336ac6475 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -8,10 +8,6 @@ void opal_datatype_cuda_init(void); void opal_datatype_cuda_fini(void); -int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, @@ -22,11 +18,6 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor struct iovec* iov, uint32_t* out_size, size_t* max_data ); - -int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, @@ -102,7 +93,6 @@ void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); void opal_dump_cuda_list(ddt_cuda_list_t *list); -unsigned char* opal_cuda_get_gpu_pack_buffer(); } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 2102edb6a9c..160d54336d4 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -10,9 +10,9 @@ /* OPAL_CUDA */ // #define OPAL_DATATYPE_CUDA_DRY_RUN -#define OPAL_DATATYPE_CUDA_DEBUG +#define OPAL_DATATYPE_CUDA_DEBUG 1 //#define OPAL_DATATYPE_CUDA_KERNEL_TIME -#define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 0 +#define 
OPAL_DATATYPE_CUDA_DEBUG_LEVEL 2 #define OPAL_DATATYPE_CUDA_TIMING #define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D 0 #define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 @@ -40,43 +40,16 @@ #define ELAPSED_TIME(TSTART, TEND) (((TEND).tv_sec - (TSTART).tv_sec) * 1000000 + ((TEND).tv_usec - (TSTART).tv_usec)) - -typedef struct { - uint32_t description_index[200]; /* index of y direction */ - uint32_t description_local_index[200]; /* index of x direction */ - uint32_t dst_offset[200]; - uint32_t description_used; -} ddt_cuda_description_dist_t; - -typedef struct { - dt_stack_t pStack[DT_STATIC_STACK_SIZE]; - dt_elem_desc_t* description; - struct iovec iov[IOV_ARRAY_SIZE]; - uint32_t stack_pos; - uint32_t stack_size; - unsigned char* pBaseBuf; /* const */ - OPAL_PTRDIFF_TYPE lb; /* const */ - OPAL_PTRDIFF_TYPE ub; /* const */ - size_t bConverted; - size_t local_size; /* const */ - uint32_t out_size; - size_t max_data; - uint32_t description_count; - uint32_t description_max_count; - ddt_cuda_description_dist_t *description_dist; -} ddt_cuda_desc_t; - typedef struct { cudaStream_t opal_cuda_stream[NB_STREAMS]; uint32_t current_stream_id; } ddt_cuda_stream_t; typedef struct { - unsigned char* src[CUDA_IOV_MAX_TASK_PER_BLOCK]; - unsigned char* dst[CUDA_IOV_MAX_TASK_PER_BLOCK]; - uint32_t nb_elements[CUDA_IOV_MAX_TASK_PER_BLOCK]; - uint8_t element_alignment[CUDA_IOV_MAX_TASK_PER_BLOCK]; - uint32_t nb_tasks; + size_t src_offset; + size_t dst_offset; + uint32_t nb_elements; + uint8_t element_alignment; } ddt_cuda_iov_dist_t; typedef struct ddt_cuda_buffer{ @@ -103,19 +76,11 @@ typedef struct { extern ddt_cuda_list_t *cuda_free_list; extern ddt_cuda_device_t *cuda_device; -extern ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; -extern unsigned char* pBaseBuf_GPU; -extern unsigned char *ddt_cuda_pack_buffer, *ddt_cuda_unpack_buffer; -extern size_t ddt_cuda_buffer_space; extern ddt_cuda_stream_t* cuda_streams; extern struct iovec cuda_iov[CUDA_NB_IOV]; extern uint32_t cuda_iov_count; -extern 
ddt_cuda_description_dist_t description_dist_h[CUDA_MAX_NB_BLOCKS]; -extern ddt_cuda_description_dist_t* description_dist_d; -extern ddt_cuda_iov_dist_t cuda_iov_dist_h[NB_STREAMS][CUDA_MAX_NB_BLOCKS]; +extern ddt_cuda_iov_dist_t* cuda_iov_dist_h[NB_STREAMS]; extern ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; -extern dt_elem_desc_t* description_d; -extern uint8_t opal_datatype_cuda_debug; //extern uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; @@ -126,24 +91,6 @@ extern uint8_t opal_datatype_cuda_debug; #define DBGPRINT(fmt, ...) #endif -__device__ void pack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ); - -__device__ void unpack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ); - -__global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc); - -__global__ void opal_generic_simple_pack_cuda_kernel_v2(ddt_cuda_desc_t* cuda_desc); - -__global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_desc); - __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, @@ -156,11 +103,10 @@ __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, unsigned char* source, unsigned char* destination ); -// __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_description_dist_t* desc_dist_d, dt_elem_desc_t* desc_d, uint32_t required_blocks, struct iovec* iov, unsigned char* pBaseBuf); -__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist); +__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); -__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* 
cuda_iov_dist); +__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); __global__ void opal_empty_kernel(uint32_t copy_loops, size_t size, @@ -173,7 +119,7 @@ __global__ void opal_empty_kernel_noargs(); void opal_cuda_output(int output_id, const char *format, ...); #if defined (OPAL_DATATYPE_CUDA_DEBUG) -#define DT_CUDA_DEBUG( INST ) if (opal_datatype_cuda_debug) { INST } +#define DT_CUDA_DEBUG( INST ) if (OPAL_DATATYPE_CUDA_DEBUG) { INST } #else #define DT_CUDA_DEBUG( INST ) #endif diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 79281adf6cb..a58b831b78b 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -5,529 +5,6 @@ #include #include -__device__ void pack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) -{ - ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); - ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); - unsigned char* _src_disp = (*SOURCE) + _end_loop->first_elem_disp; - uint32_t _copy_loops = *(COUNT); - uint32_t _i, tid, num_threads; - unsigned char* _destination = *DESTINATION; -// unsigned char* _source = _src_disp; - uint32_t gap, nb_elements; - double *_source_tmp, *_destination_tmp, *_src_disp_tmp; - - tid = threadIdx.x + blockIdx.x * blockDim.x; - num_threads = gridDim.x * blockDim.x; - - if( (_copy_loops * _end_loop->size) > *(SPACE) ) - _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); - -// num_task_per_thread = _copy_loops / num_threads; -// residue = _copy_loops % num_threads; -// if ( ((tid < residue) && (residue != 0)) || (residue == 0) ) { -// num_task_per_thread += residue == 0 ? 
0 : 1; -// start_index = tid * num_task_per_thread; -// } else { -// start_index = residue * (num_task_per_thread+1) + (tid-residue) * num_task_per_thread; -// } -// -// end_index = start_index + num_task_per_thread; -// DBGPRINT("tid %d, start %d, end %d, num_task_per_thread %d, copy_loops %d\n", tid, start_index, end_index, num_task_per_thread, _copy_loops); -// for( _i = start_index; _i < end_index; _i++ ) { -// // OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _loop->extent, (CONVERTOR)->pBaseBuf, -// // (CONVERTOR)->pDesc, (CONVERTOR)->count ); -// _source = _src_disp + _i * _loop->extent; -// _destination = *DESTINATION + _i * _end_loop->size; -// DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d\n", -// tid, _destination, _source, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size), _i ); -// // MEMCPY_CSUM( *(DESTINATION), _source, _end_loop->size, (CONVERTOR) ); -// #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) -// // memcpy(_destination, _source, _end_loop->size); -// _source_tmp = (double *)_source; -// _destination_tmp = (double *)_destination; -// for (_j = 0; _j < _end_loop->size/8; _j++) -// { -// *_destination_tmp = *_source_tmp; -// _destination_tmp ++; -// _source_tmp ++; -// } -// #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ -// } - - gap = (_loop->extent - _end_loop->size) / 8; - nb_elements = _end_loop->size / 8; - _src_disp_tmp = (double*)_src_disp; - _destination_tmp = (double*)_destination; - _destination_tmp += tid; - - __syncthreads(); - - for (_i = tid; _i < _copy_loops*nb_elements; _i+=num_threads) { - _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (_i % nb_elements == 0 ) { - DBGPRINT("tid %d, pack 3. 
memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", - tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); - } - // if (_i / nb_elements ==1 && tid == 0 ) { - // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", - // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); - // } -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ -#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) - *_destination_tmp = *_source_tmp; -#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ - _destination_tmp += num_threads; - - } - *(SOURCE) = _src_disp + _copy_loops*_loop->extent - _end_loop->first_elem_disp; - *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; - *(SPACE) -= _copy_loops * _end_loop->size; - *(COUNT) -= _copy_loops; - -} - -__device__ void pack_predefined_data_cuda_kernel( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) -{ - uint32_t _copy_count = *(COUNT); - size_t _copy_blength; - ddt_elem_desc_t* _elem = &((ELEM)->elem); - unsigned char* _src_disp = (*SOURCE) + _elem->disp; - uint32_t _i, tid, num_threads; - unsigned char* _destination = *DESTINATION; - uint32_t gap, nb_elements; - double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; - - _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; - if( (_copy_count * _copy_blength) > *(SPACE) ) { - _copy_count = (uint32_t)(*(SPACE) / _copy_blength); - if( 0 == _copy_count ) return; /* nothing to do */ - } - - tid = threadIdx.x + blockIdx.x * blockDim.x; - num_threads = gridDim.x * blockDim.x; - - gap = (_elem->extent - _copy_blength) / 8; - nb_elements = _copy_blength / 8; - _src_disp_tmp = (double*)_src_disp; - _destination_tmp = (double*)_destination; - _destination_tmp += tid; - - __syncthreads(); - - for (_i = 
tid; _i < _copy_count*nb_elements; _i+=num_threads) { - _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (_i == 0 ) { - DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, count %d\n", - tid, _destination_tmp, _source_tmp, (unsigned long)_copy_blength*_copy_count, (unsigned long)(*(SPACE) - _i/nb_elements * _copy_blength), _i/nb_elements, _copy_count ); - } - // if (_i / nb_elements ==1 && tid == 0 ) { - // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", - // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); - // } -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ -#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) - *_destination_tmp = *_source_tmp; -#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ - _destination_tmp += num_threads; - - } - - _copy_blength *= _copy_count; - *(SOURCE) = _src_disp + _elem->extent*_copy_count - _elem->disp; - *(DESTINATION) += _copy_blength; - *(SPACE) -= _copy_blength; - *(COUNT) -= _copy_count; - -} - -__device__ void pack_predefined_data_cuda_kernel_v2( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char* SOURCE, - unsigned char* DESTINATION, - size_t* SPACE, - uint32_t local_index, - uint32_t dst_offset ) -{ - uint32_t _copy_count = *(COUNT); - size_t _copy_blength; - ddt_elem_desc_t* _elem = &((ELEM)->elem); - unsigned char* _src_disp = (SOURCE) + _elem->disp; - uint32_t local_tid; - unsigned char* _destination = DESTINATION; - double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; - - _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; - // if( (_copy_count * _copy_blength) > *(SPACE) ) { - // _copy_count = (uint32_t)(*(SPACE) / _copy_blength); - // if( 0 == _copy_count ) return; /* nothing to do */ - // } - - local_tid = threadIdx.x + local_index * blockDim.x; - _src_disp_tmp = 
(double*)_src_disp; - _destination_tmp = (double*)_destination + dst_offset; - - if (local_tid < _copy_count) { - _source_tmp = _src_disp_tmp + local_tid; - _destination_tmp += local_tid; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (local_tid == 0 ) { - DBGPRINT("tid %d, local_index %d, pack 1. memcpy( %p, %p, %lu ) => space %lu, blockIdx %d, count %d, destination %p, offset %d\n", - local_tid, local_index, _destination_tmp, _source_tmp, (unsigned long)_copy_blength*_copy_count, (unsigned long)(*(SPACE) - local_tid * _copy_blength), blockIdx.x, _copy_count, _destination, dst_offset ); - } -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ -#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) - *_destination_tmp = *_source_tmp; -#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ - } -} - -__global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) -{ - dt_stack_t *pStack; /* pointer to the position on the stack */ - uint32_t pos_desc; /* actual position in the description of the derived datatype */ - uint32_t count_desc; /* the number of items already done in the actual pos_desc */ - size_t total_packed = 0; /* total amount packed this time */ - dt_elem_desc_t* description; - dt_elem_desc_t* pElem; - unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; - size_t iov_len_local; - uint32_t iov_count; - uint32_t stack_pos; - struct iovec* iov; - - OPAL_PTRDIFF_TYPE extent; - uint32_t out_size; - - // __shared__ ddt_cuda_desc_t cuda_desc_b; - __shared__ dt_stack_t shared_pStack[DT_STATIC_STACK_SIZE]; - - if (threadIdx.x < DT_STATIC_STACK_SIZE) { - shared_pStack[threadIdx.x] = cuda_desc->pStack[threadIdx.x]; - } - __syncthreads(); - - - // load cuda descriptor from constant memory - iov = cuda_desc->iov; - pStack = shared_pStack; - description = cuda_desc->description; - stack_pos = cuda_desc->stack_pos; - pBaseBuf = cuda_desc->pBaseBuf; - extent = cuda_desc->ub - cuda_desc->lb; - out_size = cuda_desc->out_size; - - pStack = pStack + stack_pos; - pos_desc = pStack->index; - conv_ptr = 
pBaseBuf + pStack->disp; - count_desc = (uint32_t)pStack->count; - pStack--; - stack_pos--; - pElem = &(description[pos_desc]); - -// printf("pack start pos_desc %d count_desc %d disp %ld, stack_pos %d pos_desc %d count_desc %d disp %ld\n", -// pos_desc, count_desc, (long)(conv_ptr - pBaseBuf), stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp); - - for( iov_count = 0; iov_count < out_size; iov_count++ ) { - iov_ptr = (unsigned char *) iov[iov_count].iov_base; - iov_len_local = iov[iov_count].iov_len; - DBGPRINT("iov_len_local %lu, flags %d, types %d, count %d\n", iov_len_local, description->elem.common.flags, description->elem.common.type, description->elem.count); - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - /* now here we have a basic datatype */ - // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - // conv_ptr, iov_ptr, iov_len_local ); - pack_predefined_data_cuda_kernel(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); - if( 0 == count_desc ) { /* completed */ - conv_ptr = pBaseBuf + pStack->disp; - pos_desc++; /* advance to the next data */ - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - continue; - } - goto complete_loop; - } - if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - // DO_DEBUG( opal_output( 0, "pack end_loop count %d stack_pos %d" - // " pos_desc %d disp %ld space %lu\n", - // (int)pStack->count, pConvertor->stack_pos, - // pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); - if (threadIdx.x == 0) { - (pStack->count)--; - } - __syncthreads(); - - if( (pStack->count) == 0 ) { /* end of loop */ - if( 0 == stack_pos ) { - /* we lie about the size of the next element in order to - * make sure we exit the main loop. 
- */ - out_size = iov_count; - goto complete_loop; /* completed */ - } - stack_pos--; - pStack--; - pos_desc++; - } else { - pos_desc = pStack->index + 1; - if (threadIdx.x == 0) { - if( pStack->index == -1 ) { - pStack->disp += extent; - } else { - // assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); - pStack->disp += description[pStack->index].loop.extent; - } - } - __syncthreads(); - } - conv_ptr = pBaseBuf + pStack->disp; - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - // DO_DEBUG( opal_output( 0, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", - // (int)pStack->count, pConvertor->stack_pos, pos_desc, - // count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); - } - if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { - OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; - if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - pack_contiguous_loop_cuda_kernel( pElem, &count_desc, - &conv_ptr, &iov_ptr, &iov_len_local ); - if( 0 == count_desc ) { /* completed */ - pos_desc += pElem->loop.items + 1; - goto update_loop_description; - } - /* Save the stack with the correct last_count value. 
*/ - } - local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; - - PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, - pStack->disp + local_disp); - - pos_desc++; - update_loop_description: /* update the current state */ - conv_ptr = pBaseBuf + pStack->disp; - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); - continue; - } - } - complete_loop: - if (threadIdx.x == 0) { - iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ - } - __syncthreads(); - total_packed += iov[iov_count].iov_len; - } - - // if (tid == 0) { - // cuda_desc->max_data = total_packed; - // cuda_desc->out_size = iov_count; - // // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ - // // if( cuda_desc->bConverted == cuda_desc->local_size ) { - // // cuda_desc->stack_pos = stack_pos; - // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); - // // return; - // // } - // // /* Save the global position for the next round */ - // // PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, - // // conv_ptr - pBaseBuf ); - // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); - // // cuda_desc->stack_pos = stack_pos; - // } - - return; -} - -__global__ void opal_generic_simple_pack_cuda_kernel_v2(ddt_cuda_desc_t* cuda_desc) -{ - dt_stack_t *pStack; /* pointer to the position on the stack */ - uint32_t pos_desc; /* actual position in the description of the derived datatype */ - uint32_t count_desc; /* the number of items already done in the actual pos_desc */ - size_t total_packed = 0; /* total amount packed this time */ - dt_elem_desc_t* description; - dt_elem_desc_t* pElem; - unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; - size_t iov_len_local; - uint32_t iov_count; - uint32_t stack_pos; - struct iovec* iov; - 
ddt_cuda_description_dist_t* description_dist_d; - uint32_t ct = 0, local_index, dst_offset; - - OPAL_PTRDIFF_TYPE extent; - uint32_t out_size; - - // __shared__ ddt_cuda_desc_t cuda_desc_b; - __shared__ dt_stack_t shared_pStack[DT_STATIC_STACK_SIZE]; - - if (threadIdx.x < DT_STATIC_STACK_SIZE) { - shared_pStack[threadIdx.x] = cuda_desc->pStack[threadIdx.x]; - } - __syncthreads(); - - - // load cuda descriptor from constant memory - iov = cuda_desc->iov; - pStack = shared_pStack; - description = cuda_desc->description; - stack_pos = cuda_desc->stack_pos; - pBaseBuf = cuda_desc->pBaseBuf; - extent = cuda_desc->ub - cuda_desc->lb; - out_size = cuda_desc->out_size; - description_dist_d = cuda_desc->description_dist; - - pStack = pStack + stack_pos; - pos_desc = description_dist_d[blockIdx.x].description_index[ct]; - local_index = description_dist_d[blockIdx.x].description_local_index[ct]; - dst_offset = description_dist_d[blockIdx.x].dst_offset[ct]; - pElem = &(description[pos_desc]); - count_desc = pElem->elem.count; - conv_ptr = pBaseBuf + pStack->disp; - pStack--; - stack_pos--; - -// printf("pack start pos_desc %d count_desc %d disp %ld, stack_pos %d pos_desc %d count_desc %d disp %ld\n", -// pos_desc, count_desc, (long)(conv_ptr - pBaseBuf), stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp); - - for( iov_count = 0; iov_count < out_size; iov_count++ ) { - iov_ptr = (unsigned char *) iov[iov_count].iov_base; - iov_len_local = iov[iov_count].iov_len; -// DBGPRINT("iov_len_local %lu, flags %d, types %d, count %d\n", iov_len_local, description->elem.common.flags, description->elem.common.type, description->elem.count); - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - /* now here we have a basic datatype */ - // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - // conv_ptr, iov_ptr, iov_len_local ); - pack_predefined_data_cuda_kernel_v2(pElem, &count_desc, conv_ptr, iov_ptr, &iov_len_local, local_index, 
dst_offset); - count_desc = 0; - if( 0 == count_desc ) { /* completed */ - conv_ptr = pBaseBuf + pStack->disp; - ct ++; - if (ct >= description_dist_d[blockIdx.x].description_used) { - pos_desc = cuda_desc->description_count-1; - } else { - pos_desc = description_dist_d[blockIdx.x].description_index[ct]; /* advance to the next data */ - local_index = description_dist_d[blockIdx.x].description_local_index[ct]; - dst_offset = description_dist_d[blockIdx.x].dst_offset[ct]; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (pos_desc > (cuda_desc->description_count - 1)) { - printf("EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEERROR, block %d, thread %d, pos_desc %d\n", blockIdx.x, threadIdx.x, pos_desc); - } -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (pos_desc < (cuda_desc->description_count - 1) && !(pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA)) { - printf("I get a error block %d, thread %d, pos_desc %d\n", blockIdx.x, threadIdx.x, pos_desc); - } -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - continue; - } - goto complete_loop; - } - if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - // DO_DEBUG( opal_output( 0, "pack end_loop count %d stack_pos %d" - // " pos_desc %d disp %ld space %lu\n", - // (int)pStack->count, pConvertor->stack_pos, - // pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); - if (threadIdx.x == 0) { - (pStack->count)--; - } - __syncthreads(); - - if( (pStack->count) == 0 ) { /* end of loop */ - if( 0 == stack_pos ) { - /* we lie about the size of the next element in order to - * make sure we exit the main loop. 
- */ - out_size = iov_count; - goto complete_loop; /* completed */ - } - stack_pos--; - pStack--; - pos_desc++; - } else { - pos_desc = pStack->index + 1; - if (threadIdx.x == 0) { - if( pStack->index == -1 ) { - pStack->disp += extent; - } else { - // assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); - pStack->disp += description[pStack->index].loop.extent; - } - } - __syncthreads(); - } - conv_ptr = pBaseBuf + pStack->disp; - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - // DO_DEBUG( opal_output( 0, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", - // (int)pStack->count, pConvertor->stack_pos, pos_desc, - // count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); - } - if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { - OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; - if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - pack_contiguous_loop_cuda_kernel( pElem, &count_desc, - &conv_ptr, &iov_ptr, &iov_len_local ); - if( 0 == count_desc ) { /* completed */ - pos_desc += pElem->loop.items + 1; - goto update_loop_description; - } - /* Save the stack with the correct last_count value. 
*/ - } - local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; - - PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, - pStack->disp + local_disp); - - pos_desc++; - update_loop_description: /* update the current state */ - conv_ptr = pBaseBuf + pStack->disp; - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); - continue; - } - } - complete_loop: - if (threadIdx.x == 0) { - iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ - } - __syncthreads(); - total_packed += iov[iov_count].iov_len; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (ct != description_dist_d[blockIdx.x].description_used) { - printf("I am at the end, but error,ct %d\n", ct); - } -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - - // if (tid == 0) { - // cuda_desc->max_data = total_packed; - // cuda_desc->out_size = iov_count; - // // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ - // // if( cuda_desc->bConverted == cuda_desc->local_size ) { - // // cuda_desc->stack_pos = stack_pos; - // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); - // // return; - // // } - // // /* Save the global position for the next round */ - // // PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, - // // conv_ptr - pBaseBuf ); - // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); - // // cuda_desc->stack_pos = stack_pos; - // } - - return; -} - __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, @@ -593,10 +70,10 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, // // } -__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist) +__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* 
cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, _copy_count; - unsigned char *src, *dst; + size_t src_offset, dst_offset; uint8_t alignment; unsigned char *_source_tmp, *_destination_tmp; @@ -609,18 +86,18 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src = cuda_iov_dist[blockIdx.x].src[i]; - dst = cuda_iov_dist[blockIdx.x].dst[i]; - _copy_count = cuda_iov_dist[blockIdx.x].nb_elements[i]; - alignment = cuda_iov_dist[blockIdx.x].element_alignment[i]; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; + _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; + alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; // if (threadIdx.x == 0) { // printf("block %d, ali %d, nb_element %d\n", blockIdx.x, cuda_iov_dist[blockIdx.x].element_alignment[i], _copy_count); // } if (threadIdx.x < _copy_count) { - _source_tmp = src + threadIdx.x * alignment; - _destination_tmp = dst + threadIdx.x * alignment; + _source_tmp = source_base + src_offset + threadIdx.x * alignment; + _destination_tmp = destination_base + dst_offset + threadIdx.x * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 00c7812b605..efc0c7af957 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -7,169 +7,6 @@ #include #include -int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) -{ - uint32_t i; - dt_elem_desc_t* description; - dt_elem_desc_t* pElem; - const 
opal_datatype_t *pData = pConvertor->pDesc; - uint32_t tasks_per_block, num_blocks, thread_per_block; - dt_stack_t* pStack; - - //return -99; - - description = pConvertor->use_desc->desc; - - cuda_desc_h->stack_pos = pConvertor->stack_pos; -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - cuda_desc_h->pBaseBuf = pConvertor->pBaseBuf; -#else - cuda_desc_h->pBaseBuf = pBaseBuf_GPU; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - cuda_desc_h->lb = pData->lb; - cuda_desc_h->ub = pData->ub; - cuda_desc_h->out_size = *out_size; - cuda_desc_h->max_data = *max_data; - cuda_desc_h->bConverted = pConvertor->bConverted; - cuda_desc_h->local_size = pConvertor->local_size; - cuda_desc_h->stack_size = pConvertor->stack_size; - - for (i = 0; i < pConvertor->stack_size; i++) { - cuda_desc_h->pStack[i] = pConvertor->pStack[i]; - } - if (cuda_desc_h->description_max_count != 0) { - if (cuda_desc_h->description_max_count >= (pConvertor->use_desc->used+1)) { - cuda_desc_h->description_count = pConvertor->use_desc->used+1; - } else { - cudaFree(cuda_desc_h->description); - cuda_desc_h->description = NULL; - cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); - cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; - cuda_desc_h->description_count = pConvertor->use_desc->used+1; - } - - } else { - cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); - cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; - cuda_desc_h->description_count = pConvertor->use_desc->used+1; - } - cudaMemcpy(cuda_desc_h->description, description, sizeof(dt_elem_desc_t)*(cuda_desc_h->description_count), cudaMemcpyHostToDevice); - printf("description ct %d\n", cuda_desc_h->description_count); - - // for (i = 0; i < pConvertor->use_desc->used+1; i++) { - // cuda_desc_h->description[i] = description[i]; - // } - - DBGPRINT("stack_size %d\n", pConvertor->stack_size); - - DBGPRINT("flags %d, types 
%d, count %d\n", description->elem.common.flags, description->elem.common.type, description->elem.count); - - for (i = 0; i < *out_size; i++) { -#if defined (OPAL_DATATYPE_CUDA_DRY_RUN) - cuda_desc_h->iov[i].iov_base = iov[i].iov_base; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - cuda_desc_h->iov[i].iov_len = iov[i].iov_len; - } - - pStack = pConvertor->pStack + pConvertor->stack_pos; - thread_per_block = CUDA_WARP_SIZE * 5; - tasks_per_block = thread_per_block * TASK_PER_THREAD; - num_blocks = ((uint32_t)pStack->count + tasks_per_block - 1) / tasks_per_block; - num_blocks = 512; - - /***/ - uint32_t pos_desc, count_desc, current_block, task_iteration, nb_blocks_per_description, j, dst_offset; - pos_desc = pStack->index; - pElem = &(description[pos_desc]); - count_desc = (uint32_t)pStack->count; - current_block = 0; - task_iteration = 0; - dst_offset = 0; - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - for (i = 0; i < nb_blocks_per_description; i++) { - description_dist_h[current_block].description_index[task_iteration] = pos_desc; - description_dist_h[current_block].description_local_index[task_iteration] = i; - description_dist_h[current_block].dst_offset[task_iteration] = dst_offset; - description_dist_h[current_block].description_used = task_iteration + 1; - if ( (i+1) * thread_per_block <= count_desc) { - dst_offset += thread_per_block; - } else { - dst_offset += thread_per_block - ((i+1)*thread_per_block - count_desc); - } - current_block += 1; - if (current_block >= num_blocks) { - current_block = 0; - task_iteration ++; - } - } - pos_desc ++; - pElem = &(description[pos_desc]); - count_desc = pElem->elem.count; - } - if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { - break; - } - } - - // for (i = 0; i < num_blocks; i++) { - // printf("block %d\t, used %d\n", i, description_dist_h[i].description_used); - // for (j = 0; j < 
description_dist_h[i].description_used; j++) { - // pos_desc = description_dist_h[i].description_index[j]; - // pElem = &(description[pos_desc]); - // printf("i %d\t, descp_pos %d\t, local_index %d\t, count %d\t, dst offset %d\n", j, description_dist_h[i].description_index[j], description_dist_h[i].description_local_index[j], pElem->elem.count, description_dist_h[i].dst_offset[j]); - // } - // } - - cudaMemcpy(cuda_desc_h->description_dist, description_dist_h, sizeof(ddt_cuda_description_dist_t)*(num_blocks), cudaMemcpyHostToDevice); - /***/ - - cudaMemcpy(cuda_desc_d, cuda_desc_h, sizeof(ddt_cuda_desc_t), cudaMemcpyHostToDevice); - - printf("launch pack kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*thread_per_block); - opal_generic_simple_pack_cuda_kernel_v2<<>>(cuda_desc_d); -#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) - size_t position = pConvertor->pDesc->size; -// opal_convertor_set_position_nocheck(pConvertor, &position); -#endif - cudaDeviceSynchronize(); - - return 1; - - -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - return -99; -#else - // /* copy stack and description data back to CPU */ - // cudaMemcpy(cuda_desc_h, cuda_desc_d, sizeof(ddt_cuda_desc_t), cudaMemcpyDeviceToHost); - // - // for (i = 0; i < pConvertor->stack_size; i++) { - // pConvertor->pStack[i] = cuda_desc_h->pStack[i]; - // } - // - // pConvertor->stack_pos = cuda_desc_h->stack_pos; - // *out_size = cuda_desc_h->out_size; - // *max_data = cuda_desc_h->max_data; - // pConvertor->bConverted = cuda_desc_h->bConverted; - // pConvertor->local_size = cuda_desc_h->local_size; - // - // for (i = 0; i < *out_size; i++) { - // iov[i].iov_len = cuda_desc_h->iov[i].iov_len; - // } - // - if( pConvertor->bConverted == pConvertor->local_size ) { - // pConvertor->flags |= CONVERTOR_COMPLETED; - return 1; - } - - return 0; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - -} int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConvertor, 
struct iovec* iov, @@ -396,7 +233,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", (void*)pConvertor, (void*)pConvertor->pBaseBuf, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ); @@ -414,7 +251,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert pConvertor->stack_pos--; pElem = &(description[pos_desc]); - DT_CUDA_DEBUG( opal_cuda_output( 1, "pack start pos_desc %d count_desc %d disp %ld\n" + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack start pos_desc %d count_desc %d disp %ld\n" "stack_pos %d pos_desc %d count_desc %d disp %ld\n", pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); @@ -475,7 +312,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - DT_CUDA_DEBUG( opal_cuda_output( 2, "pack end_loop count %d stack_pos %d" + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack end_loop count %d stack_pos %d" " pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); @@ -501,7 +338,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - DT_CUDA_DEBUG( opal_cuda_output( 2, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp 
%ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); } @@ -537,7 +374,6 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert complete_loop: iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_packed += iov[iov_count].iov_len; - // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -547,15 +383,15 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); ); #endif } *max_data = total_packed; pConvertor->bConverted += total_packed; /* update the already converted bytes */ *out_size = iov_count; + DT_CUDA_DEBUG( opal_cuda_output( 2, "Pack total packed %lu\n", total_packed); ); if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { printf("free\n"); opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); @@ -566,7 +402,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert /* Save the global position for the next round */ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, conv_ptr - pConvertor->pBaseBuf ); - DT_CUDA_DEBUG( opal_cuda_output( 2, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack save stack stack_pos %d pos_desc %d count_desc %d disp 
%ld\n", pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } @@ -589,15 +425,11 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Pack using contiguous_loop_cuda\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); -#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) - // _source = pBaseBuf_GPU; - // _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; -#endif #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -625,7 +457,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: vector packing in %ld microsec\n", total_time ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector packing in %ld microsec\n", total_time ); ); #endif } @@ -650,7 +482,7 @@ void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda_pipeline\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Pack using contiguous_loop_cuda_pipeline\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); @@ -696,7 +528,7 @@ void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: vector packing in %ld microsec\n", total_time ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector packing in %ld microsec\n", total_time ); ); #endif } @@ -718,7 +550,7 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda_memcpy2d\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 
2, "Pack using contiguous_loop_cuda_memcpy2d\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); @@ -741,7 +573,7 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: vector packing with memcpy2d in %ld microsec\n", total_time ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector packing with memcpy2d in %ld microsec\n", total_time ); ); #endif } @@ -764,7 +596,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda_zerocopy\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Pack using contiguous_loop_cuda_zerocopy\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); @@ -797,7 +629,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: vector packing in %ld microsec\n", total_time ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector packing in %ld microsec\n", total_time ); ); #endif } @@ -810,16 +642,16 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, residue_desc; uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec, dst_offset; - unsigned char *destination, *destination_tmp; + unsigned char *destination, *destination_base, *source_base; size_t total_packed, total_converted; int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; uint32_t convertor_flags; - dt_elem_desc_t* description; - dt_elem_desc_t* pElem; - dt_stack_t* pStack; +// dt_elem_desc_t* description; +// dt_elem_desc_t* 
pElem; +// dt_stack_t* pStack; uint8_t alignment, orig_alignment; - int32_t orig_stack_index; +// int32_t orig_stack_index; ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; @@ -829,12 +661,11 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor long total_time, move_time; #endif - DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype PACKING using iovec\n"); ); - - description = pConvertor->use_desc->desc; + /*description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); -// printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); + printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); + */ // assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); @@ -869,24 +700,19 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } transfer_required = 1; free_required = 1; -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - destination = (unsigned char*)iov[0].iov_base; -#else destination = pConvertor->gpu_buffer_ptr; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ } } - destination_tmp = destination; - - DT_CUDA_DEBUG ( opal_cuda_output(0, "Pack GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); cuda_iov_count = 1000;//CUDA_NB_IOV; total_packed = 0; total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; - orig_stack_index = pStack->index; + // orig_stack_index = pStack->index; + destination_base = destination; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start_total); @@ -896,12 +722,12 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* 
pConvertor GET_TIME(start); #endif complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "PACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); #endif dst_offset = 0; @@ -914,7 +740,8 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor current_block = 0; task_iteration = 0; cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; - cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; + cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; + source_base = (unsigned char*)cuda_iov[0].iov_base; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -924,11 +751,12 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } for (i = 0; i < cuda_iov_count; i++) { - pElem = &(description[orig_stack_index+i]); + /* pElem = &(description[orig_stack_index+i]);*/ if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { - orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; length_per_iovec = buffer_size / orig_alignment * orig_alignment; buffer_isfull = 1; } @@ -949,12 +777,11 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor 
count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "PACKING description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[current_block].src[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; - cuda_iov_dist_h_current[current_block].dst[task_iteration] = destination; - cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = alignment; - cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = thread_per_block;// * sizeof(double); } else { @@ -963,9 +790,8 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], 
cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); - current_block += 1; + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; if (current_block >= nb_blocks) { current_block = 0; @@ -976,18 +802,17 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor /* handle residue */ if (residue_desc != 0) { - orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; - cuda_iov_dist_h_current[current_block].src[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; - cuda_iov_dist_h_current[current_block].dst[task_iteration] = destination; - cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; - cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = 
(length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); - current_block += 1; + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; if (current_block >= nb_blocks) { current_block = 0; @@ -1004,11 +829,11 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_tmp, total_time, cuda_streams->current_stream_id, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, 
sizeof(ddt_cuda_iov_dist_t)*(nb_blocks), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); - opal_generic_simple_pack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current); + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); + opal_generic_simple_pack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); cuda_streams->current_stream_id ++; cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; @@ -1023,21 +848,20 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor GET_TIME(start); #endif convertor_flags = pConvertor->flags; - orig_stack_index = pStack->index; +// orig_stack_index = pStack->index; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "PACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); #endif } - cudaDeviceSynchronize(); - /* for (i = 0; i < NB_STREAMS; i++) { + for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - }*/ + } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -1048,7 +872,7 @@ int32_t 
opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); move_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); #endif // float *vtmp = (float *)iov[0].iov_base; // DT_CUDA_DEBUG ( opal_cuda_output(0, "packed iov buffer, total packed %d\n", total_packed); ); @@ -1060,12 +884,12 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor iov[0].iov_len = total_packed; *max_data = total_packed; *out_size = 1; - DT_CUDA_DEBUG ( opal_cuda_output(0, "PACKING total packed %d\n", total_packed); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); total_time = ELAPSED_TIME( start_total, end_total ); - printf( "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); #endif if( pConvertor->bConverted == pConvertor->local_size ) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 3303e6fe9f5..2ea3bb59885 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -5,257 +5,11 @@ #include #include -__device__ void unpack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) -{ - ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); - ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); - unsigned char* _dst_disp = 
(*DESTINATION) + _end_loop->first_elem_disp; - uint32_t _copy_loops = *(COUNT); - uint32_t _i, tid, num_threads; - unsigned char* _source = *SOURCE; -// unsigned char* _source = _src_disp; - uint32_t gap, nb_elements; - double *_source_tmp, *_destination_tmp, *_dst_disp_tmp;; - - tid = threadIdx.x + blockIdx.x * blockDim.x; - num_threads = gridDim.x * blockDim.x; - - if( (_copy_loops * _end_loop->size) > *(SPACE) ) - _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); - - gap = (_loop->extent - _end_loop->size) / 8; - nb_elements = _end_loop->size / 8; - _dst_disp_tmp = (double*)_dst_disp; - _source_tmp = (double*)_source; - _destination_tmp = _dst_disp_tmp + tid; - _source_tmp += tid; - - __syncthreads(); - for (_i = tid; _i < _copy_loops*nb_elements; _i+=num_threads) { - _destination_tmp = _dst_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (_i % nb_elements == 0 ) { - DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", - tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); - } - // if (_i / nb_elements ==1 && tid == 0 ) { - // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", - // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); - // } -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ -#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) - *_destination_tmp = *_source_tmp; -#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ - _source_tmp += num_threads; -// _source_tmp += num_threads; - - } - *(DESTINATION) = _dst_disp + _copy_loops*_loop->extent - _end_loop->first_elem_disp; - *(SOURCE) = *(SOURCE) + _copy_loops * _end_loop->size; - *(SPACE) -= _copy_loops * _end_loop->size; - *(COUNT) -= _copy_loops; - - __syncthreads(); -} - -__global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) -{ - dt_stack_t* pStack; /* pointer to the position on the stack */ - uint32_t pos_desc; /* actual position in the description of the derived datatype */ - uint32_t count_desc; /* the number of items already done in the actual pos_desc */ - size_t total_unpacked = 0; /* total size unpacked this time */ - dt_elem_desc_t* description; - dt_elem_desc_t* pElem; - unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; - size_t iov_len_local; - uint32_t iov_count; - uint32_t stack_pos; - struct iovec* iov; - - OPAL_PTRDIFF_TYPE lb; - OPAL_PTRDIFF_TYPE ub; - uint32_t out_size; - uint32_t tid; - - tid = threadIdx.x + blockIdx.x * blockDim.x; - - // __shared__ ddt_cuda_desc_t cuda_desc_b; - __shared__ dt_stack_t shared_pStack[DT_STATIC_STACK_SIZE]; - - if (threadIdx.x < DT_STATIC_STACK_SIZE) { - shared_pStack[threadIdx.x] = cuda_desc->pStack[threadIdx.x]; - } - __syncthreads(); - - // load cuda descriptor from constant memory - iov = cuda_desc->iov; - pStack = shared_pStack; - description = cuda_desc->description; - stack_pos = cuda_desc->stack_pos; - pBaseBuf = cuda_desc->pBaseBuf; - lb = cuda_desc->lb; - ub = cuda_desc->ub; - out_size = cuda_desc->out_size; - - /* For the first step we have to add both displacement to the source. After in the - * main while loop we will set back the source_base to the correct value. 
This is - * due to the fact that the convertor can stop in the middle of a data with a count - */ - pStack = pStack + stack_pos; - pos_desc = pStack->index; - conv_ptr = pBaseBuf + pStack->disp; - count_desc = (uint32_t)pStack->count; - pStack--; - stack_pos--; - pElem = &(description[pos_desc]); - - - for( iov_count = 0; iov_count < out_size; iov_count++ ) { - iov_ptr = (unsigned char *) iov[iov_count].iov_base; - iov_len_local = iov[iov_count].iov_len; - // if( 0 != pConvertor->partial_length ) { - // size_t element_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; - // size_t missing_length = element_length - pConvertor->partial_length; - // - // assert( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ); - // COMPUTE_CSUM( iov_ptr, missing_length, pConvertor ); - // opal_unpack_partial_datatype( pConvertor, pElem, - // iov_ptr, - // pConvertor->partial_length, element_length - pConvertor->partial_length, - // &conv_ptr ); - // --count_desc; - // if( 0 == count_desc ) { - // conv_ptr = pConvertor->pBaseBuf + pStack->disp; - // pos_desc++; /* advance to the next data */ - // UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - // } - // iov_ptr += missing_length; - // iov_len_local -= missing_length; - // pConvertor->partial_length = 0; /* nothing more inside */ - // } - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - /* now here we have a basic datatype */ - // UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - // iov_ptr, conv_ptr, iov_len_local ); - if( 0 == count_desc ) { /* completed */ - conv_ptr = pBaseBuf + pStack->disp; - pos_desc++; /* advance to the next data */ - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - continue; - } - // assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); - if( 0 != iov_len_local ) { - unsigned char* temp = conv_ptr; - /* We have some partial data here. 
Let's copy it into the convertor - * and keep it hot until the next round. - */ - // assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size ); - // COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor ); - // - // opal_unpack_partial_datatype( pConvertor, pElem, - // iov_ptr, 0, iov_len_local, - // &temp ); - // - // pConvertor->partial_length = (uint32_t)iov_len_local; - iov_len_local = 0; - } - goto complete_loop; - } - if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - // DO_DEBUG( opal_output( 0, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", - // (int)pStack->count, pConvertor->stack_pos, pos_desc, - // (long)pStack->disp, (unsigned long)iov_len_local ); ); - if (threadIdx.x == 0) { - (pStack->count)--; - } - __syncthreads(); - - if( pStack->count == 0 ) { /* end of loop */ - if( 0 == stack_pos ) { - /* Do the same thing as when the loop is completed */ - iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ - total_unpacked += iov[iov_count].iov_len; - iov_count++; /* go to the next */ - goto complete_conversion; - } - stack_pos--; - pStack--; - pos_desc++; - } else { - pos_desc = pStack->index + 1; - if (threadIdx.x == 0) { - if( pStack->index == -1 ) { - pStack->disp += (ub - lb); - } else { - //assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); - pStack->disp += description[pStack->index].loop.extent; - } - } - __syncthreads(); - } - conv_ptr = pBaseBuf + pStack->disp; - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - // DO_DEBUG( opal_output( 0, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", - // (int)pStack->count, pConvertor->stack_pos, pos_desc, - // (long)pStack->disp, (unsigned long)iov_len_local ); ); - } - if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { - OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; - if( pElem->loop.common.flags & 
OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - unpack_contiguous_loop_cuda_kernel( pElem, &count_desc, - &iov_ptr, &conv_ptr, &iov_len_local ); - count_desc = 0; - if( 0 == count_desc ) { /* completed */ - pos_desc += pElem->loop.items + 1; - goto update_loop_description; - } - /* Save the stack with the correct last_count value. */ - } - local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; - PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, - pStack->disp + local_disp); - pos_desc++; - update_loop_description: /* update the current state */ - conv_ptr = pBaseBuf + pStack->disp; - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); - continue; - } - } - complete_loop: - if (threadIdx.x == 0) { - iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ - } - __syncthreads(); - total_unpacked += iov[iov_count].iov_len; - } - complete_conversion: - if (tid == 0) { - cuda_desc->max_data = total_unpacked; - // pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ - cuda_desc->out_size = iov_count; - // if( pConvertor->bConverted == pConvertor->remote_size ) { - // pConvertor->flags |= CONVERTOR_COMPLETED; - // return 1; - // } - // /* Save the global position for the next round */ - // PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_UINT1, count_desc, - // conv_ptr - pConvertor->pBaseBuf ); - // DO_DEBUG( opal_output( 0, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", - // pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); - } -} - -__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist) +__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, _copy_count; 
- unsigned char *src, *dst; + size_t src_offset, dst_offset; uint8_t alignment; unsigned char *_source_tmp, *_destination_tmp; @@ -267,14 +21,14 @@ __global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src = cuda_iov_dist[blockIdx.x].src[i]; - dst = cuda_iov_dist[blockIdx.x].dst[i]; - _copy_count = cuda_iov_dist[blockIdx.x].nb_elements[i]; - alignment = cuda_iov_dist[blockIdx.x].element_alignment[i]; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; + _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; + alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; if (threadIdx.x < _copy_count) { - _source_tmp = src + threadIdx.x * alignment; - _destination_tmp = dst + threadIdx.x * alignment; + _source_tmp = source_base + src_offset + threadIdx.x * alignment; + _destination_tmp = destination_base + dst_offset + threadIdx.x * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index c268fe2fb94..52f9acccc09 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -7,108 +7,6 @@ #include #include -int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) -{ - uint32_t i; - dt_elem_desc_t* description; - const opal_datatype_t *pData = pConvertor->pDesc; - uint32_t tasks_per_block, num_blocks, thread_per_block; - dt_stack_t* pStack; - - return -99; - description = pConvertor->use_desc->desc; - - cuda_desc_h->stack_pos = pConvertor->stack_pos; -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - 
cuda_desc_h->pBaseBuf = pConvertor->pBaseBuf; -#else - cuda_desc_h->pBaseBuf = pBaseBuf_GPU; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - cuda_desc_h->lb = pData->lb; - cuda_desc_h->ub = pData->ub; - cuda_desc_h->out_size = *out_size; - cuda_desc_h->max_data = *max_data; - cuda_desc_h->bConverted = pConvertor->bConverted; - cuda_desc_h->local_size = pConvertor->local_size; - cuda_desc_h->stack_size = pConvertor->stack_size; - - for (i = 0; i < pConvertor->stack_size; i++) { - cuda_desc_h->pStack[i] = pConvertor->pStack[i]; - } - if (cuda_desc_h->description_max_count != 0) { - if (cuda_desc_h->description_max_count >= (pConvertor->use_desc->used+1)) { - cuda_desc_h->description_count = pConvertor->use_desc->used+1; - } else { - cudaFree(cuda_desc_h->description); - cuda_desc_h->description = NULL; - cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); - cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; - cuda_desc_h->description_count = pConvertor->use_desc->used+1; - } - - } else { - cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); - cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; - cuda_desc_h->description_count = pConvertor->use_desc->used+1; - } - cudaMemcpy(cuda_desc_h->description, description, sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1), cudaMemcpyHostToDevice); - - DBGPRINT("stack_size %d\n", pConvertor->stack_size); - - DBGPRINT("flags %d, types %d, count %d\n", description->elem.common.flags, description->elem.common.type, description->elem.count); - - for (i = 0; i < *out_size; i++) { -#if defined (OPAL_DATATYPE_CUDA_DRY_RUN) - cuda_desc_h->iov[i].iov_base = iov[i].iov_base; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - cuda_desc_h->iov[i].iov_len = iov[i].iov_len; - } - - cudaMemcpy(cuda_desc_d, cuda_desc_h, sizeof(ddt_cuda_desc_t), cudaMemcpyHostToDevice); - - pStack = pConvertor->pStack + 
pConvertor->stack_pos; - thread_per_block = CUDA_WARP_SIZE * 3; - tasks_per_block = thread_per_block * TASK_PER_THREAD; - num_blocks = ((uint32_t)pStack->count + tasks_per_block - 1) / tasks_per_block; - printf("launch unpack kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*thread_per_block); - opal_generic_simple_unpack_cuda_kernel<<<192, thread_per_block>>>(cuda_desc_d); -#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) - size_t position = pConvertor->pDesc->size; - opal_convertor_set_position_nocheck(pConvertor, &position); -#endif - cudaDeviceSynchronize(); - -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - return -99; -#else - // /* copy stack and description data back to CPU */ - // cudaMemcpy(cuda_desc_h, cuda_desc_d, sizeof(ddt_cuda_desc_t), cudaMemcpyDeviceToHost); - // - // for (i = 0; i < pConvertor->stack_size; i++) { - // pConvertor->pStack[i] = cuda_desc_h->pStack[i]; - // } - // - // pConvertor->stack_pos = cuda_desc_h->stack_pos; - // *out_size = cuda_desc_h->out_size; - // *max_data = cuda_desc_h->max_data; - // pConvertor->bConverted = cuda_desc_h->bConverted; - // pConvertor->local_size = cuda_desc_h->local_size; - // - // for (i = 0; i < *out_size; i++) { - // iov[i].iov_len = cuda_desc_h->iov[i].iov_len; - // } - // - if( pConvertor->bConverted == pConvertor->local_size ) { - // pConvertor->flags |= CONVERTOR_COMPLETED; - return 1; - } - - return 0; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ -} int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -305,7 +203,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "opal_convertor_generic_simple_unpack_vector( %p, {%p, %lu}, %u , %u)\n", (void*)pConvertor, iov[0].iov_base, (unsigned 
long)iov[0].iov_len, *out_size, *max_data ); ) description = pConvertor->use_desc->desc; @@ -322,7 +220,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv pConvertor->stack_pos--; pElem = &(description[pos_desc]); - DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack start pos_desc %d count_desc %d disp %ld\n" + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack start pos_desc %d count_desc %d disp %ld\n" "stack_pos %d pos_desc %d count_desc %d disp %ld\n", pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); @@ -351,7 +249,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); ); #endif iov_len_local = iov[iov_count].iov_len; if( 0 != pConvertor->partial_length ) { @@ -369,7 +267,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); if( --(pStack->count) == 0 ) { /* end of loop */ @@ -394,7 +292,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv } conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - DT_CUDA_DEBUG( opal_cuda_output( 2, 
"unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); } @@ -433,9 +331,9 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack total unpacked %lu\n", total_unpacked); ); if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Total unpacked %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; @@ -445,7 +343,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv /* Save the global position for the next round */ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, conv_ptr - pConvertor->pBaseBuf ); - DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } @@ -459,17 +357,17 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, dst_offset, residue_desc; uint32_t nb_blocks, thread_per_block; size_t length, buffer_size, length_per_iovec; - unsigned char *source, *source_tmp; + unsigned char *source, *source_base, *destination_base; size_t total_unpacked, total_converted; int32_t 
complete_flag = 0; uint8_t buffer_isfull = 0; uint8_t free_required = 0; uint32_t convertor_flags; - dt_elem_desc_t* description; - dt_elem_desc_t* pElem; - dt_stack_t* pStack; +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// dt_stack_t* pStack; uint8_t alignment, orig_alignment; - int32_t orig_stack_index; +// int32_t orig_stack_index; ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; @@ -482,18 +380,13 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start_total); #endif - - description = pConvertor->use_desc->desc; + +/* description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); - DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype UNpacking using iovec\n"); ); - - // double *vtmp = (double *)iov[0].iov_base; - // for (uint32_t i = 0; i < iov[0].iov_len/sizeof(double); i++) { - // printf(" %1.f ", *vtmp); - // vtmp ++; - // } - // printf("\n"); + printf("size elem %d, size %lu\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); +*/ + #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -506,26 +399,22 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert pConvertor->gpu_buffer_ptr = NULL; free_required = 0; } else { -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - source = (unsigned char*)iov[0].iov_base; -#else if (pConvertor->gpu_buffer_ptr == NULL) { pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); } source = pConvertor->gpu_buffer_ptr; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); free_required = 1; } } - source_tmp = source; - DT_CUDA_DEBUG ( opal_cuda_output(0, "UNpack GPU base %p, unpack from buffer %p, total size %ld\n", pConvertor->pBaseBuf, source, 
iov[0].iov_len); ); -#if defined(OPAL_DATATYPE_CUDA_TIMING) + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, iov[0].iov_len); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); move_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); #endif @@ -538,14 +427,15 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; - orig_stack_index = pStack->index; +// orig_stack_index = pStack->index; + source_base = source; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "UNPACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); #endif dst_offset = 0; @@ -557,8 +447,9 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert current_block = 0; task_iteration = 0; cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; - cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; - + cuda_iov_dist_d_current = 
cuda_iov_dist_d[cuda_streams->current_stream_id]; + destination_base = (unsigned char*)cuda_iov[0].iov_base; + #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -567,11 +458,12 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert } for (i = 0; i < cuda_iov_count; i++) { - pElem = &(description[orig_stack_index+i]); +// pElem = &(description[orig_stack_index+i]); if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { - orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; length_per_iovec = buffer_size / orig_alignment * orig_alignment; buffer_isfull = 1; } @@ -590,12 +482,11 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "UNPACKING description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[current_block].dst[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; - cuda_iov_dist_h_current[current_block].src[task_iteration] = source; - cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = alignment; - cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - 
source_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = thread_per_block;// * sizeof(double); } else { @@ -604,35 +495,25 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "UNPACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); - current_block += 1; - if (current_block >= nb_blocks) { - current_block = 0; - task_iteration ++; - assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); - } + source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; } /* handle residue */ if (residue_desc != 0) { - orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; - cuda_iov_dist_h_current[current_block].dst[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; - cuda_iov_dist_h_current[current_block].src[task_iteration] = source; - 
cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; - cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "UNPACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); - current_block += 1; - if (current_block >= nb_blocks) { - current_block = 0; - task_iteration ++; - assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); - } + source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, 
cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; } if (buffer_isfull) { @@ -643,11 +524,11 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: UNpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d\n", source_tmp, total_time, cuda_streams->current_stream_id); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d\n", source_base, total_time, cuda_streams->current_stream_id); ); #endif - - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); - opal_generic_simple_unpack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current); + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); + opal_generic_simple_unpack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); cuda_streams->current_stream_id ++; cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; @@ -663,13 +544,13 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert convertor_flags = pConvertor->flags; #endif convertor_flags = pConvertor->flags; - orig_stack_index = pStack->index; +// orig_stack_index = pStack->index; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(8, "UNPACKING complete flag %d, iov count %d, length %d, submit to 
CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack complete flag %d, iov count %d, length %d, submit to CUDA stream %d, nb_blocks %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id, nb_blocks_used); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); #endif } @@ -680,12 +561,12 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert iov[0].iov_len = total_unpacked; *max_data = total_unpacked; *out_size = 1; - DT_CUDA_DEBUG ( opal_cuda_output(0, "UNPACKING total unpacked %d\n", total_unpacked); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); total_time = ELAPSED_TIME( start_total, end_total ); - printf( "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); #endif if( pConvertor->bConverted == pConvertor->local_size ) { @@ -717,7 +598,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in unpack_contiguous_loop_cuda\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack using contiguous_loop_cuda\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); @@ -741,7 +622,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: vector unpacking in %ld microsec\n", total_time ); + 
DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector unpacking in %ld microsec\n", total_time ); ); #endif } @@ -763,7 +644,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in unpack_contiguous_loop_cuda_memcpy2d\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack using contiguous_loop_cuda_memcpy2d\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); @@ -784,7 +665,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: vector unpacking with memcpy2d in %ld microsec\n", total_time ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector unpacking with memcpy2d in %ld microsec\n", total_time ); ); #endif } @@ -807,7 +688,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in unpack_contiguous_loop_cuda_zerocopy\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack using contiguous_loop_cuda_zerocopy\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); @@ -838,7 +719,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: vector unpacking in %ld microsec\n", total_time ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector unpacking in %ld microsec\n", total_time ); ); #endif } diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c index ef7a8f41d27..095cd477dd3 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -45,15 +45,6 @@ void (*opal_datatype_cuda_init_p)(void) = NULL; void (*opal_datatype_cuda_fini_p)(void) = NULL; -int32_t 
(*opal_generic_simple_pack_function_cuda_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) = NULL; - -int32_t (*opal_generic_simple_unpack_function_cuda_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) = NULL; int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, @@ -95,8 +86,6 @@ void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, void (*opal_cuda_sync_device_p)(void) = NULL; -unsigned char* (*opal_cuda_get_gpu_pack_buffer_p)(void) = NULL; - void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id) = NULL; void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id) = NULL; @@ -129,8 +118,6 @@ int32_t opal_datatype_gpu_init(void) } OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_datatype_cuda_init ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_datatype_cuda_fini ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_unpack_function_cuda ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda_iov ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_unpack_function_cuda_iov ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda_vector ); @@ -139,12 +126,11 @@ int32_t opal_datatype_gpu_init(void) OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, unpack_contiguous_loop_cuda ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, pack_predefined_data_cuda ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_sync_device ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( 
opal_datatype_cuda_handle, opal_cuda_get_gpu_pack_buffer ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_free_gpu_buffer ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_malloc_gpu_buffer ); (*opal_datatype_cuda_init_p)(); - printf("cuda init done\n"); + opal_output( 0, "cuda init done\n"); } return OPAL_SUCCESS; } @@ -156,8 +142,6 @@ int32_t opal_datatype_gpu_fini(void) /* Reset all functions to NULL */ opal_datatype_cuda_init_p = NULL; opal_datatype_cuda_fini_p = NULL; - opal_generic_simple_pack_function_cuda_p = NULL; - opal_generic_simple_unpack_function_cuda_p = NULL; opal_generic_simple_pack_function_cuda_iov_p = NULL; opal_generic_simple_unpack_function_cuda_iov_p = NULL; opal_generic_simple_pack_function_cuda_vector_p = NULL; @@ -166,7 +150,6 @@ int32_t opal_datatype_gpu_fini(void) unpack_contiguous_loop_cuda_p = NULL; pack_predefined_data_cuda_p = NULL; opal_cuda_sync_device_p = NULL; - opal_cuda_get_gpu_pack_buffer_p = NULL; opal_cuda_free_gpu_buffer_p = NULL; opal_cuda_malloc_gpu_buffer_p = NULL; @@ -176,21 +159,7 @@ int32_t opal_datatype_gpu_fini(void) if( NULL != opal_datatype_cuda_lib ) free(opal_datatype_cuda_lib); opal_datatype_cuda_lib = NULL; - printf("cuda fini done\n"); + opal_output( 0, "cuda fini done\n"); } return OPAL_SUCCESS; } - -unsigned char* opal_datatype_get_gpu_buffer(void) -{ -#if OPAL_DATATYPE_CUDA_KERNEL - if (opal_datatype_gpu_init() != OPAL_SUCCESS) { - opal_datatype_gpu_fini(); - return NULL; - } - return (*opal_cuda_get_gpu_pack_buffer_p)(); -#else - return NULL; -#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ - -} diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h index 887c8a0918b..d50e2fe8d99 100644 --- a/opal/datatype/opal_datatype_gpu.h +++ b/opal/datatype/opal_datatype_gpu.h @@ -5,21 +5,10 @@ int32_t opal_datatype_gpu_init(void); int32_t opal_datatype_gpu_fini(void); -unsigned char* opal_datatype_get_gpu_buffer(void); 
extern void (*opal_datatype_cuda_init_p)(void); extern void (*opal_datatype_cuda_fini_p)(void); - -extern int32_t (*opal_generic_simple_pack_function_cuda_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - -extern int32_t (*opal_generic_simple_unpack_function_cuda_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); extern int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, @@ -61,8 +50,6 @@ extern void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, extern void (*opal_cuda_sync_device_p)(void); -extern unsigned char* (*opal_cuda_get_gpu_pack_buffer_p)(void); - extern void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); extern void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index dacc343ba84..2e7bee3279b 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -495,8 +495,8 @@ create_sm_endpoint(int local_proc, struct opal_proc_t *proc) NULL, &resources); for (int i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - ep->smcuda_dt_pack_clone[i].lindex = -1; - ep->smcuda_dt_unpack_clone[i].lindex = -1; + ep->smcuda_ddt_pack_clone[i].lindex = -1; + ep->smcuda_ddt_unpack_clone[i].lindex = -1; } } #endif /* OPAL_CUDA_SUPPORT */ @@ -1159,7 +1159,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, convertor->gpu_buffer_ptr = remote_memory_address; } if (pack_required) { - mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_dt_unpack_clone(ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, 0, lindex, remote_device, local_device); cuda_dt_hdr_t send_msg; send_msg.lindex = lindex; @@ -1208,7 +1208,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, send_msg.seq = 0; 
send_msg.msg_type = CUDA_PACK_TO_LOCAL_START; } - mca_btl_smcuda_cuda_dt_unpack_clone(NULL, ep, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_dt_unpack_clone(ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, 0, lindex, 0, 0); mca_btl_smcuda_send_cuda_pack_sig(btl, ep, &send_msg); done = 0; @@ -1367,7 +1367,7 @@ int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t { int i; for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - if (endpoint->smcuda_dt_pack_clone[i].convertor == convertor) { + if (endpoint->smcuda_ddt_pack_clone[i].convertor == convertor) { return i; } } @@ -1376,7 +1376,7 @@ int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq) { - endpoint->smcuda_dt_pack_clone[lindex].seq = seq; + endpoint->smcuda_ddt_pack_clone[lindex].seq = seq; return 0; } @@ -1385,7 +1385,7 @@ int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint if (lindex >= SMCUDA_DT_CLONE_SIZE) { return -9; } else { - return endpoint->smcuda_dt_pack_clone[lindex].seq; + return endpoint->smcuda_ddt_pack_clone[lindex].seq; } } @@ -1394,7 +1394,7 @@ int mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(struct mca_btl_base_endpoint_t if (lindex >= SMCUDA_DT_CLONE_SIZE) { return -9; } else { - return endpoint->smcuda_dt_pack_clone[lindex].pipeline_size; + return endpoint->smcuda_ddt_pack_clone[lindex].pipeline_size; } } @@ -1402,7 +1402,7 @@ int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endp { int i; for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - if (endpoint->smcuda_dt_pack_clone[i].lindex == -1) { + if (endpoint->smcuda_ddt_pack_clone[i].lindex == -1) { return i; } } @@ -1412,7 +1412,7 @@ int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *en { int i; for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - if 
(endpoint->smcuda_dt_unpack_clone[i].lindex == -1) { + if (endpoint->smcuda_ddt_unpack_clone[i].lindex == -1) { return i; } } @@ -1421,51 +1421,47 @@ int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *en void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) { - assert(endpoint->smcuda_dt_pack_clone[lindex].lindex == lindex); - endpoint->smcuda_dt_pack_clone[lindex].lindex = -1; + assert(endpoint->smcuda_ddt_pack_clone[lindex].lindex == lindex); + endpoint->smcuda_ddt_pack_clone[lindex].lindex = -1; } void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) { - assert(endpoint->smcuda_dt_unpack_clone[lindex].lindex == lindex); - endpoint->smcuda_dt_unpack_clone[lindex].lindex = -1; + assert(endpoint->smcuda_ddt_unpack_clone[lindex].lindex == lindex); + endpoint->smcuda_ddt_unpack_clone[lindex].lindex = -1; } -void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, - struct mca_btl_base_endpoint_t *endpoint, +void mca_btl_smcuda_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, size_t pipeline_size, int lindex, uint8_t remote_device, uint8_t local_device) { - endpoint->smcuda_dt_pack_clone[lindex].convertor = convertor; - // endpoint->smcuda_dt_pack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; - endpoint->smcuda_dt_pack_clone[lindex].endpoint = endpoint; - endpoint->smcuda_dt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_dt_pack_clone[lindex].pipeline_size = pipeline_size; - endpoint->smcuda_dt_pack_clone[lindex].lindex = lindex; - endpoint->smcuda_dt_pack_clone[lindex].seq = -9; - endpoint->smcuda_dt_pack_clone[lindex].remote_device = remote_device; - endpoint->smcuda_dt_pack_clone[lindex].local_device = local_device; - endpoint->smcuda_dt_pack_clone[lindex].frag = frag; + 
endpoint->smcuda_ddt_pack_clone[lindex].convertor = convertor; + endpoint->smcuda_ddt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; + endpoint->smcuda_ddt_pack_clone[lindex].pipeline_size = pipeline_size; + endpoint->smcuda_ddt_pack_clone[lindex].lindex = lindex; + endpoint->smcuda_ddt_pack_clone[lindex].seq = -9; + endpoint->smcuda_ddt_pack_clone[lindex].remote_device = remote_device; + endpoint->smcuda_ddt_pack_clone[lindex].local_device = local_device; + endpoint->smcuda_ddt_pack_clone[lindex].frag = frag; } -void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, - struct mca_btl_base_endpoint_t *endpoint, +void mca_btl_smcuda_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, size_t pipeline_size, int lindex, uint8_t remote_device, uint8_t local_device) { - endpoint->smcuda_dt_unpack_clone[lindex].convertor = convertor; -// endpoint->smcuda_dt_unpack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; - endpoint->smcuda_dt_unpack_clone[lindex].endpoint = endpoint; - endpoint->smcuda_dt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_dt_unpack_clone[lindex].pipeline_size = pipeline_size; - endpoint->smcuda_dt_unpack_clone[lindex].lindex = lindex; - endpoint->smcuda_dt_unpack_clone[lindex].seq = -9; - endpoint->smcuda_dt_unpack_clone[lindex].remote_device = remote_device; - endpoint->smcuda_dt_unpack_clone[lindex].local_device = local_device; - endpoint->smcuda_dt_unpack_clone[lindex].frag = frag; + endpoint->smcuda_ddt_unpack_clone[lindex].convertor = convertor; + endpoint->smcuda_ddt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; + endpoint->smcuda_ddt_unpack_clone[lindex].pipeline_size = pipeline_size; + endpoint->smcuda_ddt_unpack_clone[lindex].lindex = lindex; + endpoint->smcuda_ddt_unpack_clone[lindex].seq = -9; + endpoint->smcuda_ddt_unpack_clone[lindex].remote_device 
= remote_device; + endpoint->smcuda_ddt_unpack_clone[lindex].local_device = local_device; + endpoint->smcuda_ddt_unpack_clone[lindex].frag = frag; } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index abd043f9f10..f9171ec8962 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -207,7 +207,7 @@ struct mca_btl_smcuda_component_t { int cuda_ipc_output; int use_cuda_ipc; int use_cuda_ipc_same_gpu; - int cuda_dt_pipeline_size; + int cuda_ddt_pipeline_size; #endif /* OPAL_CUDA_SUPPORT */ }; typedef struct mca_btl_smcuda_component_t mca_btl_smcuda_component_t; @@ -534,7 +534,6 @@ typedef struct { /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; - struct mca_btl_base_endpoint_t *endpoint; void *remote_gpu_address; size_t pipeline_size; int lindex; @@ -542,10 +541,10 @@ typedef struct { uint8_t remote_device; uint8_t local_device; mca_btl_base_descriptor_t *frag; -} cuda_dt_clone_t; +} cuda_ddt_clone_t; #define SMCUDA_DT_CLONE_SIZE 20 -extern cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; +extern cuda_ddt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_dt_hdr_t *send_msg); int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_dt_hdr_t *send_msg); @@ -557,14 +556,14 @@ int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endp int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint); void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); -void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, - struct 
mca_btl_base_endpoint_t *endpoint, +void mca_btl_smcuda_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, size_t pipeline_size, int lindex, uint8_t remote_device, uint8_t local_device); -void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, - struct mca_btl_base_endpoint_t *endpoint, +void mca_btl_smcuda_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, size_t pipeline_size, diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index de772340fa0..ee25fabd4e5 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -167,7 +167,7 @@ static int smcuda_register(void) mca_btl_smcuda_param_register_int("use_cuda_ipc", 1, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.use_cuda_ipc); mca_btl_smcuda_param_register_int("use_cuda_ipc_same_gpu", 1, OPAL_INFO_LVL_4,&mca_btl_smcuda_component.use_cuda_ipc_same_gpu); mca_btl_smcuda_param_register_int("cuda_ipc_verbose", 0, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ipc_verbose); - mca_btl_smcuda_param_register_int("cuda_ddt_pipeline_size", 1024*1024*400, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_dt_pipeline_size); + mca_btl_smcuda_param_register_int("cuda_ddt_pipeline_size", 1024*1024*400, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ddt_pipeline_size); mca_btl_smcuda_component.cuda_ipc_output = opal_output_open(NULL); opal_output_set_verbosity(mca_btl_smcuda_component.cuda_ipc_output, mca_btl_smcuda_component.cuda_ipc_verbose); #else /* OPAL_CUDA_SUPPORT */ @@ -861,14 +861,14 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, size_t packed_size = cuda_dt_hdr.packed_size; int msg_type = cuda_dt_hdr.msg_type; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; - 
cuda_dt_clone_t *my_cuda_dt_clone; + cuda_ddt_clone_t *my_cuda_dt_clone; /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; - my_cuda_dt_clone = &endpoint->smcuda_dt_unpack_clone[lindex]; + my_cuda_dt_clone = &endpoint->smcuda_ddt_unpack_clone[lindex]; assert(my_cuda_dt_clone->lindex == lindex); - printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); + printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", endpoint->my_smp_rank, seq, lindex); cuda_dt_hdr_t send_msg; send_msg.lindex = lindex; @@ -937,7 +937,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, int msg_type = cuda_dt_hdr.msg_type; size_t packed_size = cuda_dt_hdr.packed_size; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; - cuda_dt_clone_t *my_cuda_dt_clone; + cuda_ddt_clone_t *my_cuda_dt_clone; cuda_dt_hdr_t send_msg; uint32_t iov_count = 1; @@ -946,9 +946,9 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; - my_cuda_dt_clone = &endpoint->smcuda_dt_pack_clone[lindex]; + my_cuda_dt_clone = &endpoint->smcuda_ddt_pack_clone[lindex]; - printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); + printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", endpoint->my_smp_rank, seq, lindex); struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; send_msg.lindex = lindex; if (msg_type == CUDA_PACK_COMPLETE_ACK) { @@ -1000,7 +1000,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, send_msg.msg_type = CUDA_UNPACK_FROM_SEQ; } struct iovec iov; - packed_size = mca_btl_smcuda_component.cuda_dt_pipeline_size; + packed_size = 
mca_btl_smcuda_component.cuda_ddt_pipeline_size; printf("Pipeline_size %ld\n", packed_size); iov.iov_base = convertor->gpu_buffer_ptr; iov.iov_len = packed_size; diff --git a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h index e4df5ee56d0..f3b79866c14 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h +++ b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h @@ -49,8 +49,8 @@ struct mca_btl_base_endpoint_t { opal_proc_t *proc_opal; /**< Needed for adding CUDA IPC support dynamically */ enum ipcState ipcstate; /**< CUDA IPC connection status */ int ipctries; /**< Number of times CUDA IPC connect was sent */ - cuda_dt_clone_t smcuda_dt_pack_clone[SMCUDA_DT_CLONE_SIZE]; - cuda_dt_clone_t smcuda_dt_unpack_clone[SMCUDA_DT_CLONE_SIZE]; + cuda_ddt_clone_t smcuda_ddt_pack_clone[SMCUDA_DT_CLONE_SIZE]; + cuda_ddt_clone_t smcuda_ddt_unpack_clone[SMCUDA_DT_CLONE_SIZE]; #endif /* OPAL_CUDA_SUPPORT */ }; diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 2d25274ee9b..92bdf644d4d 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1211,12 +1211,12 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 500; mat_size <= 6000; mat_size +=500) { + for (mat_size = 500; mat_size <= 500; mat_size +=500) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { - // local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); + local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From c451a4a22616eb7790f750236fc9b54137f18cd7 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Sun, 25 Oct 2015 18:54:31 -0400 Subject: [PATCH 031/190] rewrite pipeline --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 6 +- .../cuda/opal_datatype_cuda_internal.cuh | 2 +- 
.../cuda/opal_datatype_pack_cuda_wrapper.cu | 2 + .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 2 + opal/mca/btl/smcuda/btl_smcuda.c | 69 ++++------ opal/mca/btl/smcuda/btl_smcuda.h | 53 ++++---- opal/mca/btl/smcuda/btl_smcuda_component.c | 127 +++++++----------- 7 files changed, 114 insertions(+), 147 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 34a56f3c18b..a8a4d893ecf 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -117,8 +117,10 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); unsigned char *base; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); + // base = opal_cuda_malloc_gpu_buffer_p(4000000*4, 0); base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); convertor->gpu_buffer_ptr = base; + convertor->gpu_buffer_size = 4000000*4;//convertor->local_size; convertor->gpu_buffer_size = convertor->local_size; sendreq->req_send.req_bytes_packed = convertor->local_size; printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); @@ -128,7 +130,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, sendreq->req_send.req_bytes_packed, sendreq->req_rdma))) { - int lindex = mca_btl_smcuda_alloc_cuda_dt_pack_clone(bml_btl->btl_endpoint); + int lindex = mca_btl_smcuda_alloc_cuda_ddt_pack_clone(bml_btl->btl_endpoint); assert(lindex >= 0); rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { @@ -136,7 +138,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, return rc; } mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, lindex, 1, local_device); - mca_btl_smcuda_cuda_dt_pack_clone( bml_btl->btl_endpoint, convertor, NULL, NULL, 0, lindex, 0, local_device); + mca_btl_smcuda_cuda_ddt_pack_clone( 
bml_btl->btl_endpoint, convertor, NULL, NULL, lindex, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 160d54336d4..268554126ab 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -13,7 +13,7 @@ #define OPAL_DATATYPE_CUDA_DEBUG 1 //#define OPAL_DATATYPE_CUDA_KERNEL_TIME #define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 2 -#define OPAL_DATATYPE_CUDA_TIMING +//#define OPAL_DATATYPE_CUDA_TIMING #define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D 0 #define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 #define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index efc0c7af957..9ee6fc0f032 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -839,9 +839,11 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor /* buffer is full */ if (buffer_isfull) { + size_t total_converted_tmp = total_converted; pConvertor->flags = convertor_flags; total_converted += total_packed; opal_convertor_set_position_nocheck(pConvertor, &total_converted); + total_packed = total_converted - total_converted_tmp; break; } #if defined(OPAL_DATATYPE_CUDA_TIMING) diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 52f9acccc09..ba8a89e88cb 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -534,9 +534,11 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert /* buffer is full */ if (buffer_isfull) { + size_t total_converted_tmp = total_converted; pConvertor->flags = 
convertor_flags; total_converted += total_unpacked; opal_convertor_set_position_nocheck(pConvertor, &total_converted); + total_unpacked = total_converted - total_converted_tmp; break; } #if defined(OPAL_DATATYPE_CUDA_TIMING) diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 2e7bee3279b..14d0a3995ce 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1158,10 +1158,10 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, } else { convertor->gpu_buffer_ptr = remote_memory_address; } + cuda_ddt_hdr_t send_msg; if (pack_required) { - mca_btl_smcuda_cuda_dt_unpack_clone(ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, - 0, lindex, remote_device, local_device); - cuda_dt_hdr_t send_msg; + mca_btl_smcuda_cuda_ddt_unpack_clone(ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); send_msg.lindex = lindex; send_msg.packed_size = 0; send_msg.seq = 0; @@ -1188,13 +1188,13 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, } else { printf("RECEIVE REGT CONTIGUOUS, size %ld !!!!!!!!!!!\n", size); recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; + cuda_ddt_hdr_t send_msg; if (pack_required) { - cuda_dt_hdr_t send_msg; send_msg.lindex = lindex; send_msg.packed_size = 0; if (remote_device == local_device && OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { /* now we are able to let sender pack directly to my memory */ - mca_mpool_common_cuda_reg_t loc_reg; + /* mca_mpool_common_cuda_reg_t loc_reg; mca_mpool_common_cuda_reg_t *loc_reg_ptr = &loc_reg; cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL); memcpy(send_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); @@ -1203,13 +1203,13 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, send_msg.remote_address = local_address; send_msg.remote_base = 
loc_reg.base.base; mca_common_wait_stream_synchronize(&loc_reg); - printf("send r_addr %p, r_base %p\n", local_address, loc_reg.base.base); + printf("send r_addr %p, r_base %p\n", local_address, loc_reg.base.base);*/ } else { send_msg.seq = 0; send_msg.msg_type = CUDA_PACK_TO_LOCAL_START; } - mca_btl_smcuda_cuda_dt_unpack_clone(ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, - 0, lindex, 0, 0); + mca_btl_smcuda_cuda_ddt_unpack_clone(ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, 0, 0); mca_btl_smcuda_send_cuda_pack_sig(btl, ep, &send_msg); done = 0; } else { @@ -1319,11 +1319,11 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - cuda_dt_hdr_t *send_msg) + cuda_ddt_hdr_t *send_msg) { mca_btl_smcuda_frag_t* frag; int rc; - cuda_dt_hdr_t cuda_dt_hdr; + cuda_ddt_hdr_t cuda_dt_hdr; /* allocate a fragment, giving up if we can't get one */ MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); @@ -1334,7 +1334,7 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, /* Fill in fragment fields. 
*/ frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; - memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_dt_hdr_t)); + memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_ddt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); printf("######## rank %d, send seq %d, endpoint %p\n", endpoint->my_smp_rank, send_msg->seq, endpoint); @@ -1343,11 +1343,11 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - cuda_dt_hdr_t *send_msg) + cuda_ddt_hdr_t *send_msg) { mca_btl_smcuda_frag_t* frag; int rc; - cuda_dt_hdr_t cuda_dt_hdr; + cuda_ddt_hdr_t cuda_dt_hdr; /* allocate a fragment, giving up if we can't get one */ MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); @@ -1357,7 +1357,7 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, /* Fill in fragment fields. 
*/ frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; - memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_dt_hdr_t)); + memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_ddt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_PACK); return rc; @@ -1389,16 +1389,7 @@ int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint } } -int mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(struct mca_btl_base_endpoint_t *endpoint, int lindex) -{ - if (lindex >= SMCUDA_DT_CLONE_SIZE) { - return -9; - } else { - return endpoint->smcuda_ddt_pack_clone[lindex].pipeline_size; - } -} - -int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint) +int mca_btl_smcuda_alloc_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint) { int i; for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { @@ -1408,7 +1399,7 @@ int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endp } return -1; } -int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint) +int mca_btl_smcuda_alloc_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint) { int i; for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { @@ -1419,27 +1410,25 @@ int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *en return -1; } -void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) +void mca_btl_smcuda_free_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) { assert(endpoint->smcuda_ddt_pack_clone[lindex].lindex == lindex); endpoint->smcuda_ddt_pack_clone[lindex].lindex = -1; } -void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) +void mca_btl_smcuda_free_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) { assert(endpoint->smcuda_ddt_unpack_clone[lindex].lindex == lindex); 
endpoint->smcuda_ddt_unpack_clone[lindex].lindex = -1; } -void mca_btl_smcuda_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - size_t pipeline_size, - int lindex, uint8_t remote_device, uint8_t local_device) +void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_ddt_pack_clone[lindex].convertor = convertor; endpoint->smcuda_ddt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_ddt_pack_clone[lindex].pipeline_size = pipeline_size; endpoint->smcuda_ddt_pack_clone[lindex].lindex = lindex; endpoint->smcuda_ddt_pack_clone[lindex].seq = -9; endpoint->smcuda_ddt_pack_clone[lindex].remote_device = remote_device; @@ -1447,16 +1436,14 @@ void mca_btl_smcuda_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, endpoint->smcuda_ddt_pack_clone[lindex].frag = frag; } -void mca_btl_smcuda_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - size_t pipeline_size, - int lindex, uint8_t remote_device, uint8_t local_device) +void mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_ddt_unpack_clone[lindex].convertor = convertor; endpoint->smcuda_ddt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_ddt_unpack_clone[lindex].pipeline_size = pipeline_size; endpoint->smcuda_ddt_unpack_clone[lindex].lindex = lindex; endpoint->smcuda_ddt_unpack_clone[lindex].seq = -9; 
endpoint->smcuda_ddt_unpack_clone[lindex].remote_device = remote_device; diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index f9171ec8962..46ae97b3909 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -511,31 +511,34 @@ enum ipcState { IPC_BAD }; -/* cuda datatype control message */ +/* cuda datatype pack/unpack message */ typedef struct { int seq; int msg_type; int lindex; int packed_size; +} cuda_ddt_hdr_t; + +/* cuda datatype put message */ +typedef struct { void *remote_address; void *remote_base; uint64_t mem_handle[8]; -} cuda_dt_hdr_t; +} cuda_ddt_put_hdr_t; -#define CUDA_UNPACK_FROM_SEQ 0 -#define CUDA_PACK_COMPLETE 1 -#define CUDA_PACK_COMPLETE_ACK 2 -#define CUDA_PACK_CLEANUP 3 +#define CUDA_DDT_UNPACK_FROM_BLOCK 0 +#define CUDA_DDT_COMPLETE 1 +#define CUDA_DDT_COMPLETE_ACK 2 +#define CUDA_DDT_CLEANUP 3 #define CUDA_PACK_TO_LOCAL_START 4 #define CUDA_PACK_TO_REMOTE_START 5 -#define CUDA_PACK_TO_SEQ 6 +#define CUDA_DDT_PACK_TO_BLOCK 6 #define CUDA_UNPACK_NO 7 /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; void *remote_gpu_address; - size_t pipeline_size; int lindex; int seq; uint8_t remote_device; @@ -546,28 +549,26 @@ typedef struct { #define SMCUDA_DT_CLONE_SIZE 20 extern cuda_ddt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; -int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_dt_hdr_t *send_msg); -int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_dt_hdr_t *send_msg); +int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_hdr_t *send_msg); +int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_hdr_t *send_msg); int 
mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor); int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq); int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex); int mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(struct mca_btl_base_endpoint_t *endpoint, int lindex); -int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint); -int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint); -void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); -void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); -void mca_btl_smcuda_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - size_t pipeline_size, - int lindex, uint8_t remote_device, uint8_t local_device); -void mca_btl_smcuda_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - size_t pipeline_size, - int lindex, uint8_t remote_device, uint8_t local_device); +int mca_btl_smcuda_alloc_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint); +int mca_btl_smcuda_alloc_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint); +void mca_btl_smcuda_free_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); +void mca_btl_smcuda_free_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); +void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, uint8_t remote_device, uint8_t local_device); +void 
mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, uint8_t remote_device, uint8_t local_device); #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index ee25fabd4e5..4f46b8a5beb 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -853,13 +853,13 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, mca_btl_base_descriptor_t* des, void* cbdata) { struct mca_btl_base_endpoint_t *endpoint; - cuda_dt_hdr_t cuda_dt_hdr; + cuda_ddt_hdr_t recv_msg; mca_btl_base_segment_t* segments = des->des_segments; - memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); - int seq = cuda_dt_hdr.seq; - int lindex = cuda_dt_hdr.lindex; - size_t packed_size = cuda_dt_hdr.packed_size; - int msg_type = cuda_dt_hdr.msg_type; + memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_hdr_t)); + int seq = recv_msg.seq; + int lindex = recv_msg.lindex; + size_t packed_size = recv_msg.packed_size; + int msg_type = recv_msg.msg_type; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_ddt_clone_t *my_cuda_dt_clone; @@ -869,29 +869,21 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, assert(my_cuda_dt_clone->lindex == lindex); printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", endpoint->my_smp_rank, seq, lindex); - cuda_dt_hdr_t send_msg; + cuda_ddt_hdr_t send_msg; send_msg.lindex = lindex; - if (msg_type == CUDA_PACK_CLEANUP) { + if (msg_type == CUDA_DDT_CLEANUP) { mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t) frag_recv->base.des_cbfunc; cbfunc (btl, endpoint, frag_recv->segment.seg_addr.pval, frag_recv->local_handle, 
frag_recv->base.des_context, frag_recv->base.des_cbdata, OPAL_SUCCESS); mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag_recv); - mca_btl_smcuda_free_cuda_dt_unpack_clone(endpoint, lindex); - } else if (msg_type == CUDA_PACK_COMPLETE) { - send_msg.packed_size = 0; - send_msg.seq = -1; - send_msg.msg_type = CUDA_PACK_COMPLETE_ACK; - mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); - } else if (msg_type == CUDA_UNPACK_FROM_SEQ){ + mca_btl_smcuda_free_cuda_ddt_unpack_clone(endpoint, lindex); + } else if (msg_type == CUDA_DDT_UNPACK_FROM_BLOCK || msg_type == CUDA_DDT_COMPLETE){ struct iovec iov; uint32_t iov_count = 1; size_t max_data; struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; - if (my_cuda_dt_clone->pipeline_size == 0) { - my_cuda_dt_clone->pipeline_size = packed_size; - } - size_t pipeline_size = my_cuda_dt_clone->pipeline_size; + size_t pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; if (convertor == NULL) { /* do not unpack */ mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; unsigned char *local_address = (unsigned char*)frag_recv->segment.seg_addr.pval; @@ -899,7 +891,7 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, mca_common_cuda_memp2pcpy(local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); } else { /* unpack */ if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { - convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(pipeline_size, 0); + convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(packed_size, 0); mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); iov.iov_base = convertor->gpu_buffer_ptr; printf("start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, packed_size); @@ 
-918,7 +910,11 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, } send_msg.seq = seq; send_msg.packed_size = packed_size; - send_msg.msg_type = CUDA_PACK_TO_SEQ; + if (msg_type == CUDA_DDT_COMPLETE) { + send_msg.msg_type = CUDA_DDT_COMPLETE_ACK; + } else { + send_msg.msg_type = CUDA_DDT_PACK_TO_BLOCK; + } mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); } // MCA_BTL_SMCUDA_FRAG_RETURN(frag); @@ -929,19 +925,19 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, mca_btl_base_descriptor_t* des, void* cbdata) { struct mca_btl_base_endpoint_t *endpoint; - cuda_dt_hdr_t cuda_dt_hdr; + cuda_ddt_hdr_t recv_msg; mca_btl_base_segment_t* segments = des->des_segments; - memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); - int seq = cuda_dt_hdr.seq; - int lindex = cuda_dt_hdr.lindex; - int msg_type = cuda_dt_hdr.msg_type; - size_t packed_size = cuda_dt_hdr.packed_size; + memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_hdr_t)); + int seq = recv_msg.seq; + int lindex = recv_msg.lindex; + int msg_type = recv_msg.msg_type; + size_t packed_size = recv_msg.packed_size; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_ddt_clone_t *my_cuda_dt_clone; - cuda_dt_hdr_t send_msg; + cuda_ddt_hdr_t send_msg; uint32_t iov_count = 1; - int rc_dt = 0; + int rv_dt = 0; size_t max_data = 0; /* We can find the endoint back from the rank embedded in the header */ @@ -951,39 +947,37 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", endpoint->my_smp_rank, seq, lindex); struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; send_msg.lindex = lindex; - if (msg_type == CUDA_PACK_COMPLETE_ACK) { + if (msg_type == CUDA_DDT_COMPLETE_ACK) { send_msg.packed_size = 0; send_msg.seq = -2; - send_msg.msg_type = CUDA_PACK_CLEANUP; + send_msg.msg_type = CUDA_DDT_CLEANUP; mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, 
&send_msg); if (convertor->gpu_buffer_ptr != NULL) { opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); convertor->gpu_buffer_ptr = NULL; } - mca_btl_smcuda_free_cuda_dt_pack_clone(endpoint, lindex); - } else if (msg_type == CUDA_PACK_TO_SEQ) { - printf("i receive a message pack to seq, packed %ld, pipeline_size %ld\n", convertor->bConverted, my_cuda_dt_clone->pipeline_size); + mca_btl_smcuda_free_cuda_ddt_pack_clone(endpoint, lindex); + } else if (msg_type == CUDA_DDT_PACK_TO_BLOCK) { + printf("i receive a message pack to seq, packed %ld, pipeline_size %ld\n", convertor->bConverted, mca_btl_smcuda_component.cuda_ddt_pipeline_size); if (convertor->bConverted < convertor->local_size) { struct iovec iov; - iov.iov_base = convertor->gpu_buffer_ptr + seq*my_cuda_dt_clone->pipeline_size; + iov.iov_base = convertor->gpu_buffer_ptr + seq * mca_btl_smcuda_component.cuda_ddt_pipeline_size;; iov.iov_len = packed_size; - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); packed_size = max_data; send_msg.packed_size = packed_size; send_msg.seq = seq; - send_msg.msg_type = CUDA_UNPACK_FROM_SEQ; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); - if (rc_dt == 1) { - send_msg.packed_size = 0; - send_msg.seq = -1; - send_msg.msg_type = CUDA_PACK_COMPLETE; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + if (rv_dt == 1) { + send_msg.msg_type = CUDA_DDT_COMPLETE; + } else { + send_msg.msg_type = CUDA_DDT_UNPACK_FROM_BLOCK; } + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); } } else { mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; if (msg_type == CUDA_PACK_TO_REMOTE_START) { /* receiver is contiguous, and ask me to pack directly to his gpu memory */ - opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); +/* opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); mca_mpool_common_cuda_reg_t rget_reg; rget_reg_ptr= &rget_reg; 
memset(&rget_reg, 0, sizeof(rget_reg)); @@ -995,48 +989,27 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, convertor->gpu_buffer_ptr = remote_memory_address; printf("remote_memory_address $$$$$$ %p, r_addr %p, r_base %p\n", remote_memory_address, cuda_dt_hdr.remote_address, cuda_dt_hdr.remote_base); send_msg.msg_type = CUDA_UNPACK_NO; - convertor->gpu_buffer_size = convertor->local_size; + convertor->gpu_buffer_size = convertor->local_size;*/ } else { - send_msg.msg_type = CUDA_UNPACK_FROM_SEQ; + send_msg.msg_type = CUDA_DDT_UNPACK_FROM_BLOCK; } struct iovec iov; packed_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; printf("Pipeline_size %ld\n", packed_size); + iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; iov.iov_base = convertor->gpu_buffer_ptr; - iov.iov_len = packed_size; - max_data = 0; seq = 0; - /* the first pack here is used to get the correct size of pipeline_size */ - /* because pack may not use the whole pipeline size */ - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - packed_size = max_data; - iov.iov_base += packed_size; - /* save pipeline size */ - my_cuda_dt_clone->pipeline_size = packed_size; - convertor->gpu_buffer_size -= packed_size; - send_msg.packed_size = packed_size; - send_msg.seq = seq; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); - while (rc_dt != 1 && convertor->gpu_buffer_size > 0) { - if (convertor->gpu_buffer_size < packed_size) { - packed_size = convertor->gpu_buffer_size; - } - iov.iov_len = packed_size; - seq ++; - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - packed_size = max_data; - iov.iov_base += packed_size; - convertor->gpu_buffer_size -= packed_size; - send_msg.packed_size = packed_size; + while (rv_dt != 1 && convertor->gpu_buffer_size > 0) { + rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + iov.iov_base += mca_btl_smcuda_component.cuda_ddt_pipeline_size; + convertor->gpu_buffer_size 
-= mca_btl_smcuda_component.cuda_ddt_pipeline_size; + send_msg.packed_size = max_data; send_msg.seq = seq; + if (rv_dt == 1) { + send_msg.msg_type = CUDA_DDT_COMPLETE; + } mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); - } - - if (rc_dt == 1) { - send_msg.packed_size = 0; - send_msg.seq = -1; - send_msg.msg_type = CUDA_PACK_COMPLETE; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + seq ++; } if (rget_reg_ptr != NULL) { /* close memhandle */ From fa331f8b56feedf4bffe567c16b469eb734ddbd2 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 26 Oct 2015 17:02:33 -0400 Subject: [PATCH 032/190] s up and running. PUT size in an MCA parameters. Conflicts: opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu Conflicts: opal/mca/btl/btl.h --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 17 +-- opal/datatype/cuda/opal_datatype_cuda.cu | 11 ++ opal/datatype/cuda/opal_datatype_cuda.cuh | 4 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 3 + .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 3 + opal/datatype/opal_datatype_gpu.c | 8 ++ opal/datatype/opal_datatype_gpu.h | 4 + opal/mca/btl/btl.h | 3 + opal/mca/btl/smcuda/btl_smcuda.c | 115 +++++++++--------- opal/mca/btl/smcuda/btl_smcuda.h | 17 +-- opal/mca/btl/smcuda/btl_smcuda_component.c | 98 ++++++++++----- opal/mca/common/cuda/common_cuda.c | 1 - 12 files changed, 177 insertions(+), 107 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index a8a4d893ecf..2550b4c93b1 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -114,16 +114,19 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; mca_bml_base_btl_t* bml_endpoint_btl = mca_bml_base_btl_array_get_index(&(sendreq->req_endpoint->btl_send), 0); if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && CUDA_DDT_WITH_RDMA) { - printf("GPU data ready for 
GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); unsigned char *base; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); - // base = opal_cuda_malloc_gpu_buffer_p(4000000*4, 0); - base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); + size_t buffer_size = 0; + if (convertor->local_size > bml_btl->btl->btl_cuda_ddt_pipeline_size) { + buffer_size = bml_btl->btl->btl_cuda_ddt_pipeline_size * bml_btl->btl->btl_cuda_ddt_pipeline_depth; + } else { + buffer_size = convertor->local_size; + } + base = opal_cuda_malloc_gpu_buffer_p(buffer_size, 0); convertor->gpu_buffer_ptr = base; - convertor->gpu_buffer_size = 4000000*4;//convertor->local_size; - convertor->gpu_buffer_size = convertor->local_size; + convertor->gpu_buffer_size = buffer_size; sendreq->req_send.req_bytes_packed = convertor->local_size; - printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); + opal_output(0, "malloc GPU BUFFER %p for pack, local size %lu, pipeline size %lu, depth %d\n", base, convertor->local_size, bml_btl->btl->btl_cuda_ddt_pipeline_size, bml_btl->btl->btl_cuda_ddt_pipeline_depth); if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( sendreq->req_endpoint, base, @@ -232,7 +235,7 @@ int mca_pml_ob1_rdma_cuda_btl_register_data( // mca_common_cuda_geteventhandle(&convertor->pipeline_event[j], j, (mca_mpool_base_registration_t *)cuda_reg); // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); // } - cuda_reg->data.pipeline_size = pipeline_size; + // cuda_reg->data.pipeline_size = pipeline_size; cuda_reg->data.lindex = lindex; cuda_reg->data.pack_required = pack_required; cuda_reg->data.gpu_device = gpu_device; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index bce80b4a592..0f0d52d558b 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -353,6 +353,17 @@ void 
opal_cuda_free_gpu_buffer(void *addr, int gpu_id) DT_CUDA_DEBUG( opal_cuda_output( 2, "Free GPU buffer %p.\n", addr); ); } +void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) +{ + cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); +} + +void opal_cuda_d2dcpy(void* dst, const void* src, size_t count) +{ + cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); +} + void opal_dump_cuda_list(ddt_cuda_list_t *list) { ddt_cuda_buffer_t *ptr = NULL; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 94336ac6475..d71d349d46b 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -91,6 +91,10 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); +void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count); + +void opal_cuda_d2dcpy(void* dst, const void* src, size_t count); + void opal_dump_cuda_list(ddt_cuda_list_t *list); } diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 9ee6fc0f032..381aaf99ae8 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -300,6 +300,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert iov_ptr = pConvertor->gpu_buffer_ptr; } } + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ @@ -713,6 +714,8 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; destination_base = destination; + + 
cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start_total); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index ba8a89e88cb..4ee73897f68 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -251,6 +251,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); ); #endif + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); iov_len_local = iov[iov_count].iov_len; if( 0 != pConvertor->partial_length ) { /* not support yet */ @@ -377,6 +378,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert long total_time, move_time; #endif + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start_total); #endif diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c index 095cd477dd3..f05ecbd84b5 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -90,6 +90,10 @@ void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id) = NULL; void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id) = NULL; +void (*opal_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count) = NULL; + +void (*opal_cuda_d2dcpy_p)(void* dst, const void* src, size_t count) = NULL; + #define OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN(handle, fname) \ do { \ char* _error; \ @@ -128,6 +132,8 @@ int32_t opal_datatype_gpu_init(void) OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_sync_device ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_free_gpu_buffer ); 
OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_malloc_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_d2dcpy_async ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_d2dcpy ); (*opal_datatype_cuda_init_p)(); opal_output( 0, "cuda init done\n"); @@ -152,6 +158,8 @@ int32_t opal_datatype_gpu_fini(void) opal_cuda_sync_device_p = NULL; opal_cuda_free_gpu_buffer_p = NULL; opal_cuda_malloc_gpu_buffer_p = NULL; + opal_cuda_d2dcpy_async_p = NULL; + opal_cuda_d2dcpy_p = NULL; dlclose(opal_datatype_cuda_handle); opal_datatype_cuda_handle = NULL; diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h index d50e2fe8d99..df42d68b6fc 100644 --- a/opal/datatype/opal_datatype_gpu.h +++ b/opal/datatype/opal_datatype_gpu.h @@ -53,4 +53,8 @@ extern void (*opal_cuda_sync_device_p)(void); extern void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); extern void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); + +extern void (*opal_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); + +extern void (*opal_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); #endif /* OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED */ diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 431610ff17f..1a38ec4c331 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -190,6 +190,7 @@ typedef uint8_t mca_btl_base_tag_t; #define MCA_BTL_TAG_SMCUDA (MCA_BTL_TAG_BTL + 2) #define MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK (MCA_BTL_TAG_BTL + 3) #define MCA_BTL_TAG_SMCUDA_DATATYPE_PACK (MCA_BTL_TAG_BTL + 4) +#define MCA_BTL_TAG_SMCUDA_DATATYPE_PUT (MCA_BTL_TAG_BTL + 5) /* prefered protocol */ #define MCA_BTL_FLAGS_SEND 0x0001 @@ -1181,6 +1182,8 @@ struct mca_btl_base_module_t { #endif /* OPAL_CUDA_GDR_SUPPORT */ #if OPAL_CUDA_SUPPORT size_t btl_cuda_max_send_size; /**< set if CUDA max send_size is different from host max send size */ + size_t 
btl_cuda_ddt_pipeline_size; + int btl_cuda_ddt_pipeline_depth; #endif /* OPAL_CUDA_SUPPORT */ }; typedef struct mca_btl_base_module_t mca_btl_base_module_t; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 14d0a3995ce..9d5d5441683 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -83,6 +83,13 @@ static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle); + +inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, uint8_t remote_device, uint8_t local_device); #endif mca_btl_smcuda_t mca_btl_smcuda = { @@ -402,7 +409,6 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, /* allocation will be for the fragment descriptor and payload buffer */ length = sizeof(mca_btl_smcuda_frag1_t); - printf("free list %d\n", mca_btl_smcuda_component.sm_free_list_num); length_payload = sizeof(mca_btl_smcuda_hdr_t) + mca_btl_smcuda_component.eager_limit; i = opal_free_list_init (&mca_btl_smcuda_component.sm_frags_eager, length, @@ -1147,11 +1153,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, } if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; - printf("RECEIVE REGT UNPACK, size %ld!!!!!!!!!!!\n", size); struct opal_convertor_t *convertor = &(recvreq->req_recv.req_base.req_convertor); - // size_t pipeline_size = remote_handle->reg_data.pipeline_size; - printf("i receive lindex %d, pack_required %d, remote_device %d, local_device %d\n", lindex, pack_required, remote_device, local_device); if (remote_device != local_device && 
!OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { convertor->gpu_buffer_ptr = NULL; @@ -1160,13 +1163,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, } cuda_ddt_hdr_t send_msg; if (pack_required) { - mca_btl_smcuda_cuda_ddt_unpack_clone(ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); - send_msg.lindex = lindex; - send_msg.packed_size = 0; - send_msg.seq = 0; - send_msg.msg_type = CUDA_PACK_TO_LOCAL_START; - mca_btl_smcuda_send_cuda_pack_sig(btl, ep, &send_msg); done = 0; } else { struct iovec iov; @@ -1174,9 +1172,9 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, size_t max_data; if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && remote_device != local_device) { convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(size, 0); - mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, remote_memory_address, size); + (*opal_cuda_d2dcpy_async_p)(convertor->gpu_buffer_ptr, remote_memory_address, size); iov.iov_base = convertor->gpu_buffer_ptr; - printf("start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, convertor->gpu_buffer_ptr, size); + opal_output(0, "start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, convertor->gpu_buffer_ptr, size); } else { iov.iov_base = convertor->gpu_buffer_ptr; } @@ -1186,31 +1184,28 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, done = 1; } } else { - printf("RECEIVE REGT CONTIGUOUS, size %ld !!!!!!!!!!!\n", size); recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; - cuda_ddt_hdr_t send_msg; if (pack_required) { - send_msg.lindex = lindex; - send_msg.packed_size = 0; - if (remote_device == local_device && OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { + if (remote_device == local_device || OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { /* now we are able to let sender pack directly to my memory */ - /* 
mca_mpool_common_cuda_reg_t loc_reg; + mca_mpool_common_cuda_reg_t loc_reg; mca_mpool_common_cuda_reg_t *loc_reg_ptr = &loc_reg; - cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL); - memcpy(send_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); - send_msg.seq = -9; - send_msg.msg_type = CUDA_PACK_TO_REMOTE_START; - send_msg.remote_address = local_address; - send_msg.remote_base = loc_reg.base.base; - mca_common_wait_stream_synchronize(&loc_reg); - printf("send r_addr %p, r_base %p\n", local_address, loc_reg.base.base);*/ + cuda_ddt_put_hdr_t put_msg; + if (OPAL_SUCCESS != cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL)) { + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); + } + memcpy(put_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); + put_msg.remote_address = local_address; + put_msg.remote_base = loc_reg.base.base; + put_msg.lindex = lindex; + mca_btl_smcuda_cuda_ddt_unpack_clone(ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, 0, 0); + mca_btl_smcuda_send_cuda_put_sig(btl, ep, &put_msg); } else { - send_msg.seq = 0; - send_msg.msg_type = CUDA_PACK_TO_LOCAL_START; + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); } - mca_btl_smcuda_cuda_ddt_unpack_clone(ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, - lindex, 0, 0); - mca_btl_smcuda_send_cuda_pack_sig(btl, ep, &send_msg); done = 0; } else { rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, @@ -1323,12 +1318,11 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, { mca_btl_smcuda_frag_t* frag; int rc; - cuda_ddt_hdr_t cuda_dt_hdr; /* allocate a fragment, giving up if we can't 
get one */ MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); if( OPAL_UNLIKELY(NULL == frag) ) { - printf("!!!!!!!!!! no frag \n"); + opal_output(0, "no frag for send unpack sig\n"); return OPAL_ERR_OUT_OF_RESOURCE;; } @@ -1337,7 +1331,6 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_ddt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); - printf("######## rank %d, send seq %d, endpoint %p\n", endpoint->my_smp_rank, send_msg->seq, endpoint); return rc; } @@ -1347,11 +1340,11 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, { mca_btl_smcuda_frag_t* frag; int rc; - cuda_ddt_hdr_t cuda_dt_hdr; /* allocate a fragment, giving up if we can't get one */ MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); if( OPAL_UNLIKELY(NULL == frag) ) { + opal_output(0, "no frag for send pack sig\n"); return OPAL_ERR_OUT_OF_RESOURCE;; } @@ -1363,30 +1356,44 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, return rc; } -int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor) +int mca_btl_smcuda_send_cuda_put_sig(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + cuda_ddt_put_hdr_t *put_msg) { - int i; - for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - if (endpoint->smcuda_ddt_pack_clone[i].convertor == convertor) { - return i; - } + mca_btl_smcuda_frag_t* frag; + int rc; + + /* allocate a fragment, giving up if we can't get one */ + MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); + if( OPAL_UNLIKELY(NULL == frag) ) { + opal_output(0, "no frag for send put sig\n"); + return OPAL_ERR_OUT_OF_RESOURCE;; } - return -1; -} -int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq) -{ - endpoint->smcuda_ddt_pack_clone[lindex].seq = seq; - return 0; + /* Fill in 
fragment fields. */ + frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + memcpy(frag->segment.seg_addr.pval, put_msg, sizeof(cuda_ddt_put_hdr_t)); + + rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_PUT); + return rc; } -int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex) +inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, uint8_t remote_device, uint8_t local_device) { - if (lindex >= SMCUDA_DT_CLONE_SIZE) { - return -9; - } else { - return endpoint->smcuda_ddt_pack_clone[lindex].seq; - } + cuda_ddt_hdr_t send_msg; + mca_btl_smcuda_cuda_ddt_unpack_clone(endpoint, convertor, remote_gpu_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); + send_msg.lindex = lindex; + send_msg.packed_size = 0; + send_msg.seq = 0; + send_msg.msg_type = CUDA_DDT_PACK_START; + opal_output(0, "smcuda btl start pack, remote_gpu_address %p, frag %p, lindex %d, remote_device %d, local_device %d\n", remote_gpu_address, frag, lindex, remote_device, local_device); + mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); } int mca_btl_smcuda_alloc_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint) @@ -1430,7 +1437,6 @@ void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint endpoint->smcuda_ddt_pack_clone[lindex].convertor = convertor; endpoint->smcuda_ddt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; endpoint->smcuda_ddt_pack_clone[lindex].lindex = lindex; - endpoint->smcuda_ddt_pack_clone[lindex].seq = -9; endpoint->smcuda_ddt_pack_clone[lindex].remote_device = remote_device; endpoint->smcuda_ddt_pack_clone[lindex].local_device = local_device; endpoint->smcuda_ddt_pack_clone[lindex].frag = frag; @@ -1445,7 
+1451,6 @@ void mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoi endpoint->smcuda_ddt_unpack_clone[lindex].convertor = convertor; endpoint->smcuda_ddt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; endpoint->smcuda_ddt_unpack_clone[lindex].lindex = lindex; - endpoint->smcuda_ddt_unpack_clone[lindex].seq = -9; endpoint->smcuda_ddt_unpack_clone[lindex].remote_device = remote_device; endpoint->smcuda_ddt_unpack_clone[lindex].local_device = local_device; endpoint->smcuda_ddt_unpack_clone[lindex].frag = frag; diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 46ae97b3909..288dc2027d3 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -513,14 +513,15 @@ enum ipcState { /* cuda datatype pack/unpack message */ typedef struct { + int lindex; int seq; int msg_type; - int lindex; int packed_size; } cuda_ddt_hdr_t; /* cuda datatype put message */ typedef struct { + int lindex; void *remote_address; void *remote_base; uint64_t mem_handle[8]; @@ -530,31 +531,25 @@ typedef struct { #define CUDA_DDT_COMPLETE 1 #define CUDA_DDT_COMPLETE_ACK 2 #define CUDA_DDT_CLEANUP 3 -#define CUDA_PACK_TO_LOCAL_START 4 -#define CUDA_PACK_TO_REMOTE_START 5 -#define CUDA_DDT_PACK_TO_BLOCK 6 -#define CUDA_UNPACK_NO 7 +#define CUDA_DDT_PACK_START 4 +#define CUDA_DDT_PACK_TO_BLOCK 5 +#define CUDA_UNPACK_NO 6 /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; void *remote_gpu_address; int lindex; - int seq; uint8_t remote_device; uint8_t local_device; mca_btl_base_descriptor_t *frag; } cuda_ddt_clone_t; #define SMCUDA_DT_CLONE_SIZE 20 -extern cuda_ddt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_hdr_t *send_msg); int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct 
mca_btl_base_endpoint_t* endpoint, cuda_ddt_hdr_t *send_msg); -int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor); -int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq); -int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex); -int mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(struct mca_btl_base_endpoint_t *endpoint, int lindex); +int mca_btl_smcuda_send_cuda_put_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_put_hdr_t *put_msg); int mca_btl_smcuda_alloc_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint); int mca_btl_smcuda_alloc_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint); void mca_btl_smcuda_free_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 4f46b8a5beb..51be3eafafa 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -173,6 +173,9 @@ static int smcuda_register(void) #else /* OPAL_CUDA_SUPPORT */ mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW; #endif /* OPAL_CUDA_SUPPORT */ + mca_btl_smcuda.super.btl_cuda_ddt_pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; + printf("pipeline size %lu\n", mca_btl_smcuda.super.btl_cuda_ddt_pipeline_size); + mca_btl_smcuda.super.btl_cuda_ddt_pipeline_depth = 4; mca_btl_smcuda.super.btl_eager_limit = 4*1024; mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024; mca_btl_smcuda.super.btl_max_send_size = 32*1024; @@ -848,6 +851,7 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl, } } +/* for receiver */ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata) @@ -868,7 +872,6 @@ static void 
btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, my_cuda_dt_clone = &endpoint->smcuda_ddt_unpack_clone[lindex]; assert(my_cuda_dt_clone->lindex == lindex); - printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", endpoint->my_smp_rank, seq, lindex); cuda_ddt_hdr_t send_msg; send_msg.lindex = lindex; @@ -887,14 +890,14 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, if (convertor == NULL) { /* do not unpack */ mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; unsigned char *local_address = (unsigned char*)frag_recv->segment.seg_addr.pval; - printf("D2D local %p, remote %p, size %ld\n", local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address+seq*pipeline_size, packed_size); + opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld\n", local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address+seq*pipeline_size, packed_size); mca_common_cuda_memp2pcpy(local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); } else { /* unpack */ if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(packed_size, 0); - mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); + (*opal_cuda_d2dcpy_async_p)(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); iov.iov_base = convertor->gpu_buffer_ptr; - printf("start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, packed_size); + opal_output(0, "unpack, start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, packed_size); } else { iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; } 
@@ -917,9 +920,9 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, } mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); } - // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } +/* for sender */ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata) @@ -944,7 +947,6 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; my_cuda_dt_clone = &endpoint->smcuda_ddt_pack_clone[lindex]; - printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", endpoint->my_smp_rank, seq, lindex); struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; send_msg.lindex = lindex; if (msg_type == CUDA_DDT_COMPLETE_ACK) { @@ -958,7 +960,6 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } mca_btl_smcuda_free_cuda_ddt_pack_clone(endpoint, lindex); } else if (msg_type == CUDA_DDT_PACK_TO_BLOCK) { - printf("i receive a message pack to seq, packed %ld, pipeline_size %ld\n", convertor->bConverted, mca_btl_smcuda_component.cuda_ddt_pipeline_size); if (convertor->bConverted < convertor->local_size) { struct iovec iov; iov.iov_base = convertor->gpu_buffer_ptr + seq * mca_btl_smcuda_component.cuda_ddt_pipeline_size;; @@ -974,28 +975,8 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); } - } else { - mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; - if (msg_type == CUDA_PACK_TO_REMOTE_START) { /* receiver is contiguous, and ask me to pack directly to his gpu memory */ -/* opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); - mca_mpool_common_cuda_reg_t rget_reg; - rget_reg_ptr= &rget_reg; - memset(&rget_reg, 0, sizeof(rget_reg)); - memcpy(rget_reg.data.memHandle, cuda_dt_hdr.mem_handle, sizeof(cuda_dt_hdr.mem_handle)); - cuda_openmemhandle(NULL, 0, (mca_mpool_base_registration_t *)&rget_reg, 
NULL); - mca_common_wait_stream_synchronize(&rget_reg); - size_t offset = (size_t) ((intptr_t) cuda_dt_hdr.remote_address - (intptr_t) cuda_dt_hdr.remote_base); - unsigned char *remote_memory_address = (unsigned char *)rget_reg_ptr->base.alloc_base + offset; - convertor->gpu_buffer_ptr = remote_memory_address; - printf("remote_memory_address $$$$$$ %p, r_addr %p, r_base %p\n", remote_memory_address, cuda_dt_hdr.remote_address, cuda_dt_hdr.remote_base); - send_msg.msg_type = CUDA_UNPACK_NO; - convertor->gpu_buffer_size = convertor->local_size;*/ - } else { - send_msg.msg_type = CUDA_DDT_UNPACK_FROM_BLOCK; - } + } else if (msg_type == CUDA_DDT_PACK_START) { struct iovec iov; - packed_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; - printf("Pipeline_size %ld\n", packed_size); iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; iov.iov_base = convertor->gpu_buffer_ptr; seq = 0; @@ -1007,16 +988,65 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, send_msg.seq = seq; if (rv_dt == 1) { send_msg.msg_type = CUDA_DDT_COMPLETE; + } else { + send_msg.msg_type = CUDA_DDT_UNPACK_FROM_BLOCK; } mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); seq ++; } - - if (rget_reg_ptr != NULL) { /* close memhandle */ - cuda_closememhandle(NULL, (mca_mpool_base_registration_t *)rget_reg_ptr); - } + } else { + opal_output(0, "unknown message\n"); } - // MCA_BTL_SMCUDA_FRAG_RETURN(frag); +} + +/* for sender */ +static void btl_smcuda_datatype_put(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, void* cbdata) +{ + struct mca_btl_base_endpoint_t *endpoint; + cuda_ddt_put_hdr_t recv_msg; + mca_btl_base_segment_t* segments = des->des_segments; + memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_put_hdr_t)); + int lindex = recv_msg.lindex; + void *remote_address = recv_msg.remote_address; + void *remote_base = recv_msg.remote_base; + mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; + 
cuda_ddt_clone_t *my_cuda_dt_clone; + cuda_ddt_hdr_t send_msg; + + /* We can find the endoint back from the rank embedded in the header */ + endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; + my_cuda_dt_clone = &endpoint->smcuda_ddt_pack_clone[lindex]; + struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; + + opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; + mca_mpool_common_cuda_reg_t rget_reg; + rget_reg_ptr= &rget_reg; + memset(&rget_reg, 0, sizeof(rget_reg)); + memcpy(rget_reg.data.memHandle, recv_msg.mem_handle, sizeof(recv_msg.mem_handle)); + cuda_openmemhandle(NULL, 0, (mca_mpool_base_registration_t *)&rget_reg, NULL); + size_t offset = (size_t) ((intptr_t)remote_address - (intptr_t)remote_base); + unsigned char *remote_memory_address = (unsigned char *)rget_reg_ptr->base.alloc_base + offset; + convertor->gpu_buffer_ptr = remote_memory_address; + opal_output(0, "smcuda start put, remote_memory_address $$$$$$ %p, r_addr %p, r_base %p\n", remote_memory_address, remote_address, remote_base); + convertor->gpu_buffer_size = convertor->local_size; + + struct iovec iov; + uint32_t iov_count = 1; + int rv_dt = 0; + size_t max_data = 0; + iov.iov_len = convertor->local_size; + iov.iov_base = convertor->gpu_buffer_ptr; + rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + assert(rv_dt == 1); + send_msg.lindex = lindex; + send_msg.packed_size = 0; + send_msg.seq = -2; + send_msg.msg_type = CUDA_DDT_CLEANUP; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + mca_btl_smcuda_free_cuda_ddt_pack_clone(endpoint, lindex); } #endif /* OPAL_CUDA_SUPPORT */ @@ -1137,6 +1167,8 @@ mca_btl_smcuda_component_init(int *num_btls, mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK].cbdata = NULL; mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbfunc = btl_smcuda_datatype_pack; 
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbdata = NULL; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PUT].cbfunc = btl_smcuda_datatype_put; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PUT].cbdata = NULL; #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 2554c445302..0d758126de7 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -1058,7 +1058,6 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne "CUDA: cuMemGetAddressRange passed: addr=%p, size=%d, pbase=%p, psize=%d ", base, (int)size, (void *)pbase, (int)psize); } - printf("sizeof memhandle %lu, CUipcMemHandle %lu, cuEvent %lu, char %lu\n", sizeof(memHandle), sizeof(CUipcMemHandle), sizeof(CUevent), sizeof(char)); /* Store all the information in the registration */ cuda_reg->base.base = (void *)pbase; From a3f79aaefb5d1cf5e218809609341cb1b81b23a0 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 26 Oct 2015 22:21:15 -0400 Subject: [PATCH 033/190] less bugs Conflicts: ompi/mca/pml/monitoring/pml_monitoring_component.c opal/mca/mpool/gpusm/mpool_gpusm.h --- opal/datatype/cuda/opal_datatype_cuda.cu | 121 +++++++++--------- .../cuda/opal_datatype_cuda_internal.cuh | 2 +- .../cuda/opal_datatype_orig_internal.h | 3 - .../cuda/opal_datatype_pack_cuda_kernel.cu | 6 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 31 ++--- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 5 +- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 63 +++++---- opal/datatype/opal_convertor.c | 8 +- opal/datatype/opal_datatype_gpu.c | 2 + opal/datatype/opal_datatype_pack.c | 8 +- opal/mca/common/cuda/common_cuda.c | 4 +- opal/mca/mpool/gpusm/mpool_gpusm.h | 4 +- test/datatype/Makefile.am | 2 +- test/datatype/ddt_lib.h | 1 + 14 files changed, 125 insertions(+), 135 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu 
b/opal/datatype/cuda/opal_datatype_cuda.cu index 0f0d52d558b..18706fe0f78 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -142,21 +142,27 @@ static inline void cuda_list_insert_before(ddt_cuda_list_t *list, ddt_cuda_buffe list->nb_elements ++; } -static inline void cuda_list_item_merge_by_addr(ddt_cuda_list_t *list) +/** + * Collapse the list of free buffers by mergining consecutive buffers. As the property of this list + * is continously maintained, we only have to parse it up to the newest inserted elements. + */ +static inline void cuda_list_item_merge_by_addr(ddt_cuda_list_t *list, ddt_cuda_buffer_t* last) { - ddt_cuda_buffer_t *ptr = NULL; + ddt_cuda_buffer_t *current = list->head; ddt_cuda_buffer_t *next = NULL; - ptr = list->head; - while(ptr != NULL) { - next = ptr->next; - if (next == NULL) { - break; - } else if ((ptr->gpu_addr + ptr->size) == next->gpu_addr) { - ptr->size += next->size; + void* stop_addr = last->gpu_addr; + + while(1) { /* loop forever, the exit conditions are inside */ + if( NULL == (next = current->next) ) return; + if ((current->gpu_addr + current->size) == next->gpu_addr) { + current->size += next->size; cuda_list_delete(list, next); - } else { - ptr = ptr->next; + free(next); /* release the element, and try to continue merging */ + continue; } + current = current->next; + if( NULL == current ) return; + if( current->gpu_addr > stop_addr ) return; } } @@ -210,6 +216,7 @@ void opal_datatype_cuda_init(void) cuda_device[i].buffer_used.nb_elements = 0; } + /* init cuda stream */ cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); for (i = 0; i < NB_STREAMS; i++) { @@ -222,7 +229,8 @@ void opal_datatype_cuda_init(void) /* only for iov version */ for (i = 0; i < NB_STREAMS; i++) { - cudaMalloc((void **)(&cuda_iov_dist_d[i]), sizeof(ddt_cuda_iov_dist_t)*CUDA_MAX_NB_BLOCKS); + cudaMallocHost((void **)(&cuda_iov_dist_h[i]), 
sizeof(ddt_cuda_iov_dist_t)*CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&cuda_iov_dist_d[i]), sizeof(ddt_cuda_iov_dist_t)*CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } // /* init size for double, float, char */ @@ -245,6 +253,7 @@ void opal_datatype_cuda_fini(void) /* only for iov version */ for (i = 0; i < NB_STREAMS; i++) { + cudaFreeHost(cuda_iov_dist_h[i]); cudaFree(cuda_iov_dist_d[i]); } } @@ -279,72 +288,60 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) DT_CUDA_DEBUG( opal_cuda_output( 0, "No GPU buffer at dev_id %d.\n", dev_id); ); return NULL; } - ddt_cuda_buffer_t *ptr = NULL; - void *addr = NULL; - ptr = device->buffer_free.head; + ddt_cuda_buffer_t *ptr = device->buffer_free.head; while (ptr != NULL) { - if (ptr->size >= size) { - addr = ptr->gpu_addr; - ptr->size -= size; - if (ptr->size == 0) { - cuda_list_delete(&device->buffer_free, ptr); - obj_ddt_cuda_buffer_reset(ptr); - cuda_list_push_head(cuda_free_list, ptr); - } else { - ptr->gpu_addr += size; - } - break; + if (ptr->size < size) { /* Not enough room in this buffer, check next */ + ptr = ptr->next; + continue; } - ptr = ptr->next; - } - - if (ptr == NULL) { - return NULL; - } else { - ddt_cuda_buffer_t *p = cuda_list_pop_tail(cuda_free_list); - if (p == NULL) { - p = obj_ddt_cuda_buffer_new(); + void *addr = ptr->gpu_addr; + ptr->size -= size; + if (ptr->size == 0) { + cuda_list_delete(&device->buffer_free, ptr); + obj_ddt_cuda_buffer_reset(ptr); + /* hold on this ptr object, we will reuse it right away */ + } else { + ptr->gpu_addr += size; + ptr = cuda_list_pop_tail(cuda_free_list); + if( NULL == ptr ) + ptr = obj_ddt_cuda_buffer_new(); } - p->size = size; - p->gpu_addr = (unsigned char*)addr; - cuda_list_push_head(&device->buffer_used, p); + assert(NULL != ptr); + ptr->size = size; + ptr->gpu_addr = (unsigned char*)addr; + cuda_list_push_head(&device->buffer_used, ptr); device->buffer_used_size += size; device->buffer_free_size -= size; 
DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc GPU buffer %p, dev_id %d.\n", addr, dev_id); ); return addr; } + return NULL; } void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) { ddt_cuda_device_t *device = &cuda_device[gpu_id]; - ddt_cuda_buffer_t *ptr = NULL; - ddt_cuda_buffer_t *ptr_next = NULL; - ptr = device->buffer_used.head; - while (ptr != NULL) { - if (ptr->gpu_addr == addr) { - cuda_list_delete(&device->buffer_used, ptr); - ptr_next = device->buffer_free.head; - while (ptr_next != NULL) { - if (ptr_next->gpu_addr > addr) { - break; - } - ptr_next = ptr_next->next; - } - if (ptr_next == NULL) { - /* buffer_free is empty, or insert to last one */ - cuda_list_push_tail(&device->buffer_free, ptr); - } else { - cuda_list_insert_before(&device->buffer_free, ptr, ptr_next); - } - cuda_list_item_merge_by_addr(&device->buffer_free); - device->buffer_free_size += ptr->size; + ddt_cuda_buffer_t *ptr = device->buffer_used.head; + + /* Find the holder of this GPU allocation */ + for( ; (NULL != ptr) && (ptr->gpu_addr != addr); ptr = ptr->next ); + if (NULL == ptr) { /* we could not find it. 
something went wrong */ + DT_CUDA_DEBUG( opal_cuda_output( 0, "addr %p is not managed.\n", addr); ); + return; + } + cuda_list_delete(&device->buffer_used, ptr); + /* Insert the element in the list of free buffers ordered by the addr */ + ddt_cuda_buffer_t *ptr_next = device->buffer_free.head; + while (ptr_next != NULL) { + if (ptr_next->gpu_addr > addr) { break; } - ptr = ptr->next; + ptr_next = ptr_next->next; } - if (ptr == NULL) { - DT_CUDA_DEBUG( opal_cuda_output( 0, "addr %p is not managed.\n", addr); ); + if (ptr_next == NULL) { /* buffer_free is empty, or insert to last one */ + cuda_list_push_tail(&device->buffer_free, ptr); + } else { + cuda_list_insert_before(&device->buffer_free, ptr, ptr_next); } size_t size = ptr->size; cuda_list_item_merge_by_addr(&device->buffer_free, ptr); diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 268554126ab..fe49449f976 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -30,7 +30,7 @@ #define CUDA_NB_IOV 1024*20 #define CUDA_IOV_LEN 1024*1204 #define CUDA_MAX_NB_BLOCKS 1024 -#define CUDA_IOV_MAX_TASK_PER_BLOCK 10 +#define CUDA_IOV_MAX_TASK_PER_BLOCK 400 #define ALIGNMENT_DOUBLE 8 #define ALIGNMENT_FLOAT 4 #define ALIGNMENT_CHAR 1 diff --git a/opal/datatype/cuda/opal_datatype_orig_internal.h b/opal/datatype/cuda/opal_datatype_orig_internal.h index 90561359f75..4dde12d235d 100644 --- a/opal/datatype/cuda/opal_datatype_orig_internal.h +++ b/opal/datatype/cuda/opal_datatype_orig_internal.h @@ -5,9 +5,6 @@ #include "opal_config.h" -/* original OMPI */ -#define OPAL_DECLSPEC - #define OPAL_PTRDIFF_TYPE ptrdiff_t #define DT_STATIC_STACK_SIZE 5 /**< This should be sufficient for most applications */ diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index a58b831b78b..dd9af2a5a7e 100644 --- 
a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -81,7 +81,11 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c if (threadIdx.x == 0) { //printf("iov pack kernel \n"); - nb_tasks = cuda_iov_dist[blockIdx.x].nb_tasks; + nb_tasks = nb_blocks_used / gridDim.x; + if (blockIdx.x < (nb_blocks_used % gridDim.x)) { + nb_tasks ++; + } + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); } __syncthreads(); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 381aaf99ae8..0a51f66d877 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -443,7 +443,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // int i; // for (i = 0; i < 4; i++) { // opal_empty_kernel<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + // pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); // } #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -640,7 +640,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor size_t* max_data ) { uint32_t i, j; - uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, residue_desc; + uint32_t count_desc, nb_blocks_per_description, residue_desc; uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec, dst_offset; unsigned char *destination, *destination_base, *source_base; @@ -736,12 +736,10 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor dst_offset = 0; 
thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - nb_blocks_used = 0; while (cuda_iov_count > 0) { - current_block = 0; - task_iteration = 0; + nb_blocks_used = 0; cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; source_base = (unsigned char*)cuda_iov[0].iov_base; @@ -749,9 +747,6 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - for (i = 0; i < nb_blocks; i++) { - cuda_iov_dist_h_current[i].nb_tasks = 0; - } for (i = 0; i < cuda_iov_count; i++) { /* pElem = &(description[orig_stack_index+i]);*/ @@ -786,21 +781,17 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = thread_per_block;// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block;// * sizeof(double); } else { - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = count_desc - j*thread_per_block; //(thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = count_desc - j*thread_per_block; //(thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", 
nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; - if (current_block >= nb_blocks) { - current_block = 0; - task_iteration ++; - assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); - } + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } /* handle residue */ @@ -812,16 +803,12 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; - if (current_block >= nb_blocks) { - current_block = 0; - task_iteration ++; - assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); - } + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } if (buffer_isfull) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 2ea3bb59885..a23aff7710c 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ 
-16,7 +16,10 @@ __global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* __shared__ uint32_t nb_tasks; if (threadIdx.x == 0) { - nb_tasks = cuda_iov_dist[blockIdx.x].nb_tasks; + nb_tasks = nb_blocks_used / gridDim.x; + if (blockIdx.x < nb_blocks_used % gridDim.x) { + nb_tasks ++; + } } __syncthreads(); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 4ee73897f68..696a2c12694 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -26,7 +26,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon uint32_t count_desc_tmp; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end, start_total, end_total; + TIMER_DATA_TYPE start, end; long total_time; #endif @@ -350,13 +350,13 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv } int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { uint32_t i, j; - uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, dst_offset, residue_desc; - uint32_t nb_blocks, thread_per_block; + uint32_t count_desc, nb_blocks_per_description, dst_offset, residue_desc; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *source, *source_base, *destination_base; size_t total_unpacked, total_converted; @@ -372,7 +372,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; - + #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; long total_time, move_time; @@ -410,7 +410,6 @@ int32_t 
opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert free_required = 1; } } - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV, GPU base %p, unpack from buffer %p, total size %ld\n", pConvertor->pBaseBuf, source, iov[0].iov_len); ); @@ -440,15 +439,14 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); #endif - + dst_offset = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - + while (cuda_iov_count > 0) { - - current_block = 0; - task_iteration = 0; + + nb_blocks_used = 0; cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; destination_base = (unsigned char*)cuda_iov[0].iov_base; @@ -456,10 +454,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - for (i = 0; i < nb_blocks; i++) { - cuda_iov_dist_h_current[i].nb_tasks = 0; - } - + for (i = 0; i < cuda_iov_count; i++) { // pElem = &(description[orig_stack_index+i]); if (buffer_size >= cuda_iov[i].iov_len) { @@ -472,7 +467,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert } buffer_size -= length_per_iovec; total_unpacked += length_per_iovec; - + /* check alignment */ if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)source % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { alignment = ALIGNMENT_DOUBLE; @@ -482,6 +477,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert alignment = ALIGNMENT_CHAR; } + //alignment = ALIGNMENT_DOUBLE; + count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; @@ -491,18 
+488,18 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = thread_per_block;// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block;// * sizeof(double); } else { - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; } - + /* handle residue */ if (residue_desc != 0) { /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ @@ -512,19 +509,19 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert 
(cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; } - + if (buffer_isfull) { break; } } -#if defined(OPAL_DATATYPE_CUDA_TIMING) +#if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d\n", source_base, total_time, cuda_streams->current_stream_id); ); @@ -533,8 +530,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); opal_generic_simple_unpack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); cuda_streams->current_stream_id ++; - cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; - + cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + /* buffer is full */ if (buffer_isfull) { size_t total_converted_tmp = total_converted; @@ -546,7 +543,6 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); - convertor_flags = pConvertor->flags; #endif convertor_flags = pConvertor->flags; 
// orig_stack_index = pStack->index; @@ -559,6 +555,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif } + // cudaDeviceSynchronize(); for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } @@ -573,7 +570,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert total_time = ELAPSED_TIME( start_total, end_total ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); #endif - + if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; if (pConvertor->gpu_buffer_ptr != NULL && free_required) { @@ -581,8 +578,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert pConvertor->gpu_buffer_ptr = NULL; } return 1; - } - return 0; + } + return 0; } void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, @@ -616,13 +613,13 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); -#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; *(SOURCE) = *(SOURCE) + _copy_loops * _end_loop->size; *(SPACE) -= _copy_loops * _end_loop->size; *(COUNT) -= _copy_loops; #endif - + cudaDeviceSynchronize(); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 7a8448afbde..3e0ac066c84 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -563,8 +563,8 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, if 
(opal_datatype_gpu_init() != OPAL_SUCCESS) { opal_datatype_gpu_fini(); } -#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ -#endif +#endif /* OPAL_DATATYPE_CUDA_KERNEL */ +#endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -612,8 +612,8 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, if (opal_datatype_gpu_init() != OPAL_SUCCESS) { opal_datatype_gpu_fini(); } -#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ -#endif +#endif /* OPAL_DATATYPE_CUDA_KERNEL */ +#endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c index f05ecbd84b5..4e516766737 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -22,7 +22,9 @@ #include "opal_config.h" #include +#include #include +#include #include "opal/mca/installdirs/installdirs.h" #include "opal/datatype/opal_convertor_internal.h" diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 372d5a1291a..5a5a2470cb1 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -290,7 +290,7 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, (void*)pConvertor, (void*)pConvertor->pBaseBuf, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); - printf("I am in simple pack, max_data %lu, iov_len %lu\n", *max_data, iov[0].iov_len); + opal_output(0, "I am in simple pack, max_data %lu, iov_len %lu\n", *max_data, iov[0].iov_len); description = pConvertor->use_desc->desc; /* For the first step we have to add both displacement to the source. 
After in the @@ -390,7 +390,7 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, *out_size = iov_count; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - printf("total packed %lu\n", pConvertor->bConverted); + opal_output(0, "total packed %lu\n", pConvertor->bConverted); // double *vtmp = (double *)iov[0].iov_base; // for (uint32_t i = 0; i < total_packed/8; i++) { // printf(" %1.f ", *vtmp); @@ -425,8 +425,8 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, // return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { if (opal_generic_simple_pack_function_cuda_vector_p != NULL) { - // return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); - return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); + return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + // return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); } } else { if (opal_generic_simple_pack_function_cuda_iov_p != NULL) { diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 0d758126de7..04c333efd1b 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -1924,7 +1924,9 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t if (!stage_three_init_complete) { if (0 != mca_common_cuda_stage_three_init()) { opal_cuda_support = 0; - } + } else { + opal_datatype_gpu_init(); + } } return 1; diff --git a/opal/mca/mpool/gpusm/mpool_gpusm.h b/opal/mca/mpool/gpusm/mpool_gpusm.h index 537c95108a8..5d3d02b5110 100644 --- a/opal/mca/mpool/gpusm/mpool_gpusm.h +++ b/opal/mca/mpool/gpusm/mpool_gpusm.h @@ -41,8 +41,8 @@ struct mca_mpool_gpusm_registration_t { uint64_t 
evtHandle[EVTHANDLE_SIZE]; /* CUipcEventHandle */ uintptr_t event; /* CUevent */ }; -typedef struct mca_mpool_gpusm_registration_t mca_mpool_gpusm_registration_t; -OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_mpool_gpusm_registration_t); +typedef struct mca_mpool_gpusm_registration_t mca_mpool_gpusm_registration_t; +OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_mpool_gpusm_registration_t); struct mca_mpool_gpusm_component_t { mca_mpool_base_component_t super; diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index 186fdd1c1bb..e516e08ae6f 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -15,7 +15,7 @@ if PROJECT_OMPI MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw unpack_ooo ddt_pack ddt_benchmark - MPI_CHECKS = to_self ddt_pack + MPI_CHECKS = to_self endif TESTS = opal_datatype_test $(MPI_TESTS) diff --git a/test/datatype/ddt_lib.h b/test/datatype/ddt_lib.h index 539434f9525..0f6bbc2cb37 100644 --- a/test/datatype/ddt_lib.h +++ b/test/datatype/ddt_lib.h @@ -96,5 +96,6 @@ extern ompi_datatype_t* create_strange_dt( void ); extern ompi_datatype_t* create_contiguous_type( const ompi_datatype_t* data, int count ); extern ompi_datatype_t* create_vector_type( const ompi_datatype_t* data, int count, int length, int stride ); +extern ompi_datatype_t* create_struct_constant_gap_resized_ddt( ompi_datatype_t* type ); extern ompi_datatype_t* create_struct_type(int count); From 8acf2543d2e46e4a342cc0d54c39d5f1779aa0e3 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 27 Oct 2015 01:24:23 -0400 Subject: [PATCH 034/190] fix pipelining for non-contiguous to contiguous --- opal/mca/btl/smcuda/btl_smcuda.c | 16 +++++++++++----- opal/mca/btl/smcuda/btl_smcuda.h | 1 + opal/mca/btl/smcuda/btl_smcuda_component.c | 21 ++++++++++++--------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 9d5d5441683..e53449e82eb 100644 --- 
a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1123,7 +1123,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, offset = (size_t) ((intptr_t) remote_address - (intptr_t) reg_ptr->base.base); remote_memory_address = (unsigned char *)reg_ptr->base.alloc_base + offset; if (0 != offset) { - printf("!!!!!!offset %lu, ra %p, base %p\n", offset, (void*)remote_address, (void*)reg_ptr->base.base); + printf("!!!!!!offset %lu, ra %p, base %p, remote %p\n", offset, (void*)remote_address, (void*)reg_ptr->base.base, remote_memory_address); opal_output(-1, "OFFSET=%d", (int)offset); } @@ -1151,10 +1151,12 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); return rc; } + struct opal_convertor_t *convertor = NULL; if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; - struct opal_convertor_t *convertor = &(recvreq->req_recv.req_base.req_convertor); + convertor = &(recvreq->req_recv.req_base.req_convertor); + printf("local addr %p, pbase %p\n", local_address, convertor->pBaseBuf); if (remote_device != local_device && !OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { convertor->gpu_buffer_ptr = NULL; @@ -1181,29 +1183,31 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, iov.iov_len = size; max_data = size; opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); done = 1; } } else { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; if (pack_required) { + convertor = &(recvreq->req_recv.req_base.req_convertor); if (remote_device == local_device || OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { /* now we are able to let sender pack directly to my memory */ mca_mpool_common_cuda_reg_t loc_reg; mca_mpool_common_cuda_reg_t *loc_reg_ptr = &loc_reg; cuda_ddt_put_hdr_t put_msg; if (OPAL_SUCCESS 
!= cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL)) { - mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); } memcpy(put_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); put_msg.remote_address = local_address; put_msg.remote_base = loc_reg.base.base; put_msg.lindex = lindex; - mca_btl_smcuda_cuda_ddt_unpack_clone(ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_ddt_unpack_clone(ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, 0, 0); mca_btl_smcuda_send_cuda_put_sig(btl, ep, &put_msg); } else { - mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); } done = 0; @@ -1435,6 +1439,7 @@ void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_ddt_pack_clone[lindex].convertor = convertor; + endpoint->smcuda_ddt_pack_clone[lindex].current_convertor_pBaseBuf = convertor->pBaseBuf; endpoint->smcuda_ddt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; endpoint->smcuda_ddt_pack_clone[lindex].lindex = lindex; endpoint->smcuda_ddt_pack_clone[lindex].remote_device = remote_device; @@ -1449,6 +1454,7 @@ void mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoi int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_ddt_unpack_clone[lindex].convertor = convertor; + endpoint->smcuda_ddt_unpack_clone[lindex].current_convertor_pBaseBuf = convertor->pBaseBuf; 
endpoint->smcuda_ddt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; endpoint->smcuda_ddt_unpack_clone[lindex].lindex = lindex; endpoint->smcuda_ddt_unpack_clone[lindex].remote_device = remote_device; diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 288dc2027d3..26dbcb34b2d 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -538,6 +538,7 @@ typedef struct { /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; + unsigned char *current_convertor_pBaseBuf; void *remote_gpu_address; int lindex; uint8_t remote_device; diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 51be3eafafa..3d8d01c90a1 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -887,12 +887,16 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, size_t max_data; struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; size_t pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; - if (convertor == NULL) { /* do not unpack */ + convertor->flags &= ~CONVERTOR_CUDA; + if (opal_convertor_need_buffers(convertor) == false) { /* do not unpack */ + convertor->flags |= CONVERTOR_CUDA; mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; - unsigned char *local_address = (unsigned char*)frag_recv->segment.seg_addr.pval; - opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld\n", local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address+seq*pipeline_size, packed_size); - mca_common_cuda_memp2pcpy(local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); + unsigned char *local_address = my_cuda_dt_clone->current_convertor_pBaseBuf; + opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld\n", local_address, 
my_cuda_dt_clone->remote_gpu_address+seq*pipeline_size, packed_size); + mca_common_cuda_memp2pcpy(local_address, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); + my_cuda_dt_clone->current_convertor_pBaseBuf += packed_size; } else { /* unpack */ + convertor->flags |= CONVERTOR_CUDA; if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(packed_size, 0); (*opal_cuda_d2dcpy_async_p)(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); @@ -912,7 +916,6 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, } } send_msg.seq = seq; - send_msg.packed_size = packed_size; if (msg_type == CUDA_DDT_COMPLETE) { send_msg.msg_type = CUDA_DDT_COMPLETE_ACK; } else { @@ -934,7 +937,6 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, int seq = recv_msg.seq; int lindex = recv_msg.lindex; int msg_type = recv_msg.msg_type; - size_t packed_size = recv_msg.packed_size; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_ddt_clone_t *my_cuda_dt_clone; cuda_ddt_hdr_t send_msg; @@ -942,6 +944,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, uint32_t iov_count = 1; int rv_dt = 0; size_t max_data = 0; + size_t packed_size = 0; /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; @@ -962,8 +965,8 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } else if (msg_type == CUDA_DDT_PACK_TO_BLOCK) { if (convertor->bConverted < convertor->local_size) { struct iovec iov; - iov.iov_base = convertor->gpu_buffer_ptr + seq * mca_btl_smcuda_component.cuda_ddt_pipeline_size;; - iov.iov_len = packed_size; + iov.iov_base = convertor->gpu_buffer_ptr + seq * mca_btl_smcuda_component.cuda_ddt_pipeline_size; + iov.iov_len = 
mca_btl_smcuda_component.cuda_ddt_pipeline_size; rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); packed_size = max_data; send_msg.packed_size = packed_size; @@ -977,8 +980,8 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } } else if (msg_type == CUDA_DDT_PACK_START) { struct iovec iov; - iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; iov.iov_base = convertor->gpu_buffer_ptr; + iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; seq = 0; while (rv_dt != 1 && convertor->gpu_buffer_size > 0) { rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); From fe8890109db588fb44a8f8183e10e66fa76ab0b5 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Tue, 27 Oct 2015 18:30:10 -0400 Subject: [PATCH 035/190] opal_datatype is chnaged, so we need more space --- ompi/datatype/ompi_datatype.h | 2 +- opal/mca/btl/smcuda/btl_smcuda.c | 29 ++++++++++++++--------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/ompi/datatype/ompi_datatype.h b/ompi/datatype/ompi_datatype.h index 17e1632e07d..9ff0719867c 100644 --- a/ompi/datatype/ompi_datatype.h +++ b/ompi/datatype/ompi_datatype.h @@ -94,7 +94,7 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_datatype_t); /* Using set constant for padding of the DATATYPE handles because the size of * base structure is very close to being the same no matter the bitness. 
*/ -#define PREDEFINED_DATATYPE_PAD (512) +#define PREDEFINED_DATATYPE_PAD (1024) struct ompi_predefined_datatype_t { struct ompi_datatype_t dt; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index e53449e82eb..be8df760f4f 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -72,9 +72,9 @@ #include "btl_smcuda_frag.h" #include "btl_smcuda_fifo.h" -#include "ompi/mca/pml/ob1/pml_ob1_recvreq.h" +#include "ompi/mca/bml/bml.h" #include "ompi/mca/pml/ob1/pml_ob1_rdmafrag.h" - +#include "ompi/mca/pml/base/pml_base_request.h" #if OPAL_CUDA_SUPPORT static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( @@ -1136,26 +1136,25 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, /* datatype RDMA */ mca_pml_ob1_rdma_frag_t *frag_ob1 = cbdata; - mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag_ob1->rdma_req; mca_bml_base_btl_t *bml_btl = frag_ob1->rdma_bml; - - if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) && + mca_pml_base_request_t *req = (mca_pml_base_request_t*) frag_ob1->rdma_req; + opal_convertor_t* convertor = &req->req_convertor; + + if ((convertor->flags & CONVERTOR_CUDA) && (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) { - recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; + convertor->flags &= ~CONVERTOR_CUDA; uint8_t pack_required = remote_handle->reg_data.pack_required; uint32_t lindex = remote_handle->reg_data.lindex; uint8_t remote_device = remote_handle->reg_data.gpu_device; - uint8_t local_device = 0; + int32_t local_device = 0; rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); return rc; } - struct opal_convertor_t *convertor = NULL; - if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { - recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; + 
if(opal_convertor_need_buffers(convertor) == true) { + convertor->flags |= CONVERTOR_CUDA; - convertor = &(recvreq->req_recv.req_base.req_convertor); printf("local addr %p, pbase %p\n", local_address, convertor->pBaseBuf); if (remote_device != local_device && !OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { @@ -1163,7 +1162,6 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, } else { convertor->gpu_buffer_ptr = remote_memory_address; } - cuda_ddt_hdr_t send_msg; if (pack_required) { mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); @@ -1187,9 +1185,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, done = 1; } } else { - recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; + convertor->flags |= CONVERTOR_CUDA; if (pack_required) { - convertor = &(recvreq->req_recv.req_base.req_convertor); if (remote_device == local_device || OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { /* now we are able to let sender pack directly to my memory */ mca_mpool_common_cuda_reg_t loc_reg; @@ -1396,8 +1393,10 @@ inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_ send_msg.packed_size = 0; send_msg.seq = 0; send_msg.msg_type = CUDA_DDT_PACK_START; - opal_output(0, "smcuda btl start pack, remote_gpu_address %p, frag %p, lindex %d, remote_device %d, local_device %d\n", remote_gpu_address, frag, lindex, remote_device, local_device); + opal_output(0, "smcuda btl start pack, remote_gpu_address %p, frag %p, lindex %d, remote_device %d, local_device %d\n", + (void*)remote_gpu_address, (void*)frag, lindex, remote_device, local_device); mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); + return OPAL_SUCCESS; } int mca_btl_smcuda_alloc_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint) From 688a423854064af73a7496ef09fca9a058b4a81b Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Tue, 27 Oct 2015 18:52:03 -0400 Subject: [PATCH 
036/190] reorder datatypes to cache boundaries --- opal/datatype/opal_datatype.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index bec40665d15..5a61aa6fae6 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -107,33 +107,32 @@ struct opal_datatype_t { size_t size; /**< total size in bytes of the memory used by the data if the data is put on a contiguous buffer */ OPAL_PTRDIFF_TYPE true_lb; /**< the true lb of the data without user defined lb and ub */ + /* --- cacheline 1 boundary (64 bytes) --- */ OPAL_PTRDIFF_TYPE true_ub; /**< the true ub of the data without user defined lb and ub */ OPAL_PTRDIFF_TYPE lb; /**< lower bound in memory */ OPAL_PTRDIFF_TYPE ub; /**< upper bound in memory */ - /* --- cacheline 1 boundary (64 bytes) --- */ size_t nbElems; /**< total number of elements inside the datatype */ - uint32_t align; /**< data should be aligned to */ /* Attribute fields */ char name[OPAL_MAX_OBJECT_NAME]; /**< name of the datatype */ - /* --- cacheline 2 boundary (128 bytes) was 8-12 bytes ago --- */ + /* --- cacheline 2 boundary (128 bytes) was 40 bytes ago --- */ dt_type_desc_t desc; /**< the data description */ dt_type_desc_t opt_desc; /**< short description of the data used when conversion is useless or in the send case (without conversion) */ + uint32_t align; /**< data should be aligned to */ uint32_t btypes[OPAL_DATATYPE_MAX_SUPPORTED]; /**< basic elements count used to compute the size of the datatype for remote nodes. The length of the array is dependent on the maximum number of datatypes of all top layers. Reason being is that Fortran is not at the OPAL layer. 
*/ - /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */ - + /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */ struct iovec* iov; int iov_count; size_t max_data; - /* size: 372, cachelines: 6, members: 18 */ + /* size: 416, cachelines: 7, members: 18 */ - /* last cacheline: 28-32 bytes */ + /* last cacheline: 32 bytes */ }; typedef struct opal_datatype_t opal_datatype_t; From 04a9785d10e5831fda2e327c3a06b6148f856c39 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 27 Oct 2015 22:01:02 -0400 Subject: [PATCH 037/190] slience warnings --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 17 ++++++++--------- opal/mca/btl/smcuda/btl_smcuda.c | 12 ++++++------ opal/mca/btl/smcuda/btl_smcuda.h | 8 ++++---- opal/mca/btl/smcuda/btl_smcuda_component.c | 14 ++++++++------ opal/mca/common/cuda/common_cuda.h | 4 +--- 5 files changed, 27 insertions(+), 28 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 2550b4c93b1..b8d07483962 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -52,7 +52,7 @@ size_t mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - size_t pipeline_size, int lindex, uint8_t pack_required, uint8_t gpu_device); + int lindex, uint8_t pack_required, int32_t gpu_device); int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -67,7 +67,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_btl, size_t size) { int rc; - int local_device = 0; + int32_t local_device = 0; #if OPAL_CUDA_SUPPORT_41 #if OPAL_CUDA_GDR_SUPPORT /* With some BTLs, switch to RNDV from RGET at large messages */ @@ -91,10 +91,10 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { - opal_output_verbose(0, "Failed to get the GPU device ID, rc=%d", rc); + 
opal_output(0, "Failed to get the GPU device ID, rc= %d\n", rc); return rc; } - mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, -1, 0, local_device); + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, -1, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { @@ -137,10 +137,10 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, assert(lindex >= 0); rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { - opal_output_verbose(0, "Failed to get the GPU device ID, rc=%d", rc); + opal_output(0, "Failed to get the GPU device ID, rc=%d\n", rc); return rc; } - mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, lindex, 1, local_device); + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, lindex, 1, local_device); mca_btl_smcuda_cuda_ddt_pack_clone( bml_btl->btl_endpoint, convertor, NULL, NULL, lindex, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, @@ -223,9 +223,9 @@ size_t mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - size_t pipeline_size, int lindex, uint8_t pack_required, uint8_t gpu_device) + int lindex, uint8_t pack_required, int32_t gpu_device) { - uint32_t i, j; + uint32_t i; for (i = 0; i < num_btls_used; i++) { mca_btl_base_registration_handle_t *handle = rdma_btls[i].btl_reg; mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t *) @@ -235,7 +235,6 @@ int mca_pml_ob1_rdma_cuda_btl_register_data( // mca_common_cuda_geteventhandle(&convertor->pipeline_event[j], j, (mca_mpool_base_registration_t *)cuda_reg); // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); // } - // cuda_reg->data.pipeline_size = pipeline_size; cuda_reg->data.lindex = lindex; 
cuda_reg->data.pack_required = pack_required; cuda_reg->data.gpu_device = gpu_device; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index be8df760f4f..f6e27a7c47c 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -89,7 +89,7 @@ inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_ struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, - int lindex, uint8_t remote_device, uint8_t local_device); + int lindex, int remote_device, int local_device); #endif mca_btl_smcuda_t mca_btl_smcuda = { @@ -1145,8 +1145,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, convertor->flags &= ~CONVERTOR_CUDA; uint8_t pack_required = remote_handle->reg_data.pack_required; uint32_t lindex = remote_handle->reg_data.lindex; - uint8_t remote_device = remote_handle->reg_data.gpu_device; - int32_t local_device = 0; + int remote_device = remote_handle->reg_data.gpu_device; + int local_device = 0; rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); @@ -1384,7 +1384,7 @@ inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_ struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, - int lindex, uint8_t remote_device, uint8_t local_device) + int lindex, int remote_device, int local_device) { cuda_ddt_hdr_t send_msg; mca_btl_smcuda_cuda_ddt_unpack_clone(endpoint, convertor, remote_gpu_address, (mca_btl_base_descriptor_t *)frag, @@ -1435,7 +1435,7 @@ void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, - int lindex, uint8_t remote_device, uint8_t local_device) + int lindex, int remote_device, int local_device) { endpoint->smcuda_ddt_pack_clone[lindex].convertor = convertor; 
endpoint->smcuda_ddt_pack_clone[lindex].current_convertor_pBaseBuf = convertor->pBaseBuf; @@ -1450,7 +1450,7 @@ void mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoi struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, - int lindex, uint8_t remote_device, uint8_t local_device) + int lindex, int remote_device, int local_device) { endpoint->smcuda_ddt_unpack_clone[lindex].convertor = convertor; endpoint->smcuda_ddt_unpack_clone[lindex].current_convertor_pBaseBuf = convertor->pBaseBuf; diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 26dbcb34b2d..ec5cbfa129c 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -541,8 +541,8 @@ typedef struct { unsigned char *current_convertor_pBaseBuf; void *remote_gpu_address; int lindex; - uint8_t remote_device; - uint8_t local_device; + int remote_device; + int local_device; mca_btl_base_descriptor_t *frag; } cuda_ddt_clone_t; @@ -559,12 +559,12 @@ void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, - int lindex, uint8_t remote_device, uint8_t local_device); + int lindex, int remote_device, int local_device); void mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, - int lindex, uint8_t remote_device, uint8_t local_device); + int lindex, int remote_device, int local_device); #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 3d8d01c90a1..183edb8b671 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -888,20 +888,22 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, struct opal_convertor_t 
*convertor = my_cuda_dt_clone->convertor; size_t pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; convertor->flags &= ~CONVERTOR_CUDA; + unsigned char *remote_address = NULL; if (opal_convertor_need_buffers(convertor) == false) { /* do not unpack */ convertor->flags |= CONVERTOR_CUDA; - mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; unsigned char *local_address = my_cuda_dt_clone->current_convertor_pBaseBuf; - opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld\n", local_address, my_cuda_dt_clone->remote_gpu_address+seq*pipeline_size, packed_size); - mca_common_cuda_memp2pcpy(local_address, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); + remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; + opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld\n", local_address, remote_address, packed_size); + mca_common_cuda_memp2pcpy(local_address, (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); my_cuda_dt_clone->current_convertor_pBaseBuf += packed_size; } else { /* unpack */ convertor->flags |= CONVERTOR_CUDA; if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(packed_size, 0); - (*opal_cuda_d2dcpy_async_p)(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); + remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; + (*opal_cuda_d2dcpy_async_p)(convertor->gpu_buffer_ptr, remote_address, packed_size); iov.iov_base = convertor->gpu_buffer_ptr; - opal_output(0, "unpack, start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, packed_size); + opal_output(0, "unpack, start D2D copy src %p, dst %p, size %lu\n", 
remote_address, convertor->gpu_buffer_ptr, packed_size); } else { iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; } @@ -985,7 +987,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, seq = 0; while (rv_dt != 1 && convertor->gpu_buffer_size > 0) { rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - iov.iov_base += mca_btl_smcuda_component.cuda_ddt_pipeline_size; + iov.iov_base = (void*)((unsigned char*)iov.iov_base + mca_btl_smcuda_component.cuda_ddt_pipeline_size); convertor->gpu_buffer_size -= mca_btl_smcuda_component.cuda_ddt_pipeline_size; send_msg.packed_size = max_data; send_msg.seq = seq; diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index d5220052d63..61256fa6809 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -38,11 +38,9 @@ struct mca_mpool_common_cuda_reg_data_t { uint64_t event; opal_ptr_t memh_seg_addr; size_t memh_seg_len; - // uint64_t pipeline_evtHandle[MAX_IPC_EVENT_HANDLE*EVTHANDLE_SIZE]; - size_t pipeline_size; uint32_t lindex; uint8_t pack_required; - uint8_t gpu_device; + int32_t gpu_device; }; typedef struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; From e05edf8420b40dce028338d403f2b08a6c9836e5 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 28 Oct 2015 16:29:28 -0400 Subject: [PATCH 038/190] remove smcuda btl calls from pml ob1 --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 15 +- opal/mca/btl/smcuda/btl_smcuda.c | 155 +++++++++++---------- opal/mca/btl/smcuda/btl_smcuda.h | 29 ++-- opal/mca/btl/smcuda/btl_smcuda_component.c | 33 ++--- opal/mca/btl/smcuda/btl_smcuda_endpoint.h | 5 +- opal/mca/common/cuda/common_cuda.h | 2 +- 6 files changed, 115 insertions(+), 124 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index b8d07483962..9cc3aa94d9b 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -52,7 +52,7 @@ size_t 
mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - int lindex, uint8_t pack_required, int32_t gpu_device); + struct opal_convertor_t *pack_convertor, uint8_t pack_required, int32_t gpu_device); int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -78,6 +78,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, #endif /* OPAL_CUDA_GDR_SUPPORT */ sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; + struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) { unsigned char *base; opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base ); @@ -94,7 +95,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, opal_output(0, "Failed to get the GPU device ID, rc= %d\n", rc); return rc; } - mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, -1, 0, local_device); + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { @@ -115,7 +116,6 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_endpoint_btl = mca_bml_base_btl_array_get_index(&(sendreq->req_endpoint->btl_send), 0); if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && CUDA_DDT_WITH_RDMA) { unsigned char *base; - struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); size_t buffer_size = 0; if (convertor->local_size > bml_btl->btl->btl_cuda_ddt_pipeline_size) { buffer_size = bml_btl->btl->btl_cuda_ddt_pipeline_size * bml_btl->btl->btl_cuda_ddt_pipeline_depth; @@ -133,15 +133,12 @@ int 
mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, sendreq->req_send.req_bytes_packed, sendreq->req_rdma))) { - int lindex = mca_btl_smcuda_alloc_cuda_ddt_pack_clone(bml_btl->btl_endpoint); - assert(lindex >= 0); rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { opal_output(0, "Failed to get the GPU device ID, rc=%d\n", rc); return rc; } - mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, lindex, 1, local_device); - mca_btl_smcuda_cuda_ddt_pack_clone( bml_btl->btl_endpoint, convertor, NULL, NULL, lindex, 0, local_device); + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, 1, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); @@ -223,7 +220,7 @@ size_t mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - int lindex, uint8_t pack_required, int32_t gpu_device) + struct opal_convertor_t *pack_convertor, uint8_t pack_required, int32_t gpu_device) { uint32_t i; for (i = 0; i < num_btls_used; i++) { @@ -235,9 +232,9 @@ int mca_pml_ob1_rdma_cuda_btl_register_data( // mca_common_cuda_geteventhandle(&convertor->pipeline_event[j], j, (mca_mpool_base_registration_t *)cuda_reg); // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); // } - cuda_reg->data.lindex = lindex; cuda_reg->data.pack_required = pack_required; cuda_reg->data.gpu_device = gpu_device; + cuda_reg->data.pack_convertor = pack_convertor; } return 0; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index f6e27a7c47c..2d015ad11fb 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -86,7 +86,8 @@ static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl, inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t 
*endpoint, - struct opal_convertor_t *convertor, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, int lindex, int remote_device, int local_device); @@ -500,9 +501,13 @@ create_sm_endpoint(int local_proc, struct opal_proc_t *proc) ep->mpool = mca_mpool_base_module_create("rgpusm", NULL, &resources); - for (int i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - ep->smcuda_ddt_pack_clone[i].lindex = -1; - ep->smcuda_ddt_unpack_clone[i].lindex = -1; + /* alloc array for pack/unpack use */ + ep->smcuda_ddt_clone = NULL; + ep->smcuda_ddt_clone = (cuda_ddt_clone_t *)malloc(sizeof(cuda_ddt_clone_t) * SMCUDA_DT_CLONE_SIZE); + ep->smcuda_ddt_clone_size = SMCUDA_DT_CLONE_SIZE; + ep->smcuda_ddt_clone_avail = SMCUDA_DT_CLONE_SIZE; + for (int i = 0; i < ep->smcuda_ddt_clone_size; i++) { + ep->smcuda_ddt_clone[i].lindex = -1; } } #endif /* OPAL_CUDA_SUPPORT */ @@ -709,6 +714,15 @@ int mca_btl_smcuda_del_procs( struct opal_proc_t **procs, struct mca_btl_base_endpoint_t **peers) { + int32_t proc; + struct mca_btl_base_endpoint_t * ep; + for (proc = 0; proc < (int32_t)nprocs; proc++) { + ep = peers[proc]; + if (ep->smcuda_ddt_clone != NULL) { + free(ep->smcuda_ddt_clone); + ep->smcuda_ddt_clone = NULL; + } + } return OPAL_SUCCESS; } @@ -1138,32 +1152,34 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, mca_pml_ob1_rdma_frag_t *frag_ob1 = cbdata; mca_bml_base_btl_t *bml_btl = frag_ob1->rdma_bml; mca_pml_base_request_t *req = (mca_pml_base_request_t*) frag_ob1->rdma_req; - opal_convertor_t* convertor = &req->req_convertor; + opal_convertor_t* unpack_convertor = &req->req_convertor; - if ((convertor->flags & CONVERTOR_CUDA) && + if ((unpack_convertor->flags & CONVERTOR_CUDA) && (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) { - convertor->flags &= ~CONVERTOR_CUDA; + unpack_convertor->flags &= ~CONVERTOR_CUDA; uint8_t pack_required = remote_handle->reg_data.pack_required; - uint32_t 
lindex = remote_handle->reg_data.lindex; + int lindex = -1; int remote_device = remote_handle->reg_data.gpu_device; + opal_convertor_t* pack_convertor = remote_handle->reg_data.pack_convertor; int local_device = 0; rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); return rc; } - if(opal_convertor_need_buffers(convertor) == true) { - convertor->flags |= CONVERTOR_CUDA; + if(opal_convertor_need_buffers(unpack_convertor) == true) { + unpack_convertor->flags |= CONVERTOR_CUDA; - printf("local addr %p, pbase %p\n", local_address, convertor->pBaseBuf); + printf("local addr %p, pbase %p\n", local_address, unpack_convertor->pBaseBuf); if (remote_device != local_device && !OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { - convertor->gpu_buffer_ptr = NULL; + unpack_convertor->gpu_buffer_ptr = NULL; } else { - convertor->gpu_buffer_ptr = remote_memory_address; + unpack_convertor->gpu_buffer_ptr = remote_memory_address; } if (pack_required) { - mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex = mca_btl_smcuda_alloc_cuda_ddt_clone(ep); + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); done = 0; } else { @@ -1171,40 +1187,42 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, uint32_t iov_count = 1; size_t max_data; if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && remote_device != local_device) { - convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(size, 0); - (*opal_cuda_d2dcpy_async_p)(convertor->gpu_buffer_ptr, remote_memory_address, size); - iov.iov_base = convertor->gpu_buffer_ptr; - opal_output(0, "start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, convertor->gpu_buffer_ptr, size); + unpack_convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(size, 0); + 
(*opal_cuda_d2dcpy_async_p)(unpack_convertor->gpu_buffer_ptr, remote_memory_address, size); + iov.iov_base = unpack_convertor->gpu_buffer_ptr; + opal_output(0, "start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, unpack_convertor->gpu_buffer_ptr, size); } else { - iov.iov_base = convertor->gpu_buffer_ptr; + iov.iov_base = unpack_convertor->gpu_buffer_ptr; } iov.iov_len = size; max_data = size; - opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); - opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + opal_convertor_unpack(unpack_convertor, &iov, &iov_count, &max_data ); + opal_cuda_free_gpu_buffer_p(unpack_convertor->gpu_buffer_ptr, 0); done = 1; } } else { - convertor->flags |= CONVERTOR_CUDA; + unpack_convertor->flags |= CONVERTOR_CUDA; if (pack_required) { + lindex = mca_btl_smcuda_alloc_cuda_ddt_clone(ep); if (remote_device == local_device || OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { /* now we are able to let sender pack directly to my memory */ mca_mpool_common_cuda_reg_t loc_reg; mca_mpool_common_cuda_reg_t *loc_reg_ptr = &loc_reg; cuda_ddt_put_hdr_t put_msg; if (OPAL_SUCCESS != cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL)) { - mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); } memcpy(put_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); put_msg.remote_address = local_address; put_msg.remote_base = loc_reg.base.base; put_msg.lindex = lindex; - mca_btl_smcuda_cuda_ddt_unpack_clone(ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + put_msg.pack_convertor = pack_convertor; + mca_btl_smcuda_cuda_ddt_clone(ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, 
lindex, 0, 0); mca_btl_smcuda_send_cuda_put_sig(btl, ep, &put_msg); } else { - mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); } done = 0; @@ -1381,84 +1399,67 @@ int mca_btl_smcuda_send_cuda_put_sig(struct mca_btl_base_module_t* btl, inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, int lindex, int remote_device, int local_device) { cuda_ddt_hdr_t send_msg; - mca_btl_smcuda_cuda_ddt_unpack_clone(endpoint, convertor, remote_gpu_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_ddt_clone(endpoint, pack_convertor, unpack_convertor, remote_gpu_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); send_msg.lindex = lindex; send_msg.packed_size = 0; send_msg.seq = 0; send_msg.msg_type = CUDA_DDT_PACK_START; + send_msg.pack_convertor = pack_convertor; opal_output(0, "smcuda btl start pack, remote_gpu_address %p, frag %p, lindex %d, remote_device %d, local_device %d\n", (void*)remote_gpu_address, (void*)frag, lindex, remote_device, local_device); mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); return OPAL_SUCCESS; } -int mca_btl_smcuda_alloc_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint) -{ - int i; - for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - if (endpoint->smcuda_ddt_pack_clone[i].lindex == -1) { - return i; - } - } - return -1; -} -int mca_btl_smcuda_alloc_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint) +int mca_btl_smcuda_alloc_cuda_ddt_clone(struct mca_btl_base_endpoint_t 
*endpoint) { int i; - for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - if (endpoint->smcuda_ddt_unpack_clone[i].lindex == -1) { - return i; + if (endpoint->smcuda_ddt_clone_avail > 0) { + for (i = 0; i < endpoint->smcuda_ddt_clone_size; i++) { + if (endpoint->smcuda_ddt_clone[i].lindex == -1) { + endpoint->smcuda_ddt_clone_avail --; + opal_output(0, "Alloc cuda ddt clone array success, lindex %d\n",i); + return i; + } } + } else { + endpoint->smcuda_ddt_clone = realloc(endpoint->smcuda_ddt_clone, endpoint->smcuda_ddt_clone_size + SMCUDA_DT_CLONE_SIZE); + endpoint->smcuda_ddt_clone_avail = SMCUDA_DT_CLONE_SIZE - 1; + endpoint->smcuda_ddt_clone_size += SMCUDA_DT_CLONE_SIZE; + return endpoint->smcuda_ddt_clone_size - SMCUDA_DT_CLONE_SIZE; } - return -1; -} - -void mca_btl_smcuda_free_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) -{ - assert(endpoint->smcuda_ddt_pack_clone[lindex].lindex == lindex); - endpoint->smcuda_ddt_pack_clone[lindex].lindex = -1; -} -void mca_btl_smcuda_free_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) -{ - assert(endpoint->smcuda_ddt_unpack_clone[lindex].lindex == lindex); - endpoint->smcuda_ddt_unpack_clone[lindex].lindex = -1; } -void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - int lindex, int remote_device, int local_device) +void mca_btl_smcuda_free_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) { - endpoint->smcuda_ddt_pack_clone[lindex].convertor = convertor; - endpoint->smcuda_ddt_pack_clone[lindex].current_convertor_pBaseBuf = convertor->pBaseBuf; - endpoint->smcuda_ddt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_ddt_pack_clone[lindex].lindex = lindex; - endpoint->smcuda_ddt_pack_clone[lindex].remote_device = remote_device; - endpoint->smcuda_ddt_pack_clone[lindex].local_device = 
local_device; - endpoint->smcuda_ddt_pack_clone[lindex].frag = frag; + assert(endpoint->smcuda_ddt_clone[lindex].lindex == lindex); + endpoint->smcuda_ddt_clone[lindex].lindex = -1; + endpoint->smcuda_ddt_clone_avail ++; } -void mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - int lindex, int remote_device, int local_device) +void mca_btl_smcuda_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, int remote_device, int local_device) { - endpoint->smcuda_ddt_unpack_clone[lindex].convertor = convertor; - endpoint->smcuda_ddt_unpack_clone[lindex].current_convertor_pBaseBuf = convertor->pBaseBuf; - endpoint->smcuda_ddt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_ddt_unpack_clone[lindex].lindex = lindex; - endpoint->smcuda_ddt_unpack_clone[lindex].remote_device = remote_device; - endpoint->smcuda_ddt_unpack_clone[lindex].local_device = local_device; - endpoint->smcuda_ddt_unpack_clone[lindex].frag = frag; + endpoint->smcuda_ddt_clone[lindex].pack_convertor = pack_convertor; + endpoint->smcuda_ddt_clone[lindex].unpack_convertor = unpack_convertor; + endpoint->smcuda_ddt_clone[lindex].current_unpack_convertor_pBaseBuf = unpack_convertor->pBaseBuf; + endpoint->smcuda_ddt_clone[lindex].remote_gpu_address = remote_gpu_address; + endpoint->smcuda_ddt_clone[lindex].lindex = lindex; + endpoint->smcuda_ddt_clone[lindex].remote_device = remote_device; + endpoint->smcuda_ddt_clone[lindex].local_device = local_device; + endpoint->smcuda_ddt_clone[lindex].frag = frag; } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index ec5cbfa129c..8305029d79e 100644 --- 
a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -517,6 +517,7 @@ typedef struct { int seq; int msg_type; int packed_size; + struct opal_convertor_t *pack_convertor; } cuda_ddt_hdr_t; /* cuda datatype put message */ @@ -525,6 +526,7 @@ typedef struct { void *remote_address; void *remote_base; uint64_t mem_handle[8]; + struct opal_convertor_t *pack_convertor; } cuda_ddt_put_hdr_t; #define CUDA_DDT_UNPACK_FROM_BLOCK 0 @@ -537,8 +539,9 @@ typedef struct { /* package save pack/unpack convertor and cbfunc */ typedef struct { - struct opal_convertor_t *convertor; - unsigned char *current_convertor_pBaseBuf; + struct opal_convertor_t *pack_convertor; + struct opal_convertor_t *unpack_convertor; + unsigned char *current_unpack_convertor_pBaseBuf; void *remote_gpu_address; int lindex; int remote_device; @@ -551,20 +554,14 @@ typedef struct { int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_hdr_t *send_msg); int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_hdr_t *send_msg); int mca_btl_smcuda_send_cuda_put_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_put_hdr_t *put_msg); -int mca_btl_smcuda_alloc_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint); -int mca_btl_smcuda_alloc_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint); -void mca_btl_smcuda_free_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); -void mca_btl_smcuda_free_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); -void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - int lindex, int remote_device, int local_device); -void mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t 
*endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - int lindex, int remote_device, int local_device); +int mca_btl_smcuda_alloc_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint); +void mca_btl_smcuda_free_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); +void mca_btl_smcuda_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, int remote_device, int local_device); #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 183edb8b671..c7bdb40c028 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -856,7 +856,7 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata) { - struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_base_endpoint_t *endpoint = NULL; cuda_ddt_hdr_t recv_msg; mca_btl_base_segment_t* segments = des->des_segments; memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_hdr_t)); @@ -869,33 +869,34 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; - my_cuda_dt_clone = &endpoint->smcuda_ddt_unpack_clone[lindex]; + my_cuda_dt_clone = &endpoint->smcuda_ddt_clone[lindex]; assert(my_cuda_dt_clone->lindex == lindex); cuda_ddt_hdr_t send_msg; send_msg.lindex = lindex; + send_msg.pack_convertor = my_cuda_dt_clone->pack_convertor; if (msg_type == CUDA_DDT_CLEANUP) { mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; mca_btl_base_rdma_completion_fn_t cbfunc = 
(mca_btl_base_rdma_completion_fn_t) frag_recv->base.des_cbfunc; cbfunc (btl, endpoint, frag_recv->segment.seg_addr.pval, frag_recv->local_handle, frag_recv->base.des_context, frag_recv->base.des_cbdata, OPAL_SUCCESS); mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag_recv); - mca_btl_smcuda_free_cuda_ddt_unpack_clone(endpoint, lindex); + mca_btl_smcuda_free_cuda_ddt_clone(endpoint, lindex); } else if (msg_type == CUDA_DDT_UNPACK_FROM_BLOCK || msg_type == CUDA_DDT_COMPLETE){ struct iovec iov; uint32_t iov_count = 1; size_t max_data; - struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; + struct opal_convertor_t *convertor = my_cuda_dt_clone->unpack_convertor; size_t pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; convertor->flags &= ~CONVERTOR_CUDA; unsigned char *remote_address = NULL; if (opal_convertor_need_buffers(convertor) == false) { /* do not unpack */ convertor->flags |= CONVERTOR_CUDA; - unsigned char *local_address = my_cuda_dt_clone->current_convertor_pBaseBuf; + unsigned char *local_address = my_cuda_dt_clone->current_unpack_convertor_pBaseBuf; remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld\n", local_address, remote_address, packed_size); mca_common_cuda_memp2pcpy(local_address, (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); - my_cuda_dt_clone->current_convertor_pBaseBuf += packed_size; + my_cuda_dt_clone->current_unpack_convertor_pBaseBuf += packed_size; } else { /* unpack */ convertor->flags |= CONVERTOR_CUDA; if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { @@ -932,27 +933,25 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata) { - struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_base_endpoint_t 
*endpoint = NULL; cuda_ddt_hdr_t recv_msg; mca_btl_base_segment_t* segments = des->des_segments; memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_hdr_t)); int seq = recv_msg.seq; int lindex = recv_msg.lindex; int msg_type = recv_msg.msg_type; + struct opal_convertor_t *convertor = recv_msg.pack_convertor; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; - cuda_ddt_clone_t *my_cuda_dt_clone; cuda_ddt_hdr_t send_msg; + /* We can find the endoint back from the rank embedded in the header */ + endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; + uint32_t iov_count = 1; int rv_dt = 0; size_t max_data = 0; size_t packed_size = 0; - /* We can find the endoint back from the rank embedded in the header */ - endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; - my_cuda_dt_clone = &endpoint->smcuda_ddt_pack_clone[lindex]; - - struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; send_msg.lindex = lindex; if (msg_type == CUDA_DDT_COMPLETE_ACK) { send_msg.packed_size = 0; @@ -963,7 +962,6 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); convertor->gpu_buffer_ptr = NULL; } - mca_btl_smcuda_free_cuda_ddt_pack_clone(endpoint, lindex); } else if (msg_type == CUDA_DDT_PACK_TO_BLOCK) { if (convertor->bConverted < convertor->local_size) { struct iovec iov; @@ -1009,21 +1007,19 @@ static void btl_smcuda_datatype_put(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata) { - struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_base_endpoint_t *endpoint = NULL; cuda_ddt_put_hdr_t recv_msg; mca_btl_base_segment_t* segments = des->des_segments; memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_put_hdr_t)); int lindex = recv_msg.lindex; void *remote_address = recv_msg.remote_address; void *remote_base = recv_msg.remote_base; + struct opal_convertor_t *convertor = recv_msg.pack_convertor; 
mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; - cuda_ddt_clone_t *my_cuda_dt_clone; cuda_ddt_hdr_t send_msg; /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; - my_cuda_dt_clone = &endpoint->smcuda_ddt_pack_clone[lindex]; - struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; @@ -1051,7 +1047,6 @@ static void btl_smcuda_datatype_put(mca_btl_base_module_t* btl, send_msg.seq = -2; send_msg.msg_type = CUDA_DDT_CLEANUP; mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); - mca_btl_smcuda_free_cuda_ddt_pack_clone(endpoint, lindex); } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h index f3b79866c14..20936dbeac1 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h +++ b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h @@ -49,8 +49,9 @@ struct mca_btl_base_endpoint_t { opal_proc_t *proc_opal; /**< Needed for adding CUDA IPC support dynamically */ enum ipcState ipcstate; /**< CUDA IPC connection status */ int ipctries; /**< Number of times CUDA IPC connect was sent */ - cuda_ddt_clone_t smcuda_ddt_pack_clone[SMCUDA_DT_CLONE_SIZE]; - cuda_ddt_clone_t smcuda_ddt_unpack_clone[SMCUDA_DT_CLONE_SIZE]; + cuda_ddt_clone_t *smcuda_ddt_clone; + int smcuda_ddt_clone_size; + int smcuda_ddt_clone_avail; #endif /* OPAL_CUDA_SUPPORT */ }; diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index 61256fa6809..9adda6dc82f 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -38,9 +38,9 @@ struct mca_mpool_common_cuda_reg_data_t { uint64_t event; opal_ptr_t memh_seg_addr; size_t memh_seg_len; - uint32_t lindex; uint8_t pack_required; int32_t gpu_device; + struct opal_convertor_t *pack_convertor; }; typedef 
struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; From c13df8c92285e4b604cb61e9818e8b8b24898ab5 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 28 Oct 2015 16:33:31 -0400 Subject: [PATCH 039/190] this file is not used anymore --- .../cuda/opal_datatype_orig_internal.h | 645 ------------------ 1 file changed, 645 deletions(-) delete mode 100644 opal/datatype/cuda/opal_datatype_orig_internal.h diff --git a/opal/datatype/cuda/opal_datatype_orig_internal.h b/opal/datatype/cuda/opal_datatype_orig_internal.h deleted file mode 100644 index 4dde12d235d..00000000000 --- a/opal/datatype/cuda/opal_datatype_orig_internal.h +++ /dev/null @@ -1,645 +0,0 @@ -#ifndef OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED -#define OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED - -#include - -#include "opal_config.h" - -#define OPAL_PTRDIFF_TYPE ptrdiff_t -#define DT_STATIC_STACK_SIZE 5 /**< This should be sufficient for most applications */ - -#if OPAL_ENABLE_DEBUG -/* Any kind of unique ID should do the job */ -#define OPAL_OBJ_MAGIC_ID ((0xdeafbeedULL << 32) + 0xdeafbeedULL) -#endif - -/* keep the last 16 bits free for data flags */ -#define CONVERTOR_DATATYPE_MASK 0x0000FFFF -#define CONVERTOR_SEND_CONVERSION 0x00010000 -#define CONVERTOR_RECV 0x00020000 -#define CONVERTOR_SEND 0x00040000 -#define CONVERTOR_HOMOGENEOUS 0x00080000 -#define CONVERTOR_NO_OP 0x00100000 -#define CONVERTOR_WITH_CHECKSUM 0x00200000 -#define CONVERTOR_CUDA 0x00400000 -#define CONVERTOR_CUDA_ASYNC 0x00800000 -#define CONVERTOR_TYPE_MASK 0x00FF0000 -#define CONVERTOR_STATE_START 0x01000000 -#define CONVERTOR_STATE_COMPLETE 0x02000000 -#define CONVERTOR_STATE_ALLOC 0x04000000 -#define CONVERTOR_COMPLETED 0x08000000 - -#define OPAL_DATATYPE_LOOP 0 -#define OPAL_DATATYPE_END_LOOP 1 -#define OPAL_DATATYPE_LB 2 -#define OPAL_DATATYPE_UB 3 -#define OPAL_DATATYPE_FIRST_TYPE 4 /* Number of first real type */ -#define OPAL_DATATYPE_INT1 4 -#define OPAL_DATATYPE_INT2 5 -#define 
OPAL_DATATYPE_INT4 6 -#define OPAL_DATATYPE_INT8 7 -#define OPAL_DATATYPE_INT16 8 -#define OPAL_DATATYPE_UINT1 9 -#define OPAL_DATATYPE_UINT2 10 -#define OPAL_DATATYPE_UINT4 11 -#define OPAL_DATATYPE_UINT8 12 -#define OPAL_DATATYPE_UINT16 13 -#define OPAL_DATATYPE_FLOAT2 14 -#define OPAL_DATATYPE_FLOAT4 15 -#define OPAL_DATATYPE_FLOAT8 16 -#define OPAL_DATATYPE_FLOAT12 17 -#define OPAL_DATATYPE_FLOAT16 18 -#define OPAL_DATATYPE_FLOAT_COMPLEX 19 -#define OPAL_DATATYPE_DOUBLE_COMPLEX 20 -#define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX 21 -#define OPAL_DATATYPE_BOOL 22 -#define OPAL_DATATYPE_WCHAR 23 -#define OPAL_DATATYPE_UNAVAILABLE 24 - -/* flags for the datatypes. */ -#define OPAL_DATATYPE_FLAG_UNAVAILABLE 0x0001 /**< datatypes unavailable on the build (OS or compiler dependant) */ -#define OPAL_DATATYPE_FLAG_PREDEFINED 0x0002 /**< cannot be removed: initial and predefined datatypes */ -#define OPAL_DATATYPE_FLAG_COMMITED 0x0004 /**< ready to be used for a send/recv operation */ -#define OPAL_DATATYPE_FLAG_OVERLAP 0x0008 /**< datatype is unpropper for a recv operation */ -#define OPAL_DATATYPE_FLAG_CONTIGUOUS 0x0010 /**< contiguous datatype */ -#define OPAL_DATATYPE_FLAG_NO_GAPS 0x0020 /**< no gaps around the datatype, aka OPAL_DATATYPE_FLAG_CONTIGUOUS and extent == size */ -#define OPAL_DATATYPE_FLAG_USER_LB 0x0040 /**< has a user defined LB */ -#define OPAL_DATATYPE_FLAG_USER_UB 0x0080 /**< has a user defined UB */ -#define OPAL_DATATYPE_FLAG_DATA 0x0100 /**< data or control structure */ -/* - * We should make the difference here between the predefined contiguous and non contiguous - * datatypes. The OPAL_DATATYPE_FLAG_BASIC is held by all predefined contiguous datatypes. 
- */ -#define OPAL_DATATYPE_FLAG_BASIC (OPAL_DATATYPE_FLAG_PREDEFINED | \ - OPAL_DATATYPE_FLAG_CONTIGUOUS | \ - OPAL_DATATYPE_FLAG_NO_GAPS | \ - OPAL_DATATYPE_FLAG_DATA | \ - OPAL_DATATYPE_FLAG_COMMITED) - -/* typedefs ***********************************************************/ - -typedef struct opal_object_t opal_object_t; -typedef struct opal_class_t opal_class_t; -typedef void (*opal_construct_t) (opal_object_t *); -typedef void (*opal_destruct_t) (opal_object_t *); - - -/* types **************************************************************/ - -/** -* Class descriptor. -* -* There should be a single instance of this descriptor for each class -* definition. -*/ -struct opal_class_t { - const char *cls_name; /**< symbolic name for class */ - opal_class_t *cls_parent; /**< parent class descriptor */ - opal_construct_t cls_construct; /**< class constructor */ - opal_destruct_t cls_destruct; /**< class destructor */ - int cls_initialized; /**< is class initialized */ - int cls_depth; /**< depth of class hierarchy tree */ - opal_construct_t *cls_construct_array; - /**< array of parent class constructors */ - opal_destruct_t *cls_destruct_array; - /**< array of parent class destructors */ - size_t cls_sizeof; /**< size of an object instance */ -}; - -/** - * Base object. - * - * This is special and does not follow the pattern for other classes. 
- */ -struct opal_object_t { -#if OPAL_ENABLE_DEBUG - /** Magic ID -- want this to be the very first item in the - struct's memory */ - uint64_t obj_magic_id; -#endif - opal_class_t *obj_class; /**< class descriptor */ - volatile int32_t obj_reference_count; /**< reference count */ -#if OPAL_ENABLE_DEBUG - const char* cls_init_file_name; /**< In debug mode store the file where the object get contructed */ - int cls_init_lineno; /**< In debug mode store the line number where the object get contructed */ -#endif /* OPAL_ENABLE_DEBUG */ -}; - -/** - * Declaration for class descriptor - * - * @param NAME Name of class - * - * Put this in NAME.h - */ -#define OBJ_CLASS_DECLARATION(NAME) \ - extern opal_class_t NAME ## _class - -/** - * Return a pointer to the class descriptor associated with a - * class type. - * - * @param NAME Name of class - * @return Pointer to class descriptor - */ -#define OBJ_CLASS(NAME) (&(NAME ## _class)) - -/** - * For static initializations of OBJects. - * - * @param NAME Name of the class to initialize - */ -#if OPAL_ENABLE_DEBUG -#define OPAL_OBJ_STATIC_INIT(BASE_CLASS) { OPAL_OBJ_MAGIC_ID, OBJ_CLASS(BASE_CLASS), 1, __FILE__, __LINE__ } -#else -#define OPAL_OBJ_STATIC_INIT(BASE_CLASS) { OBJ_CLASS(BASE_CLASS), 1 } -#endif - - - -struct ddt_elem_id_description { - uint16_t flags; /**< flags for the record */ - uint16_t type; /**< the basic data type id */ -}; -typedef struct ddt_elem_id_description ddt_elem_id_description; - -/* the basic element. A data description is composed - * by a set of basic elements. 
- */ -struct ddt_elem_desc { - ddt_elem_id_description common; /**< basic data description and flags */ - uint32_t count; /**< number of blocks */ - uint32_t blocklen; /**< number of elements on each block */ - OPAL_PTRDIFF_TYPE extent; /**< extent of each block (in bytes) */ - OPAL_PTRDIFF_TYPE disp; /**< displacement of the first block */ -}; -typedef struct ddt_elem_desc ddt_elem_desc_t; - -struct ddt_loop_desc { - ddt_elem_id_description common; /**< basic data description and flags */ - uint32_t loops; /**< number of elements */ - uint32_t items; /**< number of items in the loop */ - size_t unused; /**< not used right now */ - OPAL_PTRDIFF_TYPE extent; /**< extent of the whole loop */ -}; -typedef struct ddt_loop_desc ddt_loop_desc_t; - -struct ddt_endloop_desc { - ddt_elem_id_description common; /**< basic data description and flags */ - uint32_t items; /**< number of elements */ - uint32_t unused; /**< not used right now */ - size_t size; /**< real size of the data in the loop */ - OPAL_PTRDIFF_TYPE first_elem_disp; /**< the displacement of the first block in the loop */ -}; -typedef struct ddt_endloop_desc ddt_endloop_desc_t; - -union dt_elem_desc { - ddt_elem_desc_t elem; - ddt_loop_desc_t loop; - ddt_endloop_desc_t end_loop; -}; -typedef union dt_elem_desc dt_elem_desc_t; - -/* dt_type_description */ -typedef uint32_t opal_datatype_count_t; - -struct dt_type_desc_t { - opal_datatype_count_t length; /**< the maximum number of elements in the description array */ - opal_datatype_count_t used; /**< the number of used elements in the description array */ - dt_elem_desc_t* desc; -}; -typedef struct dt_type_desc_t dt_type_desc_t; - -/* - * The datatype description. - */ -#define OPAL_DATATYPE_MAX_PREDEFINED 25 -#define OPAL_DATATYPE_MAX_SUPPORTED 47 -#define OPAL_MAX_OBJECT_NAME 64 - -struct opal_datatype_t { - opal_object_t super; /**< basic superclass */ - uint16_t flags; /**< the flags */ - uint16_t id; /**< data id, normally the index in the data array. 
*/ - uint32_t bdt_used; /**< bitset of which basic datatypes are used in the data description */ - size_t size; /**< total size in bytes of the memory used by the data if - the data is put on a contiguous buffer */ - OPAL_PTRDIFF_TYPE true_lb; /**< the true lb of the data without user defined lb and ub */ - OPAL_PTRDIFF_TYPE true_ub; /**< the true ub of the data without user defined lb and ub */ - OPAL_PTRDIFF_TYPE lb; /**< lower bound in memory */ - OPAL_PTRDIFF_TYPE ub; /**< upper bound in memory */ - /* --- cacheline 1 boundary (64 bytes) --- */ - size_t nbElems; /**< total number of elements inside the datatype */ - uint32_t align; /**< data should be aligned to */ - - /* Attribute fields */ - char name[OPAL_MAX_OBJECT_NAME]; /**< name of the datatype */ - /* --- cacheline 2 boundary (128 bytes) was 8-12 bytes ago --- */ - dt_type_desc_t desc; /**< the data description */ - dt_type_desc_t opt_desc; /**< short description of the data used when conversion is useless - or in the send case (without conversion) */ - - uint32_t btypes[OPAL_DATATYPE_MAX_SUPPORTED]; - /**< basic elements count used to compute the size of the - datatype for remote nodes. The length of the array is dependent on - the maximum number of datatypes of all top layers. - Reason being is that Fortran is not at the OPAL layer. 
*/ - /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */ - - /* size: 352, cachelines: 6, members: 15 */ - /* last cacheline: 28-32 bytes */ -}; - -typedef struct opal_datatype_t opal_datatype_t; - -OPAL_DECLSPEC OBJ_CLASS_DECLARATION( opal_datatype_t ); - -/* convertor and stack */ -typedef struct opal_convertor_t opal_convertor_t; - -typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); -typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata ); -typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n, opal_convertor_t* pConvertor ); - -/* The master convertor struct (defined in convertor_internal.h) */ -struct opal_convertor_master_t; - -struct dt_stack_t { - int32_t index; /**< index in the element description */ - int16_t type; /**< the type used for the last pack/unpack (original or OPAL_DATATYPE_UINT1) */ - size_t count; /**< number of times we still have to do it */ - OPAL_PTRDIFF_TYPE disp; /**< actual displacement depending on the count field */ -}; -typedef struct dt_stack_t dt_stack_t; - -typedef int32_t (*conversion_fct_t)( opal_convertor_t* pConvertor, uint32_t count, - const void* from, size_t from_len, OPAL_PTRDIFF_TYPE from_extent, - void* to, size_t to_length, OPAL_PTRDIFF_TYPE to_extent, - OPAL_PTRDIFF_TYPE *advance ); - -typedef struct opal_convertor_master_t { - struct opal_convertor_master_t* next; - uint32_t remote_arch; - uint32_t flags; - uint32_t hetero_mask; - const size_t remote_sizes[OPAL_DATATYPE_MAX_PREDEFINED]; - conversion_fct_t* pFunctions; /**< the convertor functions pointer */ -} opal_convertor_master_t; - -#define MAX_IPC_EVENT_HANDLE 10 - -struct opal_convertor_t { - opal_object_t super; /**< basic superclass */ - uint32_t remoteArch; /**< the remote architecture */ - uint32_t flags; /**< the properties of this convertor */ - size_t local_size; /**< overall length data on local machine, compared to bConverted 
*/ - size_t remote_size; /**< overall length data on remote machine, compared to bConverted */ - const opal_datatype_t* pDesc; /**< the datatype description associated with the convertor */ - const dt_type_desc_t* use_desc; /**< the version used by the convertor (normal or optimized) */ - opal_datatype_count_t count; /**< the total number of full datatype elements */ - uint32_t stack_size; /**< size of the allocated stack */ - /* --- cacheline 1 boundary (64 bytes) --- */ - unsigned char* pBaseBuf; /**< initial buffer as supplied by the user */ - dt_stack_t* pStack; /**< the local stack for the actual conversion */ - convertor_advance_fct_t fAdvance; /**< pointer to the pack/unpack functions */ - struct opal_convertor_master_t* master; /**< the master convertor */ - - /* All others fields get modified for every call to pack/unpack functions */ - uint32_t stack_pos; /**< the actual position on the stack */ - uint32_t partial_length; /**< amount of data left over from the last unpack */ - size_t bConverted; /**< # of bytes already converted */ - uint32_t checksum; /**< checksum computed by pack/unpack operation */ - uint32_t csum_ui1; /**< partial checksum computed by pack/unpack operation */ - size_t csum_ui2; /**< partial checksum computed by pack/unpack operation */ - /* --- cacheline 2 boundary (128 bytes) --- */ - dt_stack_t static_stack[DT_STATIC_STACK_SIZE]; /**< local stack for small datatypes */ - /* --- cacheline 3 boundary (192 bytes) was 56 bytes ago --- */ - -#if OPAL_CUDA_SUPPORT - memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ - void * stream; /**< CUstream for async copy */ - - unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ - uint64_t * pipeline_event[MAX_IPC_EVENT_HANDLE]; /**< cuda event for pipeline */ -#endif - /* size: 248, cachelines: 4, members: 20 */ - /* last cacheline: 56 bytes */ -}; - -struct iovec { - void *iov_base; /* Starting address */ - size_t iov_len; /* Length in bytes */ -}; - - -OPAL_DECLSPEC extern 
union dt_elem_desc opal_datatype_predefined_elem_desc[2 * OPAL_DATATYPE_MAX_PREDEFINED]; - -#define OPAL_DATATYPE_INIT_BTYPES_ARRAY_UNAVAILABLE { 0 } -#define OPAL_DATATYPE_INIT_BTYPES_ARRAY(NAME) { [OPAL_DATATYPE_ ## NAME] = 1 } - -#define OPAL_DATATYPE_INIT_NAME(NAME) "OPAL_" #NAME - -/* - * Macro to initialize the main description for basic types, setting the pointer - * into the array opal_datatype_predefined_type_desc, which is initialized at - * runtime in opal_datatype_init(). Each basic type has two desc-elements.... - */ -#define OPAL_DATATYPE_INIT_DESC_PREDEFINED(NAME) \ - { \ - .length = 1, .used = 1, \ - .desc = &(opal_datatype_predefined_elem_desc[2 * OPAL_DATATYPE_ ## NAME]) \ - } -#define OPAL_DATATYPE_INIT_DESC_NULL {.length = 0, .used = 0, .desc = NULL} - -#define OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( NAME, FLAGS ) \ - { \ - .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ - .flags = OPAL_DATATYPE_FLAG_UNAVAILABLE | OPAL_DATATYPE_FLAG_PREDEFINED | (FLAGS), \ - .id = OPAL_DATATYPE_ ## NAME, \ - .bdt_used = 0, \ - .size = 0, \ - .true_lb = 0, .true_ub = 0, .lb = 0, .ub = 0, \ - .align = 0, \ - .nbElems = 1, \ - .name = OPAL_DATATYPE_INIT_NAME(NAME), \ - .desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(UNAVAILABLE), \ - .opt_desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(UNAVAILABLE), \ - .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY_UNAVAILABLE \ - } - -#define OPAL_DATATYPE_INITIALIZER_EMPTY( FLAGS ) \ - { \ - .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ - .flags = OPAL_DATATYPE_FLAG_PREDEFINED | (FLAGS), \ - .id = 0, \ - .bdt_used = 0, \ - .size = 0, \ - .true_lb = 0, .true_ub = 0, .lb = 0, .ub = 0, \ - .align = 0, \ - .nbElems = 1, \ - .name = OPAL_DATATYPE_INIT_NAME(EMPTY), \ - .desc = OPAL_DATATYPE_INIT_DESC_NULL, \ - .opt_desc = OPAL_DATATYPE_INIT_DESC_NULL, \ - .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY_UNAVAILABLE \ - } - -#define OPAL_DATATYPE_INIT_BASIC_TYPE( TYPE, NAME, FLAGS ) \ - { \ - .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ - 
.flags = OPAL_DATATYPE_FLAG_PREDEFINED | (FLAGS), \ - .id = TYPE, \ - .bdt_used = (((uint32_t)1)<<(TYPE)), \ - .size = 0, \ - .true_lb = 0, .true_ub = 0, .lb = 0, .ub = 0, \ - .align = 0, \ - .nbElems = 1, \ - .name = OPAL_DATATYPE_INIT_NAME(NAME), \ - .desc = OPAL_DATATYPE_INIT_DESC_NULL, \ - .opt_desc = OPAL_DATATYPE_INIT_DESC_NULL, \ - .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY(NAME) \ - } - -#define OPAL_DATATYPE_INIT_BASIC_DATATYPE( TYPE, ALIGN, NAME, FLAGS ) \ - { \ - .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ - .flags = OPAL_DATATYPE_FLAG_BASIC | (FLAGS), \ - .id = OPAL_DATATYPE_ ## NAME, \ - .bdt_used = (((uint32_t)1)<<(OPAL_DATATYPE_ ## NAME)), \ - .size = sizeof(TYPE), \ - .true_lb = 0, .true_ub = sizeof(TYPE), .lb = 0, .ub = sizeof(TYPE), \ - .align = (ALIGN), \ - .nbElems = 1, \ - .name = OPAL_DATATYPE_INIT_NAME(NAME), \ - .desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(NAME), \ - .opt_desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(NAME), \ - .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY(NAME) \ - } - -#define OPAL_DATATYPE_INITIALIZER_LOOP(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LOOP, LOOP, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_END_LOOP(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_END_LOOP, END_LOOP, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_LB(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LB, LB, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_UB(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_UB, UB, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_INT1(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int8_t, OPAL_ALIGNMENT_INT8, INT1, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_INT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int16_t, OPAL_ALIGNMENT_INT16, INT2, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_INT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int32_t, OPAL_ALIGNMENT_INT32, INT4, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_INT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int64_t, OPAL_ALIGNMENT_INT64, INT8, FLAGS ) -#ifdef HAVE_INT128_T 
-#define OPAL_DATATYPE_INITIALIZER_INT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int128_t, OPAL_ALIGNMENT_INT128, INT16, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_INT16(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( INT16, FLAGS ) -#endif -#define OPAL_DATATYPE_INITIALIZER_UINT1(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint8_t, OPAL_ALIGNMENT_INT8, UINT1, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_UINT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint16_t, OPAL_ALIGNMENT_INT16, UINT2, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_UINT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint32_t, OPAL_ALIGNMENT_INT32, UINT4, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_UINT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint64_t, OPAL_ALIGNMENT_INT64, UINT8, FLAGS ) -#ifdef HAVE_UINT128_T -#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint128_t, OPAL_ALIGNMENT_INT128, UINT16, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( INT16, FLAGS ) -#endif - -#if SIZEOF_FLOAT == 2 -#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT2, FLAGS ) -#elif SIZEOF_DOUBLE == 2 -#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT2, FLAGS ) -#elif SIZEOF_LONG_DOUBLE == 2 -#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT2, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT2, FLAGS ) -#endif - -#if SIZEOF_FLOAT == 4 -#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT4, FLAGS ) -#elif SIZEOF_DOUBLE == 4 -#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT4, FLAGS ) -#elif SIZEOF_LONG_DOUBLE 
== 4 -#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT4, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT4, FLAGS ) -#endif - -#if SIZEOF_FLOAT == 8 -#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT8, FLAGS ) -#elif SIZEOF_DOUBLE == 8 -#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT8, FLAGS ) -#elif SIZEOF_LONG_DOUBLE == 8 -#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT8, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT8, FLAGS ) -#endif - -#if SIZEOF_FLOAT == 12 -#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT12, FLAGS ) -#elif SIZEOF_DOUBLE == 12 -#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT12, FLAGS ) -#elif SIZEOF_LONG_DOUBLE == 12 -#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT12, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT12, FLAGS ) -#endif - -#if SIZEOF_FLOAT == 16 -#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT16, FLAGS ) -#elif SIZEOF_DOUBLE == 16 -#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT16, FLAGS ) -#elif SIZEOF_LONG_DOUBLE == 16 -#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT16, FLAGS ) -#else 
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT16, FLAGS ) -#endif - -#if HAVE_FLOAT__COMPLEX -#define OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float _Complex, OPAL_ALIGNMENT_FLOAT_COMPLEX, FLOAT_COMPLEX, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT_COMPLEX, FLAGS) -#endif - -#if HAVE_DOUBLE__COMPLEX -#define OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double _Complex, OPAL_ALIGNMENT_DOUBLE_COMPLEX, DOUBLE_COMPLEX, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( DOUBLE_COMPLEX, FLAGS) -#endif - -#if HAVE_LONG_DOUBLE__COMPLEX -#define OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double _Complex, OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX, LONG_DOUBLE_COMPLEX, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( LONG_DOUBLE_COMPLEX, FLAGS) -#endif - -#define OPAL_DATATYPE_INITIALIZER_BOOL(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( _Bool, OPAL_ALIGNMENT_BOOL, BOOL, FLAGS ) - -#if OPAL_ALIGNMENT_WCHAR != 0 -#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( wchar_t, OPAL_ALIGNMENT_WCHAR, WCHAR, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( WCHAR, FLAGS ) -#endif - -#define SAVE_STACK( PSTACK, INDEX, TYPE, COUNT, DISP) \ -do { \ - (PSTACK)->index = (INDEX); \ - (PSTACK)->type = (TYPE); \ - (PSTACK)->count = (COUNT); \ - (PSTACK)->disp = (DISP); \ -} while(0) - -#define PUSH_STACK( PSTACK, STACK_POS, INDEX, TYPE, COUNT, DISP) \ -do { \ - dt_stack_t* pTempStack = (PSTACK) + 1; \ - SAVE_STACK( pTempStack, (INDEX), (TYPE), (COUNT), (DISP) ); \ - (STACK_POS)++; \ - (PSTACK) = pTempStack; \ -} while(0) - 
-#define UPDATE_INTERNAL_COUNTERS( DESCRIPTION, POSITION, ELEMENT, COUNTER ) \ - do { \ - (ELEMENT) = &((DESCRIPTION)[(POSITION)]); \ - (COUNTER) = (ELEMENT)->elem.count; \ - } while (0) - -OPAL_DECLSPEC extern const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PREDEFINED]; - -#define OPAL_DATATYPE_LOOP_SIZE 0 -#define OPAL_DATATYPE_END_LOOP_SIZE 0 -#define OPAL_DATATYPE_LB_SIZE 0 -#define OPAL_DATATYPE_UB_SIZE 0 -#define OPAL_DATATYPE_INT1_SIZE sizeof(int8_t) -#define OPAL_DATATYPE_INT2_SIZE sizeof(int16_t) -#define OPAL_DATATYPE_INT4_SIZE sizeof(int32_t) -#define OPAL_DATATYPE_INT8_SIZE sizeof(int64_t) -#ifdef HAVE_INT128_T -# define OPAL_DATATYPE_INT16_SIZE sizeof(int128_t) /* Yes, double-machine word integers are available */ -#else -# define OPAL_DATATYPE_INT16_SIZE 0 -#endif - -#define OPAL_DATATYPE_UINT1_SIZE sizeof(uint8_t) -#define OPAL_DATATYPE_UINT2_SIZE sizeof(uint16_t) -#define OPAL_DATATYPE_UINT4_SIZE sizeof(uint32_t) -#define OPAL_DATATYPE_UINT8_SIZE sizeof(uint64_t) -#ifdef HAVE_UINT128_T -# define OPAL_DATATYPE_UINT16_SIZE sizeof(uint128_t) /* Yes, double-machine word integers are available */ -#else -# define OPAL_DATATYPE_UINT16_SIZE 0 -#endif - -#if SIZEOF_FLOAT == 2 -# define OPAL_DATATYPE_FLOAT2_SIZE sizeof(float) -#elif SIZEOF_DOUBLE == 2 -# define OPAL_DATATYPE_FLOAT2_SIZE sizeof(double) -#elif SIZEOF_LONG_DOUBLE == 2 -# define OPAL_DATATYPE_FLOAT2_SIZE sizeof(long double) -#else -# define OPAL_DATATYPE_FLOAT2_SIZE 0 -#endif - -#if SIZEOF_FLOAT == 4 -# define OPAL_DATATYPE_FLOAT4_SIZE sizeof(float) -#elif SIZEOF_DOUBLE == 4 -# define OPAL_DATATYPE_FLOAT4_SIZE sizeof(double) -#elif SIZEOF_LONG_DOUBLE == 4 -# define OPAL_DATATYPE_FLOAT4_SIZE sizeof(long double) -#else -# define OPAL_DATATYPE_FLOAT4_SIZE 0 -#endif - -#if SIZEOF_FLOAT == 8 -# define OPAL_DATATYPE_FLOAT8_SIZE sizeof(float) -#elif SIZEOF_DOUBLE == 8 -# define OPAL_DATATYPE_FLOAT8_SIZE sizeof(double) -#elif SIZEOF_LONG_DOUBLE == 8 -# define OPAL_DATATYPE_FLOAT8_SIZE 
sizeof(long double) -#else -# define OPAL_DATATYPE_FLOAT8_SIZE 0 -#endif - -#if SIZEOF_FLOAT == 12 -# define OPAL_DATATYPE_FLOAT12_SIZE sizeof(float) -#elif SIZEOF_DOUBLE == 12 -# define OPAL_DATATYPE_FLOAT12_SIZE sizeof(double) -#elif SIZEOF_LONG_DOUBLE == 12 -# define OPAL_DATATYPE_FLOAT12_SIZE sizeof(long double) -#else -# define OPAL_DATATYPE_FLOAT12_SIZE 0 -#endif - -#if SIZEOF_FLOAT == 16 -# define OPAL_DATATYPE_FLOAT16_SIZE sizeof(float) -#elif SIZEOF_DOUBLE == 16 -# define OPAL_DATATYPE_FLOAT16_SIZE sizeof(double) -#elif SIZEOF_LONG_DOUBLE == 16 -# define OPAL_DATATYPE_FLOAT16_SIZE sizeof(long double) -#else -# define OPAL_DATATYPE_FLOAT16_SIZE 0 -#endif - -#if HAVE_FLOAT__COMPLEX -# define OPAL_DATATYPE_FLOAT_COMPLEX_SIZE sizeof(float _Complex) -#else -# define OPAL_DATATYPE_FLOAT_COMPLEX_SIZE 0 -#endif - -#if HAVE_DOUBLE__COMPLEX -# define OPAL_DATATYPE_DOUBLE_COMPLEX_SIZE sizeof(float _Complex) -#else -# define OPAL_DATATYPE_DOUBLE_COMPLEX_SIZE 0 -#endif - -#if HAVE_LONG_DOUBLE__COMPLEX -# define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX_SIZE sizeof(float _Complex) -#else -# define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX_SIZE 0 -#endif - -#define OPAL_DATATYPE_BOOL_SIZE sizeof(_Bool) -#if OPAL_ALIGNMENT_WCHAR != 0 -# define OPAL_DATATYPE_WCHAR_SIZE sizeof(wchar_t) -#else -# define OPAL_DATATYPE_WCHAR_SIZE 0 -#endif - -#define OPAL_DATATYPE_UNAVAILABLE_SIZE 0 - -#endif /* OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED */ From fa023416e35f50bbf08bea14b1f4be2485889670 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 29 Oct 2015 17:15:50 -0400 Subject: [PATCH 040/190] cuda ddt support is able to turn itself off. 
Make it support multi-GPU when ompi support multi-GPU in the future --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 2 +- opal/datatype/cuda/opal_datatype_cuda.cu | 100 ++++++++---------- opal/datatype/cuda/opal_datatype_cuda.cuh | 6 +- .../cuda/opal_datatype_cuda_internal.cuh | 10 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 23 ++-- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 13 ++- opal/datatype/opal_convertor.c | 18 +++- opal/datatype/opal_datatype_gpu.c | 20 ++-- opal/datatype/opal_datatype_gpu.h | 8 +- 9 files changed, 107 insertions(+), 93 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 9cc3aa94d9b..c13a4f4f620 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -114,7 +114,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, * memory into RNDV message is expensive. */ sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; mca_bml_base_btl_t* bml_endpoint_btl = mca_bml_base_btl_array_get_index(&(sendreq->req_endpoint->btl_send), 0); - if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && CUDA_DDT_WITH_RDMA) { + if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && (opal_datatype_cuda_kernel_support == 1)) { unsigned char *base; size_t buffer_size = 0; if (convertor->local_size > bml_btl->btl->btl_cuda_ddt_pipeline_size) { diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 18706fe0f78..e0ca2cd7ed3 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -11,12 +11,10 @@ ddt_cuda_list_t *cuda_free_list; -ddt_cuda_device_t *cuda_device; -ddt_cuda_stream_t* cuda_streams; +ddt_cuda_device_t *cuda_devices; +ddt_cuda_device_t *current_cuda_device; struct iovec cuda_iov[CUDA_NB_IOV]; uint32_t cuda_iov_count; -ddt_cuda_iov_dist_t* cuda_iov_dist_h[NB_STREAMS]; -ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; //uint8_t ALIGNMENT_DOUBLE, 
ALIGNMENT_FLOAT, ALIGNMENT_CHAR; @@ -177,90 +175,86 @@ void opal_cuda_output(int output_id, const char *format, ...) } } -void opal_datatype_cuda_init(void) +int32_t opal_datatype_cuda_init(void) { - uint32_t i; + uint32_t i, j; int device; cudaError res; res = cudaGetDevice(&device); if( cudaSuccess != res ) { opal_cuda_output(0, "Cannot retrieve the device being used. Drop CUDA support!\n"); - return; + return OPAL_ERROR; } cuda_free_list = init_cuda_free_list(); /* init device */ - cuda_device = (ddt_cuda_device_t *)malloc(sizeof(ddt_cuda_device_t)*1); - for (i = 0; i < 1; i++) { + cuda_devices = (ddt_cuda_device_t *)malloc(sizeof(ddt_cuda_device_t)*NB_GPUS); + for (i = 0; i < NB_GPUS; i++) { unsigned char *gpu_ptr = NULL; if (cudaMalloc((void **)(&gpu_ptr), sizeof(char)*DT_CUDA_BUFFER_SIZE) != cudaSuccess) { DT_CUDA_DEBUG( opal_cuda_output( 0, "cudaMalloc is failed in GPU %d\n", i); ); + return OPAL_ERROR; } DT_CUDA_DEBUG ( opal_cuda_output(2, "DDT engine cudaMalloc buffer %p in GPU %d\n", gpu_ptr, i);); cudaMemset(gpu_ptr, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); - cuda_device[i].gpu_buffer = gpu_ptr; + cuda_devices[i].gpu_buffer = gpu_ptr; - cuda_device[i].buffer_free_size = DT_CUDA_BUFFER_SIZE; + cuda_devices[i].buffer_free_size = DT_CUDA_BUFFER_SIZE; ddt_cuda_buffer_t *p = obj_ddt_cuda_buffer_new(); p->size = DT_CUDA_BUFFER_SIZE; p->gpu_addr = gpu_ptr; - cuda_device[i].buffer_free.head = p; - cuda_device[i].buffer_free.tail = cuda_device[i].buffer_free.head; - cuda_device[i].buffer_free.nb_elements = 1; + cuda_devices[i].buffer_free.head = p; + cuda_devices[i].buffer_free.tail = cuda_devices[i].buffer_free.head; + cuda_devices[i].buffer_free.nb_elements = 1; - cuda_device[i].buffer_used.head = NULL; - cuda_device[i].buffer_used.tail = NULL; - cuda_device[i].buffer_used_size = 0; - cuda_device[i].buffer_used.nb_elements = 0; - } + cuda_devices[i].buffer_used.head = NULL; + cuda_devices[i].buffer_used.tail = NULL; + cuda_devices[i].buffer_used_size = 0; + 
cuda_devices[i].buffer_used.nb_elements = 0; - - /* init cuda stream */ - cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamCreate(&(cuda_streams->opal_cuda_stream[i])); + /* init cuda stream */ + ddt_cuda_stream_t *cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); + for (j = 0; j < NB_STREAMS; j++) { + cudaStreamCreate(&(cuda_streams->opal_cuda_stream[j])); + cudaMallocHost((void **)(&(cuda_devices[i].cuda_iov_dist_h[j])), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_devices[i].cuda_iov_dist_d[j])), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + } + cuda_streams->current_stream_id = 0; + cuda_devices[i].cuda_streams = cuda_streams; } - cuda_streams->current_stream_id = 0; + current_cuda_device = &(cuda_devices[0]); /* init cuda_iov */ cuda_iov_count = CUDA_NB_IOV; - /* only for iov version */ - for (i = 0; i < NB_STREAMS; i++) { - cudaMallocHost((void **)(&cuda_iov_dist_h[i]), sizeof(ddt_cuda_iov_dist_t)*CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&cuda_iov_dist_d[i]), sizeof(ddt_cuda_iov_dist_t)*CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - // /* init size for double, float, char */ // ALIGNMENT_DOUBLE = sizeof(double); // ALIGNMENT_FLOAT = sizeof(float); // ALIGNMENT_CHAR = sizeof(char); cudaDeviceSynchronize(); + return OPAL_SUCCESS; } -void opal_datatype_cuda_fini(void) +int32_t opal_datatype_cuda_fini(void) { - uint32_t i; + uint32_t i, j; - /* destory cuda stream */ - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamDestroy(cuda_streams->opal_cuda_stream[i]); - } - free(cuda_streams); - - /* only for iov version */ - for (i = 0; i < NB_STREAMS; i++) { - cudaFreeHost(cuda_iov_dist_h[i]); - cudaFree(cuda_iov_dist_d[i]); + for (i = 0; i < NB_GPUS; i++) { + /* free gpu buffer */ + cudaFree(cuda_devices[i].gpu_buffer); + /* destory cuda 
stream and iov*/ + for (j = 0; j < NB_STREAMS; j++) { + cudaStreamDestroy(cuda_devices[i].cuda_streams->opal_cuda_stream[j]); + cudaFreeHost(cuda_devices[i].cuda_iov_dist_h[j]); + cudaFree(cuda_devices[i].cuda_iov_dist_d[j]); + } + free(cuda_devices[i].cuda_streams); } -} - -void opal_cuda_sync_device(void) -{ - cudaDeviceSynchronize(); + current_cuda_device = NULL; + return OPAL_SUCCESS; } int32_t opal_cuda_is_gpu_buffer(const void *ptr) @@ -283,7 +277,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) { int dev_id; cudaGetDevice(&dev_id); - ddt_cuda_device_t *device = &cuda_device[gpu_id]; + ddt_cuda_device_t *device = &cuda_devices[gpu_id]; if (device->buffer_free_size < size) { DT_CUDA_DEBUG( opal_cuda_output( 0, "No GPU buffer at dev_id %d.\n", dev_id); ); return NULL; @@ -320,7 +314,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) { - ddt_cuda_device_t *device = &cuda_device[gpu_id]; + ddt_cuda_device_t *device = &cuda_devices[gpu_id]; ddt_cuda_buffer_t *ptr = device->buffer_used.head; /* Find the holder of this GPU allocation */ @@ -352,13 +346,13 @@ void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) { - cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); + cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->opal_cuda_stream[0]); } void opal_cuda_d2dcpy(void* dst, const void* src, size_t count) { - cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(current_cuda_device->cuda_streams->opal_cuda_stream[0]); } void opal_dump_cuda_list(ddt_cuda_list_t *list) diff --git 
a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index d71d349d46b..5cc2a77c6ef 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -4,9 +4,9 @@ extern "C" { -void opal_datatype_cuda_init(void); +int32_t opal_datatype_cuda_init(void); -void opal_datatype_cuda_fini(void); +int32_t opal_datatype_cuda_fini(void); int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, @@ -83,8 +83,6 @@ void unpack_predefined_data_cuda( dt_elem_desc_t* ELEM, unsigned char** DESTINATION, size_t* SPACE ); -void opal_cuda_sync_device(void); - int32_t opal_cuda_is_gpu_buffer(const void *ptr); void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index fe49449f976..3977da4125b 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -19,6 +19,7 @@ #define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 +#define NB_GPUS 1 #define IOV_ARRAY_SIZE 1 #define DT_CUDA_BUFFER_SIZE 1024*1024*200 #define DT_CUDA_FREE_LIST_SIZE 50 @@ -72,15 +73,16 @@ typedef struct { ddt_cuda_list_t buffer_used; size_t buffer_free_size; size_t buffer_used_size; + ddt_cuda_stream_t *cuda_streams; + ddt_cuda_iov_dist_t* cuda_iov_dist_h[NB_STREAMS]; + ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; } ddt_cuda_device_t; extern ddt_cuda_list_t *cuda_free_list; -extern ddt_cuda_device_t *cuda_device; -extern ddt_cuda_stream_t* cuda_streams; +extern ddt_cuda_device_t *cuda_devices; +extern ddt_cuda_device_t *current_cuda_device; extern struct iovec cuda_iov[CUDA_NB_IOV]; extern uint32_t cuda_iov_count; -extern ddt_cuda_iov_dist_t* cuda_iov_dist_h[NB_STREAMS]; -extern ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; //extern uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; diff --git 
a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 0a51f66d877..2c674bbea6d 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -27,6 +27,8 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver uint8_t free_required; uint32_t count_desc_tmp; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; long total_time; @@ -227,6 +229,8 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert uint8_t transfer_required; uint8_t free_required; uint32_t count_desc_tmp; + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -478,6 +482,8 @@ void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, int i, pipeline_blocks; uint32_t _copy_loops_per_pipeline; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; long total_time; @@ -654,6 +660,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor uint8_t alignment, orig_alignment; // int32_t orig_stack_index; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; @@ -740,8 +747,8 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor while (cuda_iov_count > 0) { nb_blocks_used = 0; - cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; - cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; + cuda_iov_dist_h_current = current_cuda_device->cuda_iov_dist_h[cuda_streams->current_stream_id]; + 
cuda_iov_dist_d_current = current_cuda_device->cuda_iov_dist_d[cuda_streams->current_stream_id]; source_base = (unsigned char*)cuda_iov[0].iov_base; #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -769,8 +776,6 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } else { alignment = ALIGNMENT_CHAR; } - - // alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; @@ -866,13 +871,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor move_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); #endif - // float *vtmp = (float *)iov[0].iov_base; - // DT_CUDA_DEBUG ( opal_cuda_output(0, "packed iov buffer, total packed %d\n", total_packed); ); - // for (uint32_t i = 0; i < total_packed/sizeof(float); i++) { - // printf(" %1.f ", *vtmp); - // vtmp ++; - // } - // printf("\n"); + iov[0].iov_len = total_packed; *max_data = total_packed; *out_size = 1; @@ -908,6 +907,8 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, unsigned char* _source = (*SOURCE) + _elem->disp; uint32_t nb_blocks, tasks_per_block, thread_per_block; unsigned char* _destination = *(DESTINATION); + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; if( (_copy_count * _copy_blength) > *(SPACE) ) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 696a2c12694..f6251fd77f7 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -24,6 +24,8 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon uint32_t iov_count; uint8_t free_required; uint32_t count_desc_tmp; + + ddt_cuda_stream_t 
*cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end; @@ -197,6 +199,8 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv uint32_t iov_count; uint8_t free_required; uint32_t count_desc_tmp; + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end; @@ -370,6 +374,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert uint8_t alignment, orig_alignment; // int32_t orig_stack_index; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; @@ -447,8 +452,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert while (cuda_iov_count > 0) { nb_blocks_used = 0; - cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; - cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; + cuda_iov_dist_h_current = current_cuda_device->cuda_iov_dist_h[cuda_streams->current_stream_id]; + cuda_iov_dist_d_current = current_cuda_device->cuda_iov_dist_d[cuda_streams->current_stream_id]; destination_base = (unsigned char*)cuda_iov[0].iov_base; #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -736,7 +741,9 @@ void unpack_predefined_data_cuda( dt_elem_desc_t* ELEM, ddt_elem_desc_t* _elem = &((ELEM)->elem); unsigned char* _source = (*SOURCE); uint32_t nb_blocks, tasks_per_block, thread_per_block; - unsigned char* _destination = *(DESTINATION) + _elem->disp;; + unsigned char* _destination = *(DESTINATION) + _elem->disp; + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; if( (_copy_count * _copy_blength) > *(SPACE) ) { diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 3e0ac066c84..c32d96043ac 
100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -577,7 +577,12 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { convertor->fAdvance = opal_unpack_homogeneous_contig_checksum; } else { - convertor->fAdvance = opal_generic_simple_unpack_checksum; + if ((convertor->flags & CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { + convertor->fAdvance = opal_generic_simple_unpack_cuda_checksum; + convertor->gpu_buffer_ptr = NULL; + } else { + convertor->fAdvance = opal_generic_simple_unpack_checksum; + } } } else { #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT @@ -588,7 +593,7 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { convertor->fAdvance = opal_unpack_homogeneous_contig; } else { - if ((convertor->flags & CONVERTOR_CUDA) && OPAL_DATATYPE_CUDA_KERNEL) { + if ((convertor->flags & CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { convertor->fAdvance = opal_generic_simple_unpack_cuda; convertor->gpu_buffer_ptr = NULL; } else { @@ -625,7 +630,12 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, else convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps_checksum; } else { - convertor->fAdvance = opal_generic_simple_pack_checksum; + if ((convertor->flags & CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { + convertor->fAdvance = opal_generic_simple_pack_cuda_checksum; + convertor->gpu_buffer_ptr = NULL; + } else { + convertor->fAdvance = opal_generic_simple_pack_checksum; + } } } else { if( datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { @@ -635,7 +645,7 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, else convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps; } else { - if ((convertor->flags & CONVERTOR_CUDA) && OPAL_DATATYPE_CUDA_KERNEL ) { + if ((convertor->flags & 
CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { convertor->fAdvance = opal_generic_simple_pack_cuda; convertor->gpu_buffer_ptr = NULL; } else { diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c index 4e516766737..f21b22c72d2 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -40,12 +40,14 @@ #include "opal/datatype/opal_datatype_gpu.h" +int32_t opal_datatype_cuda_kernel_support = 0; + static void *opal_datatype_cuda_handle = NULL; static char *opal_datatype_cuda_lib = NULL; -void (*opal_datatype_cuda_init_p)(void) = NULL; +int32_t (*opal_datatype_cuda_init_p)(void) = NULL; -void (*opal_datatype_cuda_fini_p)(void) = NULL; +int32_t (*opal_datatype_cuda_fini_p)(void) = NULL; int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, @@ -86,8 +88,6 @@ void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, unsigned char** DESTINATION, size_t* SPACE ) = NULL; -void (*opal_cuda_sync_device_p)(void) = NULL; - void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id) = NULL; void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id) = NULL; @@ -131,14 +131,16 @@ int32_t opal_datatype_gpu_init(void) OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, pack_contiguous_loop_cuda ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, unpack_contiguous_loop_cuda ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, pack_predefined_data_cuda ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_sync_device ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_free_gpu_buffer ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_malloc_gpu_buffer ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_d2dcpy_async ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( 
opal_datatype_cuda_handle, opal_cuda_d2dcpy ); - (*opal_datatype_cuda_init_p)(); - opal_output( 0, "cuda init done\n"); + if (OPAL_SUCCESS != (*opal_datatype_cuda_init_p)()) { + return OPAL_ERROR; + } + opal_datatype_cuda_kernel_support = 1; + opal_output( 0, "opal_datatype_cuda_kernel_support init done\n"); } return OPAL_SUCCESS; } @@ -157,7 +159,6 @@ int32_t opal_datatype_gpu_fini(void) pack_contiguous_loop_cuda_p = NULL; unpack_contiguous_loop_cuda_p = NULL; pack_predefined_data_cuda_p = NULL; - opal_cuda_sync_device_p = NULL; opal_cuda_free_gpu_buffer_p = NULL; opal_cuda_malloc_gpu_buffer_p = NULL; opal_cuda_d2dcpy_async_p = NULL; @@ -169,7 +170,8 @@ int32_t opal_datatype_gpu_fini(void) if( NULL != opal_datatype_cuda_lib ) free(opal_datatype_cuda_lib); opal_datatype_cuda_lib = NULL; - opal_output( 0, "cuda fini done\n"); + opal_datatype_cuda_kernel_support = 0; + opal_output( 0, "opal_datatype_cuda_kernel_support fini done\n"); } return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h index df42d68b6fc..340fbf24da7 100644 --- a/opal/datatype/opal_datatype_gpu.h +++ b/opal/datatype/opal_datatype_gpu.h @@ -3,12 +3,14 @@ #define OPAL_DATATYPE_CUDA_KERNEL 1 +extern int32_t opal_datatype_cuda_kernel_support; + int32_t opal_datatype_gpu_init(void); int32_t opal_datatype_gpu_fini(void); -extern void (*opal_datatype_cuda_init_p)(void); +extern int32_t (*opal_datatype_cuda_init_p)(void); -extern void (*opal_datatype_cuda_fini_p)(void); +extern int32_t (*opal_datatype_cuda_fini_p)(void); extern int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, @@ -47,8 +49,6 @@ extern void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, unsigned char** SOURCE, unsigned char** DESTINATION, size_t* SPACE ); - -extern void (*opal_cuda_sync_device_p)(void); extern void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); From 7258acd77195b4ec476c5180e9b3990088c24993 Mon Sep 17 
00:00:00 2001 From: eddy16112 Date: Fri, 30 Oct 2015 18:42:09 -0400 Subject: [PATCH 041/190] fix a cuda stream bug for iov, remove some stream syncs --- opal/datatype/cuda/opal_datatype_cuda.cu | 35 ++++++-- opal/datatype/cuda/opal_datatype_cuda.cuh | 4 +- .../cuda/opal_datatype_cuda_internal.cuh | 20 ++++- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 85 ++++++++++--------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 76 +++++++++++------ test/datatype/ddt_benchmark.c | 6 +- 6 files changed, 146 insertions(+), 80 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index e0ca2cd7ed3..3c5208d7122 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -215,14 +215,21 @@ int32_t opal_datatype_cuda_init(void) cuda_devices[i].buffer_used.nb_elements = 0; /* init cuda stream */ - ddt_cuda_stream_t *cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); + ddt_cuda_stream_t *cuda_streams = (ddt_cuda_stream_t *)malloc(sizeof(ddt_cuda_stream_t)); + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; for (j = 0; j < NB_STREAMS; j++) { cudaStreamCreate(&(cuda_streams->opal_cuda_stream[j])); - cudaMallocHost((void **)(&(cuda_devices[i].cuda_iov_dist_h[j])), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_devices[i].cuda_iov_dist_d[j])), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_h)), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_d)), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cuda_iov_pipeline_block->cuda_stream = 
&(cuda_streams->opal_cuda_stream[0]); + cuda_iov_pipeline_block->cuda_stream_id = 0; + cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); + cuda_devices[i].cuda_iov_pipeline_block[j] = cuda_iov_pipeline_block; } cuda_streams->current_stream_id = 0; cuda_devices[i].cuda_streams = cuda_streams; + cudaEventCreate(&(cuda_devices[i].memcpy_event), cudaEventDisableTiming); } current_cuda_device = &(cuda_devices[0]); @@ -246,12 +253,23 @@ int32_t opal_datatype_cuda_fini(void) /* free gpu buffer */ cudaFree(cuda_devices[i].gpu_buffer); /* destory cuda stream and iov*/ + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; for (j = 0; j < NB_STREAMS; j++) { cudaStreamDestroy(cuda_devices[i].cuda_streams->opal_cuda_stream[j]); - cudaFreeHost(cuda_devices[i].cuda_iov_dist_h[j]); - cudaFree(cuda_devices[i].cuda_iov_dist_d[j]); + cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block[j]; + if (cuda_iov_pipeline_block != NULL) { + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_h); + cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_d); + cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); + cuda_iov_pipeline_block->cuda_stream = NULL; + cuda_iov_pipeline_block->cuda_stream_id = -1; + free(cuda_iov_pipeline_block); + cuda_iov_pipeline_block = NULL; + } } free(cuda_devices[i].cuda_streams); + cuda_devices[i].cuda_streams = NULL; + cudaEventDestroy(cuda_devices[i].memcpy_event); } current_cuda_device = NULL; return OPAL_SUCCESS; @@ -344,6 +362,13 @@ void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) DT_CUDA_DEBUG( opal_cuda_output( 2, "Free GPU buffer %p.\n", addr); ); } +void opal_cuda_check_error(cudaError_t err) +{ + if (err != cudaSuccess) { + DT_CUDA_DEBUG( opal_cuda_output(0, "CUDA calls error %s\n", cudaGetErrorString(err)); ); + } +} + void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) { cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, 
current_cuda_device->cuda_streams->opal_cuda_stream[0]); diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 5cc2a77c6ef..8c228fc3404 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -35,7 +35,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, unsigned char** DESTINATION, size_t* SPACE ); -void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, +void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, @@ -59,7 +59,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, unsigned char** DESTINATION, size_t* SPACE ); -void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, +void unpack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 3977da4125b..506a5fe22cd 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -13,10 +13,12 @@ #define OPAL_DATATYPE_CUDA_DEBUG 1 //#define OPAL_DATATYPE_CUDA_KERNEL_TIME #define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 2 -//#define OPAL_DATATYPE_CUDA_TIMING -#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D 0 +#define OPAL_DATATYPE_CUDA_TIMING +#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H 0 #define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 #define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 +#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL 1 + #define NB_GPUS 1 @@ -53,6 +55,14 @@ typedef struct { uint8_t element_alignment; } ddt_cuda_iov_dist_t; +typedef struct { + ddt_cuda_iov_dist_t* cuda_iov_dist_h; + ddt_cuda_iov_dist_t* cuda_iov_dist_d; + cudaStream_t *cuda_stream; + int32_t cuda_stream_id; + cudaEvent_t cuda_event; +} ddt_cuda_iov_pipeline_block_t; + typedef struct 
ddt_cuda_buffer{ unsigned char* gpu_addr; size_t size; @@ -74,8 +84,8 @@ typedef struct { size_t buffer_free_size; size_t buffer_used_size; ddt_cuda_stream_t *cuda_streams; - ddt_cuda_iov_dist_t* cuda_iov_dist_h[NB_STREAMS]; - ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block[NB_STREAMS]; + cudaEvent_t memcpy_event; } ddt_cuda_device_t; extern ddt_cuda_list_t *cuda_free_list; @@ -120,6 +130,8 @@ __global__ void opal_empty_kernel_noargs(); void opal_cuda_output(int output_id, const char *format, ...); +void opal_cuda_check_error(cudaError_t err); + #if defined (OPAL_DATATYPE_CUDA_DEBUG) #define DT_CUDA_DEBUG( INST ) if (OPAL_DATATYPE_CUDA_DEBUG) { INST } #else diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 2c674bbea6d..dccf2803c6a 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -77,7 +77,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver } transfer_required = 0; } else { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { pConvertor->gpu_buffer_ptr = NULL; transfer_required = 0; free_required = 0; @@ -148,8 +148,8 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { - pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H) { + pack_contiguous_loop_cuda_memcpy2d_d2h(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { 
pack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE) { @@ -280,7 +280,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } transfer_required = 0; } else { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { pConvertor->gpu_buffer_ptr = NULL; transfer_required = 0; free_required = 0; @@ -304,7 +304,6 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert iov_ptr = pConvertor->gpu_buffer_ptr; } } - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ @@ -350,8 +349,8 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { - pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H) { + pack_contiguous_loop_cuda_memcpy2d_d2h(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { pack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE) { @@ -425,6 +424,8 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t num_blocks, tasks_per_block; unsigned char* _destination = *(DESTINATION); + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; long total_time; @@ -442,13 +443,11 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, 
// tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; // printf("extent %ld, size %ld, count %ld\n", _loop->extent, _end_loop->size, _copy_loops); - cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); -// pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); -// int i; -// for (i = 0; i < 4; i++) { -// opal_empty_kernel<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); - // pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); -// } +#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL + cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); +#else + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -457,7 +456,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaDeviceSynchronize(); + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -466,6 +465,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif } +/* this function will not be used */ void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, @@ -537,9 +537,9 @@ void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG( 
opal_cuda_output( 2, "[Timing]: vector packing in %ld microsec\n", total_time ); ); #endif -} +} -void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, +void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, @@ -551,6 +551,7 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, uint32_t _copy_loops = *(COUNT); uint32_t num_blocks, tasks_per_block; unsigned char* _destination = *(DESTINATION); + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -566,7 +567,7 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, GET_TIME(start); #endif - cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost); + cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->opal_cuda_stream[0]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -575,7 +576,7 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif -// cudaDeviceSynchronize(); + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -597,6 +598,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, uint32_t num_blocks, tasks_per_block; unsigned char* _destination = *(DESTINATION); unsigned char* _destination_dev; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -612,16 +614,17 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - // tasks_per_block = THREAD_PER_BLOCK * 
TASK_PER_THREAD; - // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - // cudaHostRegister(_destination, _copy_loops*_end_loop->size, cudaHostRegisterMapped); + cudaError_t reg_rv = cudaHostGetDevicePointer((void **)&_destination_dev, (void *) _destination, 0); if (reg_rv != cudaSuccess) { const char *cuda_err = cudaGetErrorString(reg_rv); printf("can not get dev mem, %s\n", cuda_err); } - //cudaMemcpy2D(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); +#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL + cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); +#else + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); +#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -630,8 +633,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaDeviceSynchronize(); - // cudaHostUnregister(_destination); + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -659,10 +661,13 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor // dt_stack_t* pStack; uint8_t alignment, orig_alignment; // int32_t orig_stack_index; - + cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t 
*cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -721,8 +726,6 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; destination_base = destination; - - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start_total); @@ -747,8 +750,12 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor while (cuda_iov_count > 0) { nb_blocks_used = 0; - cuda_iov_dist_h_current = current_cuda_device->cuda_iov_dist_h[cuda_streams->current_stream_id]; - cuda_iov_dist_d_current = current_cuda_device->cuda_iov_dist_d[cuda_streams->current_stream_id]; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); source_base = (unsigned char*)cuda_iov[0].iov_base; #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -786,9 +793,9 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block;// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_elements = count_desc - j*thread_per_block; //(thread_per_block - ((j+1)*thread_per_block 
- count_desc));// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = count_desc - j*thread_per_block; } #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); @@ -824,13 +831,15 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); - opal_generic_simple_pack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); - cuda_streams->current_stream_id ++; - cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_pack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; /* buffer is full */ if (buffer_isfull) { diff --git 
a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index f6251fd77f7..a8ba035ef78 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -62,7 +62,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon iov_ptr = (unsigned char*)iov[iov_count].iov_base; free_required = 0; } else { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { iov_ptr = (unsigned char*)iov[iov_count].iov_base; pConvertor->gpu_buffer_ptr = NULL; free_required = 0; @@ -81,6 +81,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); #endif iov_len_local = iov[iov_count].iov_len; + cudaDeviceSynchronize(); if( 0 != pConvertor->partial_length ) { /* not support yet */ } @@ -134,8 +135,8 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { - unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H) { + unpack_contiguous_loop_cuda_memcpy2d_d2h(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { unpack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); } else { @@ -237,7 +238,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv iov_ptr = (unsigned char*)iov[iov_count].iov_base; free_required = 0; } else { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || 
OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { iov_ptr = (unsigned char*)iov[iov_count].iov_base; pConvertor->gpu_buffer_ptr = NULL; free_required = 0; @@ -255,7 +256,6 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); ); #endif - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); iov_len_local = iov[iov_count].iov_len; if( 0 != pConvertor->partial_length ) { /* not support yet */ @@ -304,8 +304,8 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { - unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H) { + unpack_contiguous_loop_cuda_memcpy2d_d2h(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { unpack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); } else { @@ -373,17 +373,18 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert // dt_stack_t* pStack; uint8_t alignment, orig_alignment; // int32_t orig_stack_index; - + cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; long total_time, 
move_time; #endif - - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start_total); @@ -423,6 +424,9 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert move_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); #endif + +// cuda_err = cudaEventRecord(current_cuda_device->memcpy_event, current_cuda_device->cuda_streams->opal_cuda_stream[0]); +// opal_cuda_check_error(cuda_err); #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -452,8 +456,12 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert while (cuda_iov_count > 0) { nb_blocks_used = 0; - cuda_iov_dist_h_current = current_cuda_device->cuda_iov_dist_h[cuda_streams->current_stream_id]; - cuda_iov_dist_d_current = current_cuda_device->cuda_iov_dist_d[cuda_streams->current_stream_id]; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); destination_base = (unsigned char*)cuda_iov[0].iov_base; #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -529,14 +537,16 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d\n", source_base, total_time, cuda_streams->current_stream_id); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted 
to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); - opal_generic_simple_unpack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); - cuda_streams->current_stream_id ++; - cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; - + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + /* buffer is full */ if (buffer_isfull) { size_t total_converted_tmp = total_converted; @@ -560,7 +570,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif } - // cudaDeviceSynchronize(); + for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } @@ -599,6 +609,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t _copy_loops = *(COUNT); uint32_t num_blocks, tasks_per_block; unsigned char* _source = *(SOURCE); + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -615,8 +626,11 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; 
-// unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); - cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); +#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL + cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); +#else + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -625,7 +639,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaDeviceSynchronize(); + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -645,6 +659,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, uint32_t _copy_loops = *(COUNT); uint32_t num_blocks, tasks_per_block; unsigned char* _source = *(SOURCE); + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -659,7 +674,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice); + cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[0]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) 
*(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -668,7 +683,8 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -689,6 +705,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, uint32_t num_blocks, tasks_per_block; unsigned char* _source = *(SOURCE); unsigned char* _source_dev; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -705,14 +722,17 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, #endif // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; -// cudaHostRegister(_source, _copy_loops*_end_loop->size, cudaHostRegisterMapped); + cudaError_t reg_rv = cudaHostGetDevicePointer((void **)&_source_dev, (void *) _source, 0); if (reg_rv != cudaSuccess) { const char *cuda_err = cudaGetErrorString(reg_rv); printf("can not get dev mem, %s\n", cuda_err); } - //cudaMemcpy2D(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); +#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL + cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); +#else + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); +#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if 
!defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -721,7 +741,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaDeviceSynchronize(); + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); // cudaHostUnregister(_source); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 92bdf644d4d..45440dc2c04 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1211,11 +1211,11 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 500; mat_size <= 500; mat_size +=500) { + for (mat_size = 6000; mat_size <= 6000; mat_size +=500) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 1; i <= 1; i++) { + for (i = 1; i <= 2; i++) { local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); } } @@ -1312,7 +1312,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 4; i++) { - vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } From 4271a0d56ae3e4b52df2b40c401202a88273be8e Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Wed, 4 Nov 2015 12:05:59 -0800 Subject: [PATCH 042/190] in openib, disable rdma for non-contiguous gpu data --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 53 ++++++++++++++++++---- opal/mca/btl/btl.h | 3 +- opal/mca/btl/openib/btl_openib_mca.c | 1 + opal/mca/btl/smcuda/btl_smcuda_component.c | 1 + 4 files changed, 48 insertions(+), 10 deletions(-) diff --git 
a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index c13a4f4f620..65db631b32f 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -54,6 +54,8 @@ int mca_pml_ob1_rdma_cuda_btl_register_data( uint32_t num_btls_used, struct opal_convertor_t *pack_convertor, uint8_t pack_required, int32_t gpu_device); +size_t mca_pml_ob1_rdma_cuda_avail(mca_bml_base_endpoint_t* bml_endpoint); + int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -69,17 +71,17 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, int rc; int32_t local_device = 0; #if OPAL_CUDA_SUPPORT_41 -#if OPAL_CUDA_GDR_SUPPORT - /* With some BTLs, switch to RNDV from RGET at large messages */ - if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) && - (sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) { - return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); - } -#endif /* OPAL_CUDA_GDR_SUPPORT */ sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) { +#if OPAL_CUDA_GDR_SUPPORT + /* With some BTLs, switch to RNDV from RGET at large messages */ + if ((sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) { + sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; + return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); + } +#endif /* OPAL_CUDA_GDR_SUPPORT */ unsigned char *base; opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base ); /* Set flag back */ @@ -113,8 +115,9 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, /* Do not send anything with first rendezvous message as copying GPU * memory into RNDV 
message is expensive. */ sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; - mca_bml_base_btl_t* bml_endpoint_btl = mca_bml_base_btl_array_get_index(&(sendreq->req_endpoint->btl_send), 0); - if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && (opal_datatype_cuda_kernel_support == 1)) { + if ((mca_pml_ob1_rdma_cuda_avail(sendreq->req_endpoint) != 0) && + (opal_datatype_cuda_kernel_support == 1) && + (bml_btl->btl->btl_cuda_ddt_allow_rdma == 1)) { unsigned char *base; size_t buffer_size = 0; if (convertor->local_size > bml_btl->btl->btl_cuda_ddt_pipeline_size) { @@ -240,6 +243,38 @@ int mca_pml_ob1_rdma_cuda_btl_register_data( return 0; } +size_t mca_pml_ob1_rdma_cuda_avail(mca_bml_base_endpoint_t* bml_endpoint) +{ + int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); + double weight_total = 0; + int num_btls_used = 0, n; + + /* shortcut when there are no rdma capable btls */ + if(num_btls == 0) { + return 0; + } + + /* check to see if memory is registered */ + for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; + n++) { + mca_bml_base_btl_t* bml_btl = + mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n); + + if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) { + weight_total += bml_btl->btl_weight; + num_btls_used++; + } + } + + /* if we don't use leave_pinned and all BTLs that already have this memory + * * registered amount to less then half of available bandwidth - fall back to + * * pipeline protocol */ + if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5)) + return 0; + + return num_btls_used; +} + int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl) { diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 1a38ec4c331..7e693c62b84 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -1182,8 +1182,9 @@ struct mca_btl_base_module_t { #endif /* OPAL_CUDA_GDR_SUPPORT */ #if OPAL_CUDA_SUPPORT size_t btl_cuda_max_send_size; /**< set 
if CUDA max send_size is different from host max send size */ + int32_t btl_cuda_ddt_allow_rdma; size_t btl_cuda_ddt_pipeline_size; - int btl_cuda_ddt_pipeline_depth; + int32_t btl_cuda_ddt_pipeline_depth; #endif /* OPAL_CUDA_SUPPORT */ }; typedef struct mca_btl_base_module_t mca_btl_base_module_t; diff --git a/opal/mca/btl/openib/btl_openib_mca.c b/opal/mca/btl/openib/btl_openib_mca.c index d3664435496..293c4452fce 100644 --- a/opal/mca/btl/openib/btl_openib_mca.c +++ b/opal/mca/btl/openib/btl_openib_mca.c @@ -648,6 +648,7 @@ int btl_openib_register_mca_params(void) mca_btl_openib_module.super.btl_cuda_rdma_limit = 0; /* Unused */ } #endif /* OPAL_CUDA_GDR_SUPPORT */ + mca_btl_openib_module.super.btl_cuda_ddt_allow_rdma = 0; #endif /* OPAL_CUDA_SUPPORT */ CHECK(mca_btl_base_param_register( &mca_btl_openib_component.super.btl_version, diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index c7bdb40c028..9c1f5235d1e 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -176,6 +176,7 @@ static int smcuda_register(void) mca_btl_smcuda.super.btl_cuda_ddt_pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; printf("pipeline size %lu\n", mca_btl_smcuda.super.btl_cuda_ddt_pipeline_size); mca_btl_smcuda.super.btl_cuda_ddt_pipeline_depth = 4; + mca_btl_smcuda.super.btl_cuda_ddt_allow_rdma = 1; mca_btl_smcuda.super.btl_eager_limit = 4*1024; mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024; mca_btl_smcuda.super.btl_max_send_size = 32*1024; From 85f64280e7be630d8dfcd2ddf97ab73bf9c12ec6 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 4 Nov 2015 17:26:59 -0500 Subject: [PATCH 043/190] move ddt kernel support function pointer into opal_datatype_cuda.c --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 4 +- opal/datatype/Makefile.am | 4 +- opal/datatype/opal_convertor.c | 11 -- opal/datatype/opal_datatype_cuda.c | 167 +++++++++++++++++++ opal/datatype/opal_datatype_cuda.h | 
26 +++ opal/datatype/opal_datatype_gpu.c | 177 --------------------- opal/datatype/opal_datatype_gpu.h | 60 ------- opal/datatype/opal_datatype_module.c | 10 +- opal/datatype/opal_datatype_pack.c | 21 +-- opal/datatype/opal_datatype_unpack.c | 14 +- opal/mca/btl/smcuda/btl_smcuda.c | 9 +- opal/mca/btl/smcuda/btl_smcuda_component.c | 12 +- opal/mca/common/cuda/common_cuda.c | 13 +- opal/mca/common/cuda/common_cuda.h | 1 - 14 files changed, 224 insertions(+), 305 deletions(-) delete mode 100644 opal/datatype/opal_datatype_gpu.c delete mode 100644 opal/datatype/opal_datatype_gpu.h diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 65db631b32f..a3fbd0fa8c7 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -37,7 +37,7 @@ #include "ompi/mca/bml/base/base.h" #include "ompi/memchecker.h" -#include "opal/datatype/opal_datatype_gpu.h" +#include "opal/datatype/opal_datatype_cuda.h" #include "opal/mca/common/cuda/common_cuda.h" #include "opal/mca/btl/smcuda/btl_smcuda.h" @@ -125,7 +125,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, } else { buffer_size = convertor->local_size; } - base = opal_cuda_malloc_gpu_buffer_p(buffer_size, 0); + base = opal_cuda_malloc_gpu_buffer(buffer_size, 0); convertor->gpu_buffer_ptr = base; convertor->gpu_buffer_size = buffer_size; sendreq->req_send.req_bytes_packed = convertor->local_size; diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am index 7683c2e8786..ca64cf29237 100644 --- a/opal/datatype/Makefile.am +++ b/opal/datatype/Makefile.am @@ -32,8 +32,7 @@ headers = \ opal_datatype_memcpy.h \ opal_datatype_pack.h \ opal_datatype_prototypes.h \ - opal_datatype_unpack.h \ - opal_datatype_gpu.h + opal_datatype_unpack.h noinst_LTLIBRARIES = \ @@ -61,7 +60,6 @@ libdatatype_la_SOURCES = \ opal_datatype_get_count.c \ opal_datatype_module.c \ opal_datatype_optimize.c \ - opal_datatype_gpu.c \ opal_datatype_pack.c \ 
opal_datatype_position.c \ opal_datatype_resize.c \ diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index c32d96043ac..34e25bc0b17 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -39,7 +39,6 @@ #include "opal/datatype/opal_convertor_internal.h" #if OPAL_CUDA_SUPPORT #include "opal/datatype/opal_datatype_cuda.h" -#include "opal/datatype/opal_datatype_gpu.h" #define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \ CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) #endif @@ -559,11 +558,6 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, convertor->flags |= CONVERTOR_RECV; #if OPAL_CUDA_SUPPORT mca_cuda_convertor_init(convertor, pUserBuf); -#if OPAL_DATATYPE_CUDA_KERNEL - if (opal_datatype_gpu_init() != OPAL_SUCCESS) { - opal_datatype_gpu_fini(); - } -#endif /* OPAL_DATATYPE_CUDA_KERNEL */ #endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -613,11 +607,6 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, convertor->flags |= CONVERTOR_SEND; #if OPAL_CUDA_SUPPORT mca_cuda_convertor_init(convertor, pUserBuf); -#if OPAL_DATATYPE_CUDA_KERNEL - if (opal_datatype_gpu_init() != OPAL_SUCCESS) { - opal_datatype_gpu_fini(); - } -#endif /* OPAL_DATATYPE_CUDA_KERNEL */ #endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index e09618e747b..23cdb47acd6 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -12,11 +12,13 @@ #include #include #include +#include #include "opal/align.h" #include "opal/util/output.h" #include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/installdirs/installdirs.h" static bool initialized = false; int opal_cuda_verbose = 0; @@ -26,6 +28,24 @@ static void 
opal_cuda_support_init(void); static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL; static opal_common_cuda_function_table_t ftable; +/* folowing variables are used for cuda ddt kernel support */ +static opal_datatype_cuda_kernel_function_table_t cuda_kernel_table; +static void *opal_datatype_cuda_kernel_handle = NULL; +static char *opal_datatype_cuda_kernel_lib = NULL; +int32_t opal_datatype_cuda_kernel_support = 0; + +#define OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN(handle, fname) \ + do { \ + char* _error; \ + *(void **)(&(cuda_kernel_table.fname ## _p)) = dlsym((handle), # fname); \ + if(NULL != (_error = dlerror()) ) { \ + opal_output(0, "Finding %s error: %s\n", # fname, _error); \ + cuda_kernel_table.fname ## _p = NULL; \ + return OPAL_ERROR; \ + } \ + } while (0) + + /* This function allows the common cuda code to register an * initialization function that gets called the first time an attempt * is made to send or receive a GPU pointer. 
This allows us to delay @@ -60,6 +80,10 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) { convertor->flags |= CONVERTOR_CUDA; } + + if (OPAL_SUCCESS != opal_datatype_cuda_kernel_support_init()) { + opal_datatype_cuda_kernel_support_fini(); + } } /* Checks the type of pointer @@ -189,3 +213,146 @@ void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream convertor->flags |= CONVERTOR_CUDA_ASYNC; convertor->stream = stream; } + +/* following functions are used for cuda ddt kernel support */ +int32_t opal_datatype_cuda_kernel_support_init(void) +{ + if (opal_datatype_cuda_kernel_handle == NULL) { + + /* If the library name was initialized but the load failed, we have another chance to change it */ + if( NULL != opal_datatype_cuda_kernel_lib ) + free(opal_datatype_cuda_kernel_lib); + asprintf(&opal_datatype_cuda_kernel_lib, "%s/%s", opal_install_dirs.libdir, "opal_datatype_cuda.so"); + + opal_datatype_cuda_kernel_handle = dlopen(opal_datatype_cuda_kernel_lib , RTLD_LAZY); + if (!opal_datatype_cuda_kernel_handle) { + opal_output( 0, "Failed to load %s library: error %s\n", opal_datatype_cuda_kernel_lib, dlerror()); + opal_datatype_cuda_kernel_handle = NULL; + return OPAL_ERROR; + } + + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_init ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_fini ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_generic_simple_pack_function_cuda_iov ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_generic_simple_unpack_function_cuda_iov ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_generic_simple_pack_function_cuda_vector ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( 
opal_datatype_cuda_kernel_handle, opal_generic_simple_unpack_function_cuda_vector ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_free_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_malloc_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_d2dcpy_async ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_d2dcpy ); + + if (OPAL_SUCCESS != cuda_kernel_table.opal_datatype_cuda_init_p()) { + return OPAL_ERROR; + } + opal_datatype_cuda_kernel_support = 1; + opal_output( 0, "opal_datatype_cuda_kernel_support_init done\n"); + } + return OPAL_SUCCESS; +} + +int32_t opal_datatype_cuda_kernel_support_fini(void) +{ + if (opal_datatype_cuda_kernel_handle != NULL) { + cuda_kernel_table.opal_datatype_cuda_fini_p(); + /* Reset all functions to NULL */ + cuda_kernel_table.opal_datatype_cuda_init_p = NULL; + cuda_kernel_table.opal_datatype_cuda_fini_p = NULL; + cuda_kernel_table.opal_generic_simple_pack_function_cuda_iov_p = NULL; + cuda_kernel_table.opal_generic_simple_unpack_function_cuda_iov_p = NULL; + cuda_kernel_table.opal_generic_simple_pack_function_cuda_vector_p = NULL; + cuda_kernel_table.opal_generic_simple_unpack_function_cuda_vector_p = NULL; + cuda_kernel_table.opal_cuda_free_gpu_buffer_p = NULL; + cuda_kernel_table.opal_cuda_malloc_gpu_buffer_p = NULL; + cuda_kernel_table.opal_cuda_d2dcpy_async_p = NULL; + cuda_kernel_table.opal_cuda_d2dcpy_p = NULL; + + dlclose(opal_datatype_cuda_kernel_handle); + opal_datatype_cuda_kernel_handle = NULL; + + if( NULL != opal_datatype_cuda_kernel_lib ) + free(opal_datatype_cuda_kernel_lib); + opal_datatype_cuda_kernel_lib = NULL; + opal_datatype_cuda_kernel_support = 0; + opal_output( 0, "opal_datatype_cuda_kernel_support_fini done\n"); + } + return OPAL_SUCCESS; +} + +int32_t 
opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + if (cuda_kernel_table.opal_generic_simple_pack_function_cuda_iov_p != NULL) { + return cuda_kernel_table.opal_generic_simple_pack_function_cuda_iov_p(pConvertor, iov, out_size, max_data); + } else { + opal_output(0, "opal_generic_simple_pack_function_cuda_iov function pointer is NULL\n"); + return -1; + } +} + +int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + if (cuda_kernel_table.opal_generic_simple_unpack_function_cuda_iov_p != NULL) { + return cuda_kernel_table.opal_generic_simple_unpack_function_cuda_iov_p(pConvertor, iov, out_size, max_data); + } else { + opal_output(0, "opal_generic_simple_unpack_function_cuda_iov function pointer is NULL\n"); + return -1; + } +} + +int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + if (cuda_kernel_table.opal_generic_simple_pack_function_cuda_vector_p != NULL) { + return cuda_kernel_table.opal_generic_simple_pack_function_cuda_vector_p(pConvertor, iov, out_size, max_data); + } else { + opal_output(0, "opal_generic_simple_pack_function_cuda_vector function pointer is NULL\n"); + return -1; + } +} + +int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + if (cuda_kernel_table.opal_generic_simple_unpack_function_cuda_vector_p != NULL) { + return cuda_kernel_table.opal_generic_simple_unpack_function_cuda_vector_p(pConvertor, iov, out_size, max_data); + } else { + opal_output(0, "opal_generic_simple_unpack_function_cuda_vector function pointer is NULL\n"); + return -1; + } +} + +void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) +{ + if (cuda_kernel_table.opal_cuda_malloc_gpu_buffer_p != NULL) { + return 
cuda_kernel_table.opal_cuda_malloc_gpu_buffer_p(size, gpu_id); + } else { + opal_output(0, "opal_cuda_malloc_gpu_buffer function pointer is NULL\n"); + return NULL; + } +} + +void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) +{ + if (cuda_kernel_table.opal_cuda_free_gpu_buffer_p != NULL) { + cuda_kernel_table.opal_cuda_free_gpu_buffer_p(addr, gpu_id); + } else { + opal_output(0, "opal_cuda_free_gpu_buffer function pointer is NULL\n"); + } +} + +void opal_cuda_d2dcpy(void* dst, const void* src, size_t count) +{ + if (cuda_kernel_table.opal_cuda_d2dcpy_p != NULL) { + cuda_kernel_table.opal_cuda_d2dcpy_p(dst, src, count); + } else { + opal_output(0, "opal_cuda_d2dcpy function pointer is NULL\n"); + } +} + +void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) +{ + if (cuda_kernel_table.opal_cuda_d2dcpy_async_p != NULL) { + cuda_kernel_table.opal_cuda_d2dcpy_async_p(dst, src, count); + } else { + opal_output(0, "opal_cuda_d2dcpy_async function pointer is NULL\n"); + } +} + diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index 676af80273b..a5a68074034 100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -21,6 +21,21 @@ struct opal_common_cuda_function_table { }; typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t; +struct opal_datatype_cuda_kernel_function_table { + int32_t (*opal_datatype_cuda_init_p)(void); + int32_t (*opal_datatype_cuda_fini_p)(void); + void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); + void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); + void (*opal_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); + void (*opal_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); + int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( 
opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +}; +typedef struct opal_datatype_cuda_kernel_function_table opal_datatype_cuda_kernel_function_table_t; +extern int32_t opal_datatype_cuda_kernel_support; + void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf); bool opal_cuda_check_bufs(char *dest, char *src); void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor); @@ -29,4 +44,15 @@ void* opal_cuda_memmove(void * dest, void * src, size_t size); void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)); void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream); +int32_t opal_datatype_cuda_kernel_support_init(void); +int32_t opal_datatype_cuda_kernel_support_fini(void); +int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); +void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); +void opal_cuda_d2dcpy(void* dst, const void* src, size_t count); +void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count); + #endif diff 
--git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c deleted file mode 100644 index f21b22c72d2..00000000000 --- a/opal/datatype/opal_datatype_gpu.c +++ /dev/null @@ -1,177 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2015 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include -#include -#include -#include - -#include "opal/mca/installdirs/installdirs.h" -#include "opal/datatype/opal_convertor_internal.h" -#include "opal/datatype/opal_datatype_internal.h" - -#if OPAL_ENABLE_DEBUG -#include "opal/util/output.h" - -#define DO_DEBUG(INST) if( opal_pack_debug ) { INST } -#else -#define DO_DEBUG(INST) -#endif /* OPAL_ENABLE_DEBUG */ - -#include "opal/datatype/opal_datatype_gpu.h" - -int32_t opal_datatype_cuda_kernel_support = 0; - -static void *opal_datatype_cuda_handle = NULL; -static char *opal_datatype_cuda_lib = NULL; - -int32_t (*opal_datatype_cuda_init_p)(void) = NULL; - -int32_t (*opal_datatype_cuda_fini_p)(void) = NULL; - - -int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) = NULL; - -int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) = 
NULL; - -int32_t (*opal_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) = NULL; - -int32_t (*opal_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) = NULL; - -void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) = NULL; - -void (*unpack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) = NULL; - -void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) = NULL; - -void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id) = NULL; - -void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id) = NULL; - -void (*opal_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count) = NULL; - -void (*opal_cuda_d2dcpy_p)(void* dst, const void* src, size_t count) = NULL; - -#define OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN(handle, fname) \ - do { \ - char* _error; \ - *(void **)(&(fname ## _p)) = dlsym((handle), # fname); \ - if(NULL != (_error = dlerror()) ) { \ - opal_output(0, "Finding %s error: %s\n", # fname, _error); \ - fname ## _p = NULL; \ - return OPAL_ERROR; \ - } \ - } while (0) - -int32_t opal_datatype_gpu_init(void) -{ - if (opal_datatype_cuda_handle == NULL) { - - /* If the library name was initialized but the load failed, we have another chance to change it */ - if( NULL != opal_datatype_cuda_lib ) - free(opal_datatype_cuda_lib); - asprintf(&opal_datatype_cuda_lib, "%s/%s", opal_install_dirs.libdir, "opal_datatype_cuda.so"); - - opal_datatype_cuda_handle = dlopen(opal_datatype_cuda_lib , RTLD_LAZY); - if (!opal_datatype_cuda_handle) { - opal_output( 0, "Failed to load %s library: error %s\n", 
opal_datatype_cuda_lib, dlerror()); - opal_datatype_cuda_handle = NULL; - return OPAL_ERROR; - } - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_datatype_cuda_init ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_datatype_cuda_fini ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda_iov ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_unpack_function_cuda_iov ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda_vector ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_unpack_function_cuda_vector ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, pack_contiguous_loop_cuda ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, unpack_contiguous_loop_cuda ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, pack_predefined_data_cuda ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_free_gpu_buffer ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_malloc_gpu_buffer ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_d2dcpy_async ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_d2dcpy ); - - if (OPAL_SUCCESS != (*opal_datatype_cuda_init_p)()) { - return OPAL_ERROR; - } - opal_datatype_cuda_kernel_support = 1; - opal_output( 0, "opal_datatype_cuda_kernel_support init done\n"); - } - return OPAL_SUCCESS; -} - -int32_t opal_datatype_gpu_fini(void) -{ - if (opal_datatype_cuda_handle != NULL) { - (*opal_datatype_cuda_fini_p)(); - /* Reset all functions to NULL */ - opal_datatype_cuda_init_p = NULL; - opal_datatype_cuda_fini_p = NULL; - opal_generic_simple_pack_function_cuda_iov_p = NULL; - 
opal_generic_simple_unpack_function_cuda_iov_p = NULL; - opal_generic_simple_pack_function_cuda_vector_p = NULL; - opal_generic_simple_unpack_function_cuda_vector_p = NULL; - pack_contiguous_loop_cuda_p = NULL; - unpack_contiguous_loop_cuda_p = NULL; - pack_predefined_data_cuda_p = NULL; - opal_cuda_free_gpu_buffer_p = NULL; - opal_cuda_malloc_gpu_buffer_p = NULL; - opal_cuda_d2dcpy_async_p = NULL; - opal_cuda_d2dcpy_p = NULL; - - dlclose(opal_datatype_cuda_handle); - opal_datatype_cuda_handle = NULL; - - if( NULL != opal_datatype_cuda_lib ) - free(opal_datatype_cuda_lib); - opal_datatype_cuda_lib = NULL; - opal_datatype_cuda_kernel_support = 0; - opal_output( 0, "opal_datatype_cuda_kernel_support fini done\n"); - } - return OPAL_SUCCESS; -} diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h deleted file mode 100644 index 340fbf24da7..00000000000 --- a/opal/datatype/opal_datatype_gpu.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED -#define OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED - -#define OPAL_DATATYPE_CUDA_KERNEL 1 - -extern int32_t opal_datatype_cuda_kernel_support; - -int32_t opal_datatype_gpu_init(void); -int32_t opal_datatype_gpu_fini(void); - -extern int32_t (*opal_datatype_cuda_init_p)(void); - -extern int32_t (*opal_datatype_cuda_fini_p)(void); - -extern int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - -extern int32_t (*opal_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - -extern int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - -extern int32_t (*opal_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - 
-extern void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ); - -extern void (*unpack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ); - -extern void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ); - -extern void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); - -extern void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); - -extern void (*opal_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); - -extern void (*opal_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); -#endif /* OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 09940374ab3..92a3fe40174 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -33,7 +33,9 @@ #include "opal/datatype/opal_datatype.h" #include "opal/datatype/opal_convertor_internal.h" #include "opal/mca/base/mca_base_var.h" -#include "opal/datatype/opal_datatype_gpu.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ /* by default the debuging is turned off */ int opal_datatype_dfd = -1; @@ -249,9 +251,9 @@ int32_t opal_datatype_finalize( void ) /* clear all master convertors */ opal_convertor_destroy_masters(); -#if OPAL_DATATYPE_CUDA_KERNEL - opal_datatype_gpu_fini(); -#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ +#if OPAL_CUDA_SUPPORT + opal_datatype_cuda_kernel_support_fini(); +#endif /* OPAL_CUDA_SUPPORT */ return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 5a5a2470cb1..0bb29e2f3fc 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c 
@@ -37,7 +37,9 @@ #include "opal/datatype/opal_datatype_checksum.h" #include "opal/datatype/opal_datatype_pack.h" #include "opal/datatype/opal_datatype_prototypes.h" -#include "opal/datatype/opal_datatype_gpu.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ #if defined(CHECKSUM) #define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig_checksum @@ -316,7 +318,6 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ -// (*pack_predefined_data_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, conv_ptr, iov_ptr, iov_len_local ); if( 0 == count_desc ) { /* completed */ @@ -361,7 +362,6 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - //(*pack_contiguous_loop_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); PACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, conv_ptr, iov_ptr, iov_len_local ); if( 0 == count_desc ) { /* completed */ @@ -391,12 +391,6 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; opal_output(0, "total packed %lu\n", pConvertor->bConverted); - // double *vtmp = (double *)iov[0].iov_base; - // for (uint32_t i = 0; i < total_packed/8; i++) { - // printf(" %1.f ", *vtmp); - // vtmp ++; - // } - // printf("\n"); return 1; } /* Save the global position for the next round */ @@ -424,14 +418,9 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, // return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, 
max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { - if (opal_generic_simple_pack_function_cuda_vector_p != NULL) { - return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); - // return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); - } + return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { - if (opal_generic_simple_pack_function_cuda_iov_p != NULL) { - return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); - } + return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); } return 0; } diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index d9d69683174..6a2fbd70a6c 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -27,7 +27,6 @@ #include "opal/datatype/opal_convertor_internal.h" #include "opal/datatype/opal_datatype_internal.h" -#include "opal/datatype/opal_datatype_gpu.h" #if OPAL_ENABLE_DEBUG #include "opal/util/output.h" @@ -40,6 +39,9 @@ #include "opal/datatype/opal_datatype_checksum.h" #include "opal/datatype/opal_datatype_unpack.h" #include "opal/datatype/opal_datatype_prototypes.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ #if defined(CHECKSUM) #define opal_unpack_general_function opal_unpack_general_checksum @@ -385,7 +387,6 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { UNPACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, iov_ptr, conv_ptr, iov_len_local ); - // (*unpack_contiguous_loop_cuda_p)(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -611,14 +612,9 @@ opal_generic_simple_unpack_cuda_function( 
opal_convertor_t* pConvertor, // return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { - if (opal_generic_simple_unpack_function_cuda_vector_p != NULL) { - return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); - // return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); - } + return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { - if (opal_generic_simple_unpack_function_cuda_iov_p != NULL) { - return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); - } + return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); } return 0; } diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 2d015ad11fb..eeafea57fb6 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -55,7 +55,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" -#include "opal/datatype/opal_datatype_gpu.h" +#include "opal/datatype/opal_datatype_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ #include "opal/mca/mpool/base/base.h" #include "opal/mca/mpool/sm/mpool_sm.h" @@ -1187,8 +1187,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, uint32_t iov_count = 1; size_t max_data; if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && remote_device != local_device) { - unpack_convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(size, 0); - (*opal_cuda_d2dcpy_async_p)(unpack_convertor->gpu_buffer_ptr, remote_memory_address, size); + unpack_convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(size, 0); + opal_cuda_d2dcpy_async(unpack_convertor->gpu_buffer_ptr, remote_memory_address, size); iov.iov_base = unpack_convertor->gpu_buffer_ptr; opal_output(0, "start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, 
unpack_convertor->gpu_buffer_ptr, size); } else { @@ -1197,7 +1197,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, iov.iov_len = size; max_data = size; opal_convertor_unpack(unpack_convertor, &iov, &iov_count, &max_data ); - opal_cuda_free_gpu_buffer_p(unpack_convertor->gpu_buffer_ptr, 0); + opal_cuda_free_gpu_buffer(unpack_convertor->gpu_buffer_ptr, 0); done = 1; } } else { @@ -1436,6 +1436,7 @@ int mca_btl_smcuda_alloc_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint endpoint->smcuda_ddt_clone_size += SMCUDA_DT_CLONE_SIZE; return endpoint->smcuda_ddt_clone_size - SMCUDA_DT_CLONE_SIZE; } + return -1; } void mca_btl_smcuda_free_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 9c1f5235d1e..c4a299ef84a 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -54,7 +54,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" -#include "opal/datatype/opal_datatype_gpu.h" +#include "opal/datatype/opal_datatype_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ #if OPAL_ENABLE_FT_CR == 1 #include "opal/runtime/opal_cr.h" @@ -901,9 +901,9 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, } else { /* unpack */ convertor->flags |= CONVERTOR_CUDA; if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { - convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(packed_size, 0); + convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(packed_size, 0); remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; - (*opal_cuda_d2dcpy_async_p)(convertor->gpu_buffer_ptr, remote_address, packed_size); + opal_cuda_d2dcpy_async(convertor->gpu_buffer_ptr, remote_address, packed_size); iov.iov_base = convertor->gpu_buffer_ptr; opal_output(0, "unpack, start D2D copy src %p, 
dst %p, size %lu\n", remote_address, convertor->gpu_buffer_ptr, packed_size); } else { @@ -914,7 +914,7 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { if (convertor->gpu_buffer_ptr != NULL) { - opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); convertor->gpu_buffer_ptr = NULL; } } @@ -960,7 +960,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, send_msg.msg_type = CUDA_DDT_CLEANUP; mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); if (convertor->gpu_buffer_ptr != NULL) { - opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); convertor->gpu_buffer_ptr = NULL; } } else if (msg_type == CUDA_DDT_PACK_TO_BLOCK) { @@ -1022,7 +1022,7 @@ static void btl_smcuda_datatype_put(mca_btl_base_module_t* btl, /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; - opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; mca_mpool_common_cuda_reg_t rget_reg; rget_reg_ptr= &rget_reg; diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 04c333efd1b..d89b11bd647 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -33,7 +33,6 @@ #include "opal/align.h" #include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_datatype_cuda.h" -#include "opal/datatype/opal_datatype_gpu.h" #include "opal/util/output.h" #include "opal/util/show_help.h" #include "opal/util/proc.h" @@ -1650,16 +1649,6 @@ int progress_one_cuda_htod_event(struct 
mca_btl_base_descriptor_t **frag) { return 0; } -int mca_common_cuda_geteventhandle(uint64_t **event, int n, mca_mpool_base_registration_t *newreg) -{ - CUipcEventHandle evtHandle; - mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)newreg; - // mca_common_cuda_construct_event_and_handle(event, (void**)&evtHandle); -// printf("0 %p, 1 %p\n",&cuda_reg->data.pipeline_evtHandle[0], &cuda_reg->data.pipeline_evtHandle[EVTHANDLE_SIZE]); - // memcpy(&cuda_reg->data.pipeline_evtHandle[n*EVTHANDLE_SIZE], &evtHandle, sizeof(evtHandle)); - return OPAL_SUCCESS; -} - int mca_common_cuda_create_event(uint64_t **event) { CUresult result; @@ -1925,7 +1914,7 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t if (0 != mca_common_cuda_stage_three_init()) { opal_cuda_support = 0; } else { - opal_datatype_gpu_init(); + opal_datatype_cuda_kernel_support_init(); } } diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index 9adda6dc82f..e0b511fa48b 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -93,7 +93,6 @@ OPAL_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, OPAL_DECLSPEC int mca_common_cuda_stage_one_init(void); OPAL_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base); OPAL_DECLSPEC void mca_common_cuda_fini(void); -OPAL_DECLSPEC int mca_common_cuda_geteventhandle(uint64_t **event, int n, mca_mpool_base_registration_t *newreg); OPAL_DECLSPEC int mca_common_cuda_create_event(uint64_t **event); OPAL_DECLSPEC int mca_common_cuda_record_event(uint64_t *event); OPAL_DECLSPEC int mca_common_cuda_query_event(uint64_t *event); From 44361c0c9e348feb649c240db4ded61c52dfa118 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 5 Nov 2015 16:25:41 -0500 Subject: [PATCH 044/190] rename some functions --- opal/datatype/cuda/Makefile.in | 4 +- opal/datatype/cuda/opal_datatype_cuda.cu | 14 +-- 
opal/datatype/cuda/opal_datatype_cuda.cuh | 46 ++++---- .../cuda/opal_datatype_cuda_internal.cuh | 8 -- .../cuda/opal_datatype_pack_cuda_kernel.cu | 43 +------ .../cuda/opal_datatype_pack_cuda_wrapper.cu | 34 +++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 30 ++--- opal/datatype/opal_convertor.h | 1 - opal/datatype/opal_datatype_cuda.c | 106 +++++++++--------- opal/datatype/opal_datatype_cuda.h | 24 ++-- opal/datatype/opal_datatype_module.c | 2 +- opal/datatype/opal_datatype_pack.c | 2 +- opal/datatype/opal_datatype_unpack.c | 2 +- opal/mca/common/cuda/common_cuda.c | 2 +- 14 files changed, 134 insertions(+), 184 deletions(-) diff --git a/opal/datatype/cuda/Makefile.in b/opal/datatype/cuda/Makefile.in index ded04f1ed3c..ea0af09c6d0 100644 --- a/opal/datatype/cuda/Makefile.in +++ b/opal/datatype/cuda/Makefile.in @@ -9,8 +9,8 @@ VPATH = @srcdir@ NVCC = nvcc ARCH = @AR@ ARCHFLAGS = cr -STLIB ?= opal_datatype_cuda.a -DYLIB ?= opal_datatype_cuda.so +STLIB ?= opal_datatype_cuda_kernel.a +DYLIB ?= opal_datatype_cuda_kernel.so EXTLIB = -L$(top_builddir)/opal/datatype/.libs -ldatatype -L$(top_builddir)/opal/.libs -lopen-pal -L/usr/local/cuda/lib -lcuda subdir = opal/datatype/cuda diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 3c5208d7122..e07adb33c5e 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -175,7 +175,7 @@ void opal_cuda_output(int output_id, const char *format, ...) 
} } -int32_t opal_datatype_cuda_init(void) +int32_t opal_ddt_cuda_kernel_init(void) { uint32_t i, j; int device; @@ -245,7 +245,7 @@ int32_t opal_datatype_cuda_init(void) return OPAL_SUCCESS; } -int32_t opal_datatype_cuda_fini(void) +int32_t opal_ddt_cuda_kernel_fini(void) { uint32_t i, j; @@ -275,7 +275,7 @@ int32_t opal_datatype_cuda_fini(void) return OPAL_SUCCESS; } -int32_t opal_cuda_is_gpu_buffer(const void *ptr) +int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr) { int res; CUmemorytype memType; @@ -291,7 +291,7 @@ int32_t opal_cuda_is_gpu_buffer(const void *ptr) return (memType == CU_MEMORYTYPE_DEVICE) ? 1 : 0; } -void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) +void* opal_ddt_cuda_malloc_gpu_buffer(size_t size, int gpu_id) { int dev_id; cudaGetDevice(&dev_id); @@ -330,7 +330,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) return NULL; } -void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) +void opal_ddt_cuda_free_gpu_buffer(void *addr, int gpu_id) { ddt_cuda_device_t *device = &cuda_devices[gpu_id]; ddt_cuda_buffer_t *ptr = device->buffer_used.head; @@ -369,12 +369,12 @@ void opal_cuda_check_error(cudaError_t err) } } -void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) +void opal_ddt_cuda_d2dcpy_async(void* dst, const void* src, size_t count) { cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->opal_cuda_stream[0]); } -void opal_cuda_d2dcpy(void* dst, const void* src, size_t count) +void opal_ddt_cuda_d2dcpy(void* dst, const void* src, size_t count) { cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->opal_cuda_stream[0]); cudaStreamSynchronize(current_cuda_device->cuda_streams->opal_cuda_stream[0]); diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 8c228fc3404..53f548c6d34 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ 
b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -4,30 +4,30 @@ extern "C" { -int32_t opal_datatype_cuda_init(void); +int32_t opal_ddt_cuda_kernel_init(void); -int32_t opal_datatype_cuda_fini(void); +int32_t opal_ddt_cuda_kernel_fini(void); -int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); -int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); -int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); -int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -83,15 +83,15 @@ void unpack_predefined_data_cuda( dt_elem_desc_t* ELEM, unsigned char** DESTINATION, size_t* SPACE ); -int32_t opal_cuda_is_gpu_buffer(const void *ptr); +int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr); -void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); +void* opal_ddt_cuda_malloc_gpu_buffer(size_t size, int gpu_id); -void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); +void opal_ddt_cuda_free_gpu_buffer(void *addr, int 
gpu_id); -void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count); +void opal_ddt_cuda_d2dcpy_async(void* dst, const void* src, size_t count); -void opal_cuda_d2dcpy(void* dst, const void* src, size_t count); +void opal_ddt_cuda_d2dcpy(void* dst, const void* src, size_t count); void opal_dump_cuda_list(ddt_cuda_list_t *list); diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 506a5fe22cd..7648eed3b3e 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -120,14 +120,6 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c __global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); -__global__ void opal_empty_kernel(uint32_t copy_loops, - size_t size, - OPAL_PTRDIFF_TYPE extent, - unsigned char* source, - unsigned char* destination); - -__global__ void opal_empty_kernel_noargs(); - void opal_cuda_output(int output_id, const char *format, ...); void opal_cuda_check_error(cudaError_t err); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index dd9af2a5a7e..6b0e18b1078 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -43,33 +43,6 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, } } -// __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_description_dist_t* desc_dist_d, -// dt_elem_desc_t* desc_d, -// uint32_t required_blocks, struct iovec* iov, unsigned char* pBaseBuf) -// { -// uint32_t i; -// dt_elem_desc_t* pElem; -// unsigned char *conv_ptr, *iov_ptr; -// uint32_t local_index, dst_offset, pos_desc, count_desc; -// size_t iov_len_local; -// -// iov_ptr = (unsigned char *) 
iov[0].iov_base; -// iov_len_local = iov[0].iov_len; -// conv_ptr = pBaseBuf; -// for (i = 0; i < desc_dist_d[blockIdx.x].description_used; i++) { -// pos_desc = desc_dist_d[blockIdx.x].description_index[i]; -// local_index = desc_dist_d[blockIdx.x].description_local_index[i]; -// dst_offset = desc_dist_d[blockIdx.x].dst_offset[i]; -// pElem = &(desc_d[pos_desc]); -// count_desc = pElem->elem.count; -// -// // if ( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { -// pack_predefined_data_cuda_kernel_v2(pElem, &count_desc, conv_ptr, iov_ptr, &iov_len_local, local_index, dst_offset); -// // } -// } -// -// } - __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, _copy_count; @@ -113,18 +86,4 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c #endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ } } -} - -__global__ void opal_empty_kernel(uint32_t copy_loops, - size_t size, - OPAL_PTRDIFF_TYPE extent, - unsigned char* source, - unsigned char* destination) -{ - -} - -__global__ void opal_empty_kernel_noargs() -{ - -} +} \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index dccf2803c6a..97481755209 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -8,7 +8,7 @@ #include -int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -59,7 +59,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { - if ((iov[iov_count].iov_base == NULL) || 
opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + if ((iov[iov_count].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { if (iov[iov_count].iov_len == 0) { iov_len_local = DT_CUDA_BUFFER_SIZE; } else { @@ -67,7 +67,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver } if (iov[iov_count].iov_base == NULL) { - iov[iov_count].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + iov[iov_count].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); iov_ptr = (unsigned char *)iov[iov_count].iov_base; pConvertor->gpu_buffer_ptr = iov_ptr; free_required = 1; @@ -86,7 +86,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE){ iov_len_local = iov[iov_count].iov_len; if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); } transfer_required = 0; free_required = 1; @@ -94,7 +94,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver } else { iov_len_local = iov[iov_count].iov_len; if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); } transfer_required = 1; free_required = 1; @@ -198,7 +198,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { printf("free\n"); - opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } return 1; 
@@ -211,7 +211,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver return 0; } -int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -262,7 +262,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { - if ((iov[iov_count].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + if ((iov[iov_count].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { if (iov[iov_count].iov_len == 0) { iov_len_local = DT_CUDA_BUFFER_SIZE; } else { @@ -270,7 +270,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } if (iov[iov_count].iov_base == NULL) { - iov[iov_count].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + iov[iov_count].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); iov_ptr = (unsigned char *)iov[iov_count].iov_base; pConvertor->gpu_buffer_ptr = iov_ptr; free_required = 1; @@ -289,7 +289,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE){ iov_len_local = iov[iov_count].iov_len; if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); } transfer_required = 0; free_required = 1; @@ -297,7 +297,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } else { iov_len_local = iov[iov_count].iov_len; if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + 
pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); } transfer_required = 1; free_required = 1; @@ -398,7 +398,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert pConvertor->flags |= CONVERTOR_COMPLETED; if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { printf("free\n"); - opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } return 1; @@ -642,7 +642,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, #endif } -int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -683,7 +683,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor // assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); - if ((iov[0].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[0].iov_base)) { + if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { if (iov[0].iov_len == 0) { buffer_size = DT_CUDA_BUFFER_SIZE; } else { @@ -691,7 +691,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } if (iov[0].iov_base == NULL) { - iov[0].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(buffer_size, 0); + iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); destination = (unsigned char *)iov[0].iov_base; pConvertor->gpu_buffer_ptr = destination; free_required = 1; @@ -709,7 +709,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); } else { if (pConvertor->gpu_buffer_ptr == NULL) { - 
pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(buffer_size, 0); + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); } transfer_required = 1; free_required = 1; @@ -895,7 +895,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } return 1; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index a8ba035ef78..9d0e02067d1 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -8,7 +8,7 @@ #include -int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { @@ -58,7 +58,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + if (opal_ddt_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { iov_ptr = (unsigned char*)iov[iov_count].iov_base; free_required = 0; } else { @@ -68,7 +68,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon free_required = 0; } else { if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); } iov_ptr = pConvertor->gpu_buffer_ptr; 
cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); @@ -171,7 +171,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon pConvertor->flags |= CONVERTOR_COMPLETED; DT_CUDA_DEBUG( opal_cuda_output( 0, "Total unpacked %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { - opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } return 1; @@ -184,7 +184,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon return 0; } -int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { @@ -234,7 +234,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + if (opal_ddt_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { iov_ptr = (unsigned char*)iov[iov_count].iov_base; free_required = 0; } else { @@ -244,7 +244,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv free_required = 0; } else { if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); } iov_ptr = pConvertor->gpu_buffer_ptr; cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); @@ -340,7 +340,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; if 
(pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { - opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } return 1; @@ -353,10 +353,10 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv return 0; } -int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, dst_offset, residue_desc; @@ -399,7 +399,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - if (opal_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { source = (unsigned char*)iov[0].iov_base; free_required = 0; } else { @@ -409,7 +409,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert free_required = 0; } else { if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); } source = pConvertor->gpu_buffer_ptr; cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); @@ -589,7 +589,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } return 1; diff --git 
a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 6b4746eaa9a..af74ee1221c 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -114,7 +114,6 @@ struct opal_convertor_t { unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ size_t gpu_buffer_size; - uint64_t * pipeline_event[MAX_IPC_EVENT_HANDLE]; /**< cuda event for pipeline */ #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 23cdb47acd6..96c3221b94c 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -81,8 +81,8 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) convertor->flags |= CONVERTOR_CUDA; } - if (OPAL_SUCCESS != opal_datatype_cuda_kernel_support_init()) { - opal_datatype_cuda_kernel_support_fini(); + if (OPAL_SUCCESS != opal_cuda_kernel_support_init()) { + opal_cuda_kernel_support_fini(); } } @@ -215,14 +215,14 @@ void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream } /* following functions are used for cuda ddt kernel support */ -int32_t opal_datatype_cuda_kernel_support_init(void) +int32_t opal_cuda_kernel_support_init(void) { if (opal_datatype_cuda_kernel_handle == NULL) { /* If the library name was initialized but the load failed, we have another chance to change it */ if( NULL != opal_datatype_cuda_kernel_lib ) free(opal_datatype_cuda_kernel_lib); - asprintf(&opal_datatype_cuda_kernel_lib, "%s/%s", opal_install_dirs.libdir, "opal_datatype_cuda.so"); + asprintf(&opal_datatype_cuda_kernel_lib, "%s/%s", opal_install_dirs.libdir, "opal_datatype_cuda_kernel.so"); opal_datatype_cuda_kernel_handle = dlopen(opal_datatype_cuda_kernel_lib , RTLD_LAZY); if (!opal_datatype_cuda_kernel_handle) { @@ -231,41 +231,41 @@ int32_t opal_datatype_cuda_kernel_support_init(void) return OPAL_ERROR; } - 
OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_init ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_fini ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_generic_simple_pack_function_cuda_iov ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_generic_simple_unpack_function_cuda_iov ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_generic_simple_pack_function_cuda_vector ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_generic_simple_unpack_function_cuda_vector ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_free_gpu_buffer ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_malloc_gpu_buffer ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_d2dcpy_async ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_d2dcpy ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_kernel_init ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_kernel_fini ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_generic_simple_pack_function_cuda_iov ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_generic_simple_unpack_function_cuda_iov ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_generic_simple_pack_function_cuda_vector ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, 
opal_ddt_generic_simple_unpack_function_cuda_vector ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_free_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_malloc_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy_async ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy ); - if (OPAL_SUCCESS != cuda_kernel_table.opal_datatype_cuda_init_p()) { + if (OPAL_SUCCESS != cuda_kernel_table.opal_ddt_cuda_kernel_init_p()) { return OPAL_ERROR; } opal_datatype_cuda_kernel_support = 1; - opal_output( 0, "opal_datatype_cuda_kernel_support_init done\n"); + opal_output( 0, "opal_cuda_kernel_support_init done\n"); } return OPAL_SUCCESS; } -int32_t opal_datatype_cuda_kernel_support_fini(void) +int32_t opal_cuda_kernel_support_fini(void) { if (opal_datatype_cuda_kernel_handle != NULL) { - cuda_kernel_table.opal_datatype_cuda_fini_p(); + cuda_kernel_table.opal_ddt_cuda_kernel_fini_p(); /* Reset all functions to NULL */ - cuda_kernel_table.opal_datatype_cuda_init_p = NULL; - cuda_kernel_table.opal_datatype_cuda_fini_p = NULL; - cuda_kernel_table.opal_generic_simple_pack_function_cuda_iov_p = NULL; - cuda_kernel_table.opal_generic_simple_unpack_function_cuda_iov_p = NULL; - cuda_kernel_table.opal_generic_simple_pack_function_cuda_vector_p = NULL; - cuda_kernel_table.opal_generic_simple_unpack_function_cuda_vector_p = NULL; - cuda_kernel_table.opal_cuda_free_gpu_buffer_p = NULL; - cuda_kernel_table.opal_cuda_malloc_gpu_buffer_p = NULL; - cuda_kernel_table.opal_cuda_d2dcpy_async_p = NULL; - cuda_kernel_table.opal_cuda_d2dcpy_p = NULL; + cuda_kernel_table.opal_ddt_cuda_kernel_init_p = NULL; + cuda_kernel_table.opal_ddt_cuda_kernel_fini_p = NULL; + cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_iov_p = NULL; + 
cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_iov_p = NULL; + cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_vector_p = NULL; + cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_vector_p = NULL; + cuda_kernel_table.opal_ddt_cuda_free_gpu_buffer_p = NULL; + cuda_kernel_table.opal_ddt_cuda_malloc_gpu_buffer_p = NULL; + cuda_kernel_table.opal_ddt_cuda_d2dcpy_async_p = NULL; + cuda_kernel_table.opal_ddt_cuda_d2dcpy_p = NULL; dlclose(opal_datatype_cuda_kernel_handle); opal_datatype_cuda_kernel_handle = NULL; @@ -274,85 +274,85 @@ int32_t opal_datatype_cuda_kernel_support_fini(void) free(opal_datatype_cuda_kernel_lib); opal_datatype_cuda_kernel_lib = NULL; opal_datatype_cuda_kernel_support = 0; - opal_output( 0, "opal_datatype_cuda_kernel_support_fini done\n"); + opal_output( 0, "opal_cuda_kernel_support_fini done\n"); } return OPAL_SUCCESS; } int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - if (cuda_kernel_table.opal_generic_simple_pack_function_cuda_iov_p != NULL) { - return cuda_kernel_table.opal_generic_simple_pack_function_cuda_iov_p(pConvertor, iov, out_size, max_data); + if (cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_iov_p != NULL) { + return cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_iov_p(pConvertor, iov, out_size, max_data); } else { - opal_output(0, "opal_generic_simple_pack_function_cuda_iov function pointer is NULL\n"); + opal_output(0, "opal_ddt_generic_simple_pack_function_cuda_iov function pointer is NULL\n"); return -1; } } int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - if (cuda_kernel_table.opal_generic_simple_unpack_function_cuda_iov_p != NULL) { - return cuda_kernel_table.opal_generic_simple_unpack_function_cuda_iov_p(pConvertor, iov, out_size, max_data); + if 
(cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_iov_p != NULL) { + return cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_iov_p(pConvertor, iov, out_size, max_data); } else { - opal_output(0, "opal_generic_simple_unpack_function_cuda_iov function pointer is NULL\n"); + opal_output(0, "opal_ddt_generic_simple_unpack_function_cuda_iov function pointer is NULL\n"); return -1; } } int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - if (cuda_kernel_table.opal_generic_simple_pack_function_cuda_vector_p != NULL) { - return cuda_kernel_table.opal_generic_simple_pack_function_cuda_vector_p(pConvertor, iov, out_size, max_data); + if (cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_vector_p != NULL) { + return cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_vector_p(pConvertor, iov, out_size, max_data); } else { - opal_output(0, "opal_generic_simple_pack_function_cuda_vector function pointer is NULL\n"); + opal_output(0, "opal_ddt_generic_simple_pack_function_cuda_vector function pointer is NULL\n"); return -1; } } int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - if (cuda_kernel_table.opal_generic_simple_unpack_function_cuda_vector_p != NULL) { - return cuda_kernel_table.opal_generic_simple_unpack_function_cuda_vector_p(pConvertor, iov, out_size, max_data); + if (cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_vector_p != NULL) { + return cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_vector_p(pConvertor, iov, out_size, max_data); } else { - opal_output(0, "opal_generic_simple_unpack_function_cuda_vector function pointer is NULL\n"); + opal_output(0, "opal_ddt_generic_simple_unpack_function_cuda_vector function pointer is NULL\n"); return -1; } } void* opal_cuda_malloc_gpu_buffer(size_t size, int 
gpu_id) { - if (cuda_kernel_table.opal_cuda_malloc_gpu_buffer_p != NULL) { - return cuda_kernel_table.opal_cuda_malloc_gpu_buffer_p(size, gpu_id); + if (cuda_kernel_table.opal_ddt_cuda_malloc_gpu_buffer_p != NULL) { + return cuda_kernel_table.opal_ddt_cuda_malloc_gpu_buffer_p(size, gpu_id); } else { - opal_output(0, "opal_cuda_malloc_gpu_buffer function pointer is NULL\n"); + opal_output(0, "opal_ddt_cuda_malloc_gpu_buffer function pointer is NULL\n"); return NULL; } } void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) { - if (cuda_kernel_table.opal_cuda_free_gpu_buffer_p != NULL) { - cuda_kernel_table.opal_cuda_free_gpu_buffer_p(addr, gpu_id); + if (cuda_kernel_table.opal_ddt_cuda_free_gpu_buffer_p != NULL) { + cuda_kernel_table.opal_ddt_cuda_free_gpu_buffer_p(addr, gpu_id); } else { - opal_output(0, "opal_cuda_free_gpu_buffer function pointer is NULL\n"); + opal_output(0, "opal_ddt_cuda_free_gpu_buffer function pointer is NULL\n"); } } void opal_cuda_d2dcpy(void* dst, const void* src, size_t count) { - if (cuda_kernel_table.opal_cuda_d2dcpy_p != NULL) { - cuda_kernel_table.opal_cuda_d2dcpy_p(dst, src, count); + if (cuda_kernel_table.opal_ddt_cuda_d2dcpy_p != NULL) { + cuda_kernel_table.opal_ddt_cuda_d2dcpy_p(dst, src, count); } else { - opal_output(0, "opal_cuda_d2dcpy function pointer is NULL\n"); + opal_output(0, "opal_ddt_cuda_d2dcpy function pointer is NULL\n"); } } void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) { - if (cuda_kernel_table.opal_cuda_d2dcpy_async_p != NULL) { - cuda_kernel_table.opal_cuda_d2dcpy_async_p(dst, src, count); + if (cuda_kernel_table.opal_ddt_cuda_d2dcpy_async_p != NULL) { + cuda_kernel_table.opal_ddt_cuda_d2dcpy_async_p(dst, src, count); } else { - opal_output(0, "opal_cuda_d2dcpy_async function pointer is NULL\n"); + opal_output(0, "opal_ddt_cuda_d2dcpy_async function pointer is NULL\n"); } } diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index a5a68074034..8b6f996e422 
100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -22,16 +22,16 @@ struct opal_common_cuda_function_table { typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t; struct opal_datatype_cuda_kernel_function_table { - int32_t (*opal_datatype_cuda_init_p)(void); - int32_t (*opal_datatype_cuda_fini_p)(void); - void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); - void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); - void (*opal_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); - void (*opal_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); - int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); - int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); - int32_t (*opal_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); - int32_t (*opal_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_ddt_cuda_kernel_init_p)(void); + int32_t (*opal_ddt_cuda_kernel_fini_p)(void); + void (*opal_ddt_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); + void* (*opal_ddt_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); + void (*opal_ddt_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); + void (*opal_ddt_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); + int32_t (*opal_ddt_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_ddt_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t 
(*opal_ddt_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_ddt_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); }; typedef struct opal_datatype_cuda_kernel_function_table opal_datatype_cuda_kernel_function_table_t; extern int32_t opal_datatype_cuda_kernel_support; @@ -44,8 +44,8 @@ void* opal_cuda_memmove(void * dest, void * src, size_t size); void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)); void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream); -int32_t opal_datatype_cuda_kernel_support_init(void); -int32_t opal_datatype_cuda_kernel_support_fini(void); +int32_t opal_cuda_kernel_support_init(void); +int32_t opal_cuda_kernel_support_fini(void); int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 92a3fe40174..77d6bfa62ac 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -252,7 +252,7 @@ int32_t opal_datatype_finalize( void ) opal_convertor_destroy_masters(); #if OPAL_CUDA_SUPPORT - opal_datatype_cuda_kernel_support_fini(); + opal_cuda_kernel_support_fini(); #endif /* OPAL_CUDA_SUPPORT */ return OPAL_SUCCESS; diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 0bb29e2f3fc..0573db427df 100644 --- a/opal/datatype/opal_datatype_pack.c +++ 
b/opal/datatype/opal_datatype_pack.c @@ -416,7 +416,7 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); - // return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); +// return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index 6a2fbd70a6c..5f0ac368f68 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -610,7 +610,7 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); -// return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); +// return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index d89b11bd647..8621b0da59f 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -1914,7 +1914,7 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t if (0 != mca_common_cuda_stage_three_init()) { opal_cuda_support = 0; } else { - opal_datatype_cuda_kernel_support_init(); + opal_cuda_kernel_support_init(); } } From 6b95a38ffb8032b01721e57f84b46f730e90ab6f Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 6 Nov 2015 18:43:31 -0500 Subject: [PATCH 045/190] check point --- opal/datatype/cuda/opal_datatype_cuda.cu | 29 ++++++++ opal/datatype/cuda/opal_datatype_cuda.cuh | 6 ++ 
.../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 70 +++++++++++++++---- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 39 ++++++++--- opal/datatype/opal_convertor.c | 4 +- opal/datatype/opal_convertor.h | 1 + opal/datatype/opal_datatype.h | 6 +- opal/datatype/opal_datatype_cuda.c | 35 +++++++++- opal/datatype/opal_datatype_cuda.h | 6 +- opal/datatype/opal_datatype_destroy.c | 13 ++++ opal/datatype/opal_datatype_optimize.c | 7 ++ opal/datatype/opal_datatype_pack.c | 2 +- opal/datatype/opal_datatype_unpack.c | 2 +- test/datatype/ddt_benchmark.c | 22 +++--- 15 files changed, 204 insertions(+), 42 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index e07adb33c5e..6a6e06ff28d 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -275,6 +275,35 @@ int32_t opal_ddt_cuda_kernel_fini(void) return OPAL_SUCCESS; } +void* opal_ddt_cuda_iov_dist_init(void) +{ +#if OPAL_DATATYPE_CUDA_IOV_CACHE + ddt_cuda_iov_dist_t *p = NULL; + cudaMalloc((void **)(&p), sizeof(ddt_cuda_iov_dist_t) * NUM_CUDA_IOV_PER_DDT); + if (p != NULL) { + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist for ddt is successed %p.\n", p); ); + return p; + } else { + DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist for ddt is failed.\n"); ); + return NULL; + } +#else + DT_CUDA_DEBUG( opal_cuda_output( 2, "cuda iov cache is not enabled.\n"); ); + return (void *)0xDEADBEEF; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ +} + +void opal_ddt_cuda_iov_dist_fini(void* cuda_iov_dist) +{ +#if OPAL_DATATYPE_CUDA_IOV_CACHE + ddt_cuda_iov_dist_t *p = (ddt_cuda_iov_dist_t *) cuda_iov_dist; + if (p != NULL) { + cudaFree(p); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Free cuda_iov_dist for ddt is successed %p.\n", p); ); + } +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ +} + int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr) { int res; diff --git 
a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 53f548c6d34..ea3631af67f 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -95,6 +95,12 @@ void opal_ddt_cuda_d2dcpy(void* dst, const void* src, size_t count); void opal_dump_cuda_list(ddt_cuda_list_t *list); +void* opal_ddt_cuda_iov_dist_init(void); + +void opal_ddt_cuda_iov_dist_fini(void *cuda_iov_dist); + +void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 7648eed3b3e..ca630fc1b93 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -18,6 +18,7 @@ #define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 #define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 #define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL 1 +#define OPAL_DATATYPE_CUDA_IOV_CACHE 1 @@ -36,7 +37,8 @@ #define CUDA_IOV_MAX_TASK_PER_BLOCK 400 #define ALIGNMENT_DOUBLE 8 #define ALIGNMENT_FLOAT 4 -#define ALIGNMENT_CHAR 1 +#define ALIGNMENT_CHAR 18 +#define NUM_CUDA_IOV_PER_DDT 100000 #define TIMER_DATA_TYPE struct timeval #define GET_TIME(TV) gettimeofday( &(TV), NULL ) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 97481755209..b82888a3f96 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -8,7 +8,7 @@ #include -int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -28,13 +28,15 @@ int32_t 
opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo uint32_t count_desc_tmp; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + int contiguous_loop_flag = 0; + int i; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", (void*)pConvertor, (void*)pConvertor->pBaseBuf, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ); @@ -52,7 +54,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo pConvertor->stack_pos--; pElem = &(description[pos_desc]); - DT_CUDA_DEBUG( opal_cuda_output( 1, "pack start pos_desc %d count_desc %d disp %ld\n" + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack start pos_desc %d count_desc %d disp %ld\n" "stack_pos %d pos_desc %d count_desc %d disp %ld\n", pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); @@ -112,10 +114,17 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } + if (contiguous_loop_flag) { + pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; + } goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - DT_CUDA_DEBUG( opal_cuda_output( 2, "pack end_loop count %d stack_pos %d" + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack end_loop count %d stack_pos %d" " pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); @@ -141,7 +150,7 @@ int32_t 
opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo } conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - DT_CUDA_DEBUG( opal_cuda_output( 2, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); } @@ -160,6 +169,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; + } else { + contiguous_loop_flag = 1; } /* Save the stack with the correct last_count value. */ } @@ -168,7 +179,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo pStack->disp + local_disp); pos_desc++; update_loop_description: /* update the current state */ - conv_ptr = pConvertor->pBaseBuf + pStack->disp; + if (contiguous_loop_flag) { + count_desc_tmp = count_desc; + } else { + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + } UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } @@ -177,6 +192,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_packed += iov[iov_count].iov_len; // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -186,16 +204,15 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( 
"[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); ); #endif } - cudaDeviceSynchronize(); *max_data = total_packed; pConvertor->bConverted += total_packed; /* update the already converted bytes */ *out_size = iov_count; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); + DT_CUDA_DEBUG( opal_cuda_output( 0, "Pack total packed %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { printf("free\n"); opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); @@ -206,12 +223,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo /* Save the global position for the next round */ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, conv_ptr - pConvertor->pBaseBuf ); - DT_CUDA_DEBUG( opal_cuda_output( 2, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } -int32_t opal_ddt_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -369,7 +386,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector(opal_convertor_t* pCon pStack->disp + local_disp); pos_desc++; update_loop_description: /* update the current state */ - // conv_ptr = pConvertor->pBaseBuf + pStack->disp; + // conv_ptr = pConvertor->pBaseBuf + pStack->disp; count_desc_tmp = count_desc; 
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; @@ -674,6 +691,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve long total_time, move_time; #endif +#if OPAL_DATATYPE_CUDA_IOV_CACHE + opal_datatype_t *pDesc = (opal_datatype_t *)pConvertor->pDesc; + ddt_cuda_iov_dist_t *cuda_iov_dist_cache = (ddt_cuda_iov_dist_t *)pDesc->cuda_iov_dist; + cuda_iov_dist_cache += pDesc->cuda_iov_count; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + /*description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); @@ -717,6 +740,13 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve } } +#if OPAL_DATATYPE_CUDA_IOV_CACHE + /* cuda iov is cached */ + if (pDesc->cuda_iov_is_cached == 2) { + pack_iov_cached(pConvertor, destination); + } +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); cuda_iov_count = 1000;//CUDA_NB_IOV; @@ -835,6 +865,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve #endif cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); +#if OPAL_DATATYPE_CUDA_IOV_CACHE + cudaMemcpyAsync(cuda_iov_dist_cache, cuda_iov_dist_d_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyDeviceToDevice, *cuda_stream_iov); + pDesc->cuda_iov_count += nb_blocks_used; + cuda_iov_dist_cache += nb_blocks_used; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ opal_generic_simple_pack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); @@ -898,11 +933,22 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( 
opal_convertor_t* pConve opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } +#if OPAL_DATATYPE_CUDA_IOV_CACHE + pDesc->cuda_iov_is_cached = 2; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ return 1; } return 0; } +#if OPAL_DATATYPE_CUDA_IOV_CACHE +void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination) +{ + const opal_datatype_t *datatype = pConvertor->pDesc; + DT_CUDA_DEBUG ( opal_cuda_output(2, "cuda iov cached %p, count %ld\n", datatype->cuda_iov_dist, datatype->cuda_iov_count ); ); +} +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 9d0e02067d1..f483d230934 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -8,7 +8,7 @@ #include -int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { @@ -26,13 +26,15 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* uint32_t count_desc_tmp; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + int contiguous_loop_flag = 0; + int i; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end; long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ) description = pConvertor->use_desc->desc; @@ -49,7 +51,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( 
opal_convertor_t* pConvertor->stack_pos--; pElem = &(description[pos_desc]); - DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack start pos_desc %d count_desc %d disp %ld\n" + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack start pos_desc %d count_desc %d disp %ld\n" "stack_pos %d pos_desc %d count_desc %d disp %ld\n", pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); @@ -78,7 +80,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); ); #endif iov_len_local = iov[iov_count].iov_len; cudaDeviceSynchronize(); @@ -96,6 +98,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } + if (contiguous_loop_flag) { + pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; + } assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); if( 0 != iov_len_local ) { assert(0); @@ -103,7 +112,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); if( --(pStack->count) == 0 ) { /* end of loop */ @@ -128,7 
+137,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* } conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); } @@ -145,6 +154,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; + } else { + contiguous_loop_flag = 1; } /* Save the stack with the correct last_count value. */ } @@ -153,7 +164,11 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pStack->disp + local_disp); pos_desc++; update_loop_description: /* update the current state */ - conv_ptr = pConvertor->pBaseBuf + pStack->disp; + if (contiguous_loop_flag) { + count_desc_tmp = count_desc; + } else { + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + } UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } @@ -163,13 +178,15 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* total_unpacked += iov[iov_count].iov_len; } complete_conversion: - cudaDeviceSynchronize(); + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Total unpacked %lu\n", pConvertor->bConverted); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack total unpacked %lu\n", 
pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; @@ -179,12 +196,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* /* Save the global position for the next round */ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, conv_ptr - pConvertor->pBaseBuf ); - DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } -int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 34e25bc0b17..93c885b0447 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -557,7 +557,7 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, convertor->flags |= CONVERTOR_RECV; #if OPAL_CUDA_SUPPORT - mca_cuda_convertor_init(convertor, pUserBuf); + mca_cuda_convertor_init(convertor, pUserBuf, datatype); #endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -606,7 +606,7 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, { convertor->flags |= CONVERTOR_SEND; #if OPAL_CUDA_SUPPORT - mca_cuda_convertor_init(convertor, pUserBuf); + mca_cuda_convertor_init(convertor, pUserBuf, datatype); #endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 
af74ee1221c..822a91e85e0 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -114,6 +114,7 @@ struct opal_convertor_t { unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ size_t gpu_buffer_size; + size_t current_cuda_iov_count; #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index 5a61aa6fae6..ae37734d208 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -131,7 +131,11 @@ struct opal_datatype_t { int iov_count; size_t max_data; /* size: 416, cachelines: 7, members: 18 */ - +#if OPAL_CUDA_SUPPORT + void * cuda_iov_dist; + size_t cuda_iov_count; + int8_t cuda_iov_is_cached; +#endif /* OPAL_CUDA_SUPPORT */ /* last cacheline: 32 bytes */ }; diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 96c3221b94c..729e460de1a 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -61,7 +61,7 @@ void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function * is enabled or not. If CUDA is not enabled, then short circuit out * for all future calls. 
*/ -void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) +void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, const struct opal_datatype_t* datatype) { /* Only do the initialization on the first GPU access */ if (!initialized) { @@ -84,6 +84,18 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) if (OPAL_SUCCESS != opal_cuda_kernel_support_init()) { opal_cuda_kernel_support_fini(); } + if (opal_datatype_cuda_kernel_support == 1 && datatype->cuda_iov_is_cached == 0) { + struct opal_datatype_t* datatype_tmp = (opal_datatype_t *)datatype; + datatype_tmp->cuda_iov_dist = opal_cuda_iov_dist_init(); + if (datatype_tmp->cuda_iov_dist == (void*)0xDEADBEEF || datatype_tmp->cuda_iov_dist == NULL) { + /* either cuda iov cache is not enabled or cuda_iov_cache malloc is failed, then we do not cache cuda iov */ + datatype_tmp->cuda_iov_is_cached = -1; + } else { + /* cuda iov buffer is ready , the value will be marked to 2 when caching is finished*/ + datatype_tmp->cuda_iov_is_cached = 1; + } + } + } /* Checks the type of pointer @@ -241,6 +253,8 @@ int32_t opal_cuda_kernel_support_init(void) OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_malloc_gpu_buffer ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy_async ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_iov_dist_init ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_iov_dist_fini ); if (OPAL_SUCCESS != cuda_kernel_table.opal_ddt_cuda_kernel_init_p()) { return OPAL_ERROR; @@ -356,3 +370,22 @@ void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) } } +void* opal_cuda_iov_dist_init(void) +{ + if 
(cuda_kernel_table.opal_ddt_cuda_iov_dist_init_p != NULL) { + return cuda_kernel_table.opal_ddt_cuda_iov_dist_init_p(); + } else { + opal_output(0, "opal_ddt_cuda_iov_dist_init function pointer is NULL\n"); + return NULL; + } +} + +void opal_cuda_iov_dist_fini(void *cuda_iov_dist) +{ + if (cuda_kernel_table.opal_ddt_cuda_iov_dist_fini_p != NULL) { + cuda_kernel_table.opal_ddt_cuda_iov_dist_fini_p(cuda_iov_dist); + } else { + opal_output(0, "opal_ddt_cuda_iov_dist_fini function pointer is NULL\n"); + } +} + diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index 8b6f996e422..24e85f649b9 100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -28,6 +28,8 @@ struct opal_datatype_cuda_kernel_function_table { void* (*opal_ddt_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); void (*opal_ddt_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); void (*opal_ddt_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); + void* (*opal_ddt_cuda_iov_dist_init_p)(void); + void (*opal_ddt_cuda_iov_dist_fini_p)(void *cuda_iov_dist); int32_t (*opal_ddt_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); @@ -36,7 +38,7 @@ struct opal_datatype_cuda_kernel_function_table { typedef struct opal_datatype_cuda_kernel_function_table opal_datatype_cuda_kernel_function_table_t; extern int32_t opal_datatype_cuda_kernel_support; -void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf); +void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, const struct opal_datatype_t* datatype); bool 
opal_cuda_check_bufs(char *dest, char *src); void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor); void* opal_cuda_memcpy_sync(void * dest, const void * src, size_t size); @@ -54,5 +56,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); void opal_cuda_d2dcpy(void* dst, const void* src, size_t count); void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count); +void* opal_cuda_iov_dist_init(void); +void opal_cuda_iov_dist_fini(void *cuda_iov_dist); #endif diff --git a/opal/datatype/opal_datatype_destroy.c b/opal/datatype/opal_datatype_destroy.c index d468cd07e8c..8c225e698c0 100644 --- a/opal/datatype/opal_datatype_destroy.c +++ b/opal/datatype/opal_datatype_destroy.c @@ -22,10 +22,23 @@ #include "opal/constants.h" #include "opal/datatype/opal_datatype.h" #include "opal/datatype/opal_datatype_internal.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_convertor.h" +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ int32_t opal_datatype_destroy( opal_datatype_t** dt ) { opal_datatype_t* pData = *dt; + +#if OPAL_CUDA_SUPPORT + /* free cuda iov */ + if (opal_datatype_cuda_kernel_support== 1 && pData->cuda_iov_dist != NULL && pData->cuda_iov_dist != (void*)0xDEADBEEF) { + opal_cuda_iov_dist_fini(pData->cuda_iov_dist); + pData->cuda_iov_dist = NULL; + pData->cuda_iov_count = 0; + } +#endif /* OPAL_CUDA_SUPPORT */ if( (pData->flags & OPAL_DATATYPE_FLAG_PREDEFINED) && (pData->super.obj_reference_count <= 1) ) diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index 5ccea9ba1d3..b492aa9381b 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -305,6 +305,13 @@ int32_t opal_datatype_commit( opal_datatype_t * pData ) pLast->size = pData->size; } +#if OPAL_CUDA_SUPPORT + /* cuda iov for caching, it will be malloced latter when init 
convertor */ + pData->cuda_iov_dist = NULL; + pData->cuda_iov_is_cached = 0; + pData->cuda_iov_count = 0; +#endif /* OPAL_CUDA_SUPPORT */ + /* save a compressed datatype description as a iovec list */ // opal_convertor_t* conv = opal_convertor_create( opal_local_arch, 0 /* unused */); // opal_convertor_prepare_for_send( conv, pData, 1, (void*)0 ); diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 0573db427df..9812a371a85 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -416,7 +416,7 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); -// return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); +// return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index 5f0ac368f68..f5e1e76588f 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -610,7 +610,7 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); -// return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); +// return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 45440dc2c04..1bb91f663c8 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1178,7 +1178,7 @@ int main( int argc, char* 
argv[] ) #endif opal_init_util(&argc, &argv); #if defined (DDT_TEST_CUDA) - // mca_common_cuda_stage_one_init(); + mca_common_cuda_stage_one_init(); #endif ompi_datatype_init(); @@ -1216,18 +1216,18 @@ int main( int argc, char* argv[] ) printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 2; i++) { - local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); + // local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); } ompi_datatype_t *column, *matt; - mat_size = 4000; -// ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); -// ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); -// ompi_datatype_commit( &matt ); -// local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); + mat_size = 1000; + ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); + ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); + ompi_datatype_commit( &matt ); + local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); int packed_size = 256; @@ -1279,13 +1279,13 @@ int main( int argc, char* argv[] ) } - for (blk_len = 64; blk_len <= 64; blk_len += 2) { + for (blk_len = 1000; blk_len <= 1000; blk_len += 2) { printf( ">>--------------------------------------------<<\n" ); printf( "Vector data-type (1024 times %d double stride 512)\n", blk_len ); - pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len+128); + pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 0; i < 4; i++) { - // vector_ddt( pdt, 1, pdt, 1, 1024*10240 , 1000, blk_len, blk_len+128); + for (i = 0; i < 1; i++) { + vector_ddt( pdt, 1, pdt, 1, 2000000 , 1000, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } From 101387c9febde809260713689ee2b1b25839a824 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 6 
Nov 2015 20:40:13 -0500 Subject: [PATCH 046/190] Add support for caching the unpacked datatype description via the opal_convertor_raw_cached function. --- opal/datatype/opal_convertor.h | 13 ++++++++++++- opal/datatype/opal_convertor_raw.c | 27 ++++++++++++++++++++++++++- opal/datatype/opal_datatype.h | 5 ++++- opal/datatype/opal_datatype_create.c | 10 +++++++++- 4 files changed, 51 insertions(+), 4 deletions(-) diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 822a91e85e0..fb8b4d630a4 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -289,6 +289,17 @@ opal_convertor_to_iov(struct opal_convertor_t *convertor, struct iovec **iov, uint32_t *iov_count, size_t *max_data); + +/** + * A straighforward description of the datatype in terms of a NULL + * based iovec (so basically displacements from the begining of a pointer, + * will be generated and stored in the datatype itself. This description + * can be used to pack/unpack the data manually. + */ +OPAL_DECLSPEC int +opal_convertor_raw_cached(struct opal_convertor_t *convertor, + const struct iovec **iov, + uint32_t* iov_count); /* * Upper level does not need to call the _nocheck function directly. 
*/ diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index 441ee9ee0fc..bf46a7a9d5a 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. @@ -240,3 +240,28 @@ opal_convertor_to_iov(struct opal_convertor_t *convertor, iovec = &((*iov)[*iov_count]); } } + +int opal_convertor_raw_cached(struct opal_convertor_t *convertor, + const struct iovec **iov, + uint32_t* iov_count) +{ + if( NULL == convertor->pDesc->cached_iovec ) { + struct opal_convertor_t conv; + size_t max_data; + + OBJ_CONSTRUCT(&conv, opal_convertor_t); + conv.remoteArch = convertor->remoteArch; + conv.stack_pos = 0; + conv.flags = convertor->flags; + conv.master = convertor->master; + opal_convertor_prepare_for_send(&conv, convertor->pDesc, 1, NULL); + opal_convertor_get_packed_size(&conv, &max_data); + opal_convertor_to_iov(&conv, (struct iovec **)&convertor->pDesc->cached_iovec, + (uint32_t *)&convertor->pDesc->cached_iovec_count, &max_data); + OBJ_DESTRUCT(&conv); + } + *iov = convertor->pDesc->cached_iovec; + *iov_count = convertor->pDesc->cached_iovec_count; + + return OPAL_SUCCESS; +} diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index ae37734d208..efbf357c7fd 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. 
- * Copyright (c) 2004-2010 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -137,6 +137,9 @@ struct opal_datatype_t { int8_t cuda_iov_is_cached; #endif /* OPAL_CUDA_SUPPORT */ /* last cacheline: 32 bytes */ + + struct iovec* cached_iovec; + uint32_t cached_iovec_count; }; typedef struct opal_datatype_t opal_datatype_t; diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index e64e1f04190..b97a84f5174 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -53,6 +53,9 @@ static void opal_datatype_construct( opal_datatype_t* pData ) pData->opt_desc.length = 0; pData->opt_desc.used = 0; + pData->cached_iovec = NULL; + pData->cached_iovec_count = 0; + for( i = 0; i < OPAL_DATATYPE_MAX_SUPPORTED; i++ ) pData->btypes[i] = 0; } @@ -82,6 +85,11 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) /* make sure the name is set to empty */ datatype->name[0] = '\0'; + + if( NULL != datatype->cached_iovec ) { + free(datatype->cached_iovec); + datatype->cached_iovec = NULL; + } } OBJ_CLASS_INSTANCE(opal_datatype_t, opal_object_t, opal_datatype_construct, opal_datatype_destruct); From 0379c0b3ef875a0ec644d0c3b3e1e054a86a94a0 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 6 Nov 2015 23:23:33 -0500 Subject: [PATCH 047/190] check point use raw_cached, but cuda iov caching is not enabled --- .../cuda/opal_datatype_cuda_internal.cuh | 8 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 293 +++++++++++++++++- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 254 ++++++++++++++- opal/datatype/opal_convertor.h | 2 + opal/datatype/opal_datatype_cuda.c | 3 + test/datatype/ddt_benchmark.c | 16 +- 6 files changed, 562 insertions(+), 14 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index ca630fc1b93..eff247a15d2 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -18,7 +18,7 @@ #define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 #define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 #define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL 1 -#define OPAL_DATATYPE_CUDA_IOV_CACHE 1 +#define OPAL_DATATYPE_CUDA_IOV_CACHE 0 @@ -37,7 +37,7 @@ #define CUDA_IOV_MAX_TASK_PER_BLOCK 400 #define ALIGNMENT_DOUBLE 8 #define ALIGNMENT_FLOAT 4 -#define ALIGNMENT_CHAR 18 +#define ALIGNMENT_CHAR 1 #define NUM_CUDA_IOV_PER_DDT 100000 #define 
TIMER_DATA_TYPE struct timeval @@ -139,6 +139,10 @@ int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor, size_t int32_t opal_convertor_raw( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* iov_count, size_t* length ); + +int opal_convertor_raw_cached(struct opal_convertor_t *convertor, + const struct iovec **iov, + uint32_t* iov_count); } #endif /* OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index b82888a3f96..b2366b211f4 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -659,7 +659,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, #endif } -int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -776,6 +776,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve dst_offset = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; + source_base = (unsigned char*)pConvertor->pBaseBuf; while (cuda_iov_count > 0) { @@ -786,7 +787,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); - source_base = (unsigned char*)cuda_iov[0].iov_base; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -941,6 +941,295 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve return 0; } +int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i, j; + uint32_t count_desc, 
nb_blocks_per_description, residue_desc; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + size_t length, buffer_size, length_per_iovec; + unsigned char *destination, *destination_base, *source_base, *source; + size_t total_packed, total_converted; + int32_t complete_flag = 0; + uint8_t buffer_isfull = 0, transfer_required, free_required; + uint32_t convertor_flags; +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// dt_stack_t* pStack; + uint8_t alignment, orig_alignment; +// int32_t orig_stack_index; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count; + size_t iov_len; + int iov_start_pos, iov_end_pos; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + +#if OPAL_DATATYPE_CUDA_IOV_CACHE + opal_datatype_t *pDesc = (opal_datatype_t *)pConvertor->pDesc; + ddt_cuda_iov_dist_t *cuda_iov_dist_cache = (ddt_cuda_iov_dist_t *)pDesc->cuda_iov_dist; + cuda_iov_dist_cache += pDesc->cuda_iov_count; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + + /*description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pElem = &(description[pStack->index]); + printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); + */ + +// assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); + + // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (iov[0].iov_len == 0) { + buffer_size = DT_CUDA_BUFFER_SIZE; + } else { + buffer_size = 
iov[0].iov_len; + } + + if (iov[0].iov_base == NULL) { + iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char *)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; + free_required = 1; + } else { + destination = (unsigned char *)iov[0].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + } + transfer_required = 1; + free_required = 1; + destination = pConvertor->gpu_buffer_ptr; + } + } + +#if OPAL_DATATYPE_CUDA_IOV_CACHE + /* cuda iov is cached */ + if (pDesc->cuda_iov_is_cached == 2) { + pack_iov_cached(pConvertor, destination); + } +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + + cuda_iov_count = 4000;//CUDA_NB_IOV; + total_packed = 0; + total_converted = pConvertor->bConverted; + cuda_streams->current_stream_id = 0; + // orig_stack_index = pStack->index; + destination_base = destination; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + assert(ddt_iov != NULL); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); +#endif + 
+ thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + + iov_start_pos = pConvertor->current_iov_pos; + iov_end_pos = iov_start_pos + 1000; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + source_base = (unsigned char*)pConvertor->pBaseBuf; + + while (iov_start_pos < iov_end_pos && !buffer_isfull) { + + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + for (i = iov_start_pos; i < iov_end_pos; i++) { + if (pConvertor->current_iov_partial_length > 0) { + iov_len = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + iov_len = ddt_iov[i].iov_len; + } + if (buffer_size >= iov_len) { + length_per_iovec = iov_len; + } else { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + pConvertor->current_iov_partial_length = iov_len - length_per_iovec; + pConvertor->current_iov_pos = i; + } + buffer_size -= length_per_iovec; + total_packed += length_per_iovec; + source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + source_base; + + /* check alignment */ + if ((uintptr_t)(source) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)destination % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(source) % ALIGNMENT_FLOAT == 0 && (uintptr_t)destination % ALIGNMENT_FLOAT == 0 && length_per_iovec >= ALIGNMENT_FLOAT) { + 
alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + j * thread_per_block * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = count_desc - j*thread_per_block; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + length_per_iovec / alignment * alignment - source_base; + 
cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + if (buffer_isfull) { + break; + } + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); +#if OPAL_DATATYPE_CUDA_IOV_CACHE + cudaMemcpyAsync(cuda_iov_dist_cache, cuda_iov_dist_d_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyDeviceToDevice, *cuda_stream_iov); + pDesc->cuda_iov_count += nb_blocks_used; + cuda_iov_dist_cache += nb_blocks_used; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + 
opal_generic_simple_pack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + +// orig_stack_index = pStack->index; + iov_start_pos = iov_end_pos; + iov_end_pos = iov_start_pos + 1000; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); + } + + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); +#endif + + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + pConvertor->bConverted += total_packed; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } +#if 
OPAL_DATATYPE_CUDA_IOV_CACHE + pDesc->cuda_iov_is_cached = 2; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + return 1; + } + return 0; +} + + #if OPAL_DATATYPE_CUDA_IOV_CACHE void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index f483d230934..4b4438fa8e4 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -370,7 +370,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* return 0; } -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -469,6 +469,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon dst_offset = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; + destination_base = (unsigned char*)pConvertor->pBaseBuf; while (cuda_iov_count > 0) { @@ -479,7 +480,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); - destination_base = (unsigned char*)cuda_iov[0].iov_base; + #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -614,6 +615,255 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon return 0; } +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + size_t length, buffer_size, 
length_per_iovec; + unsigned char *source, *source_base, *destination_base, *destination; + size_t total_unpacked, total_converted; + int32_t complete_flag = 0; + uint8_t buffer_isfull = 0; + uint8_t free_required = 0; + uint32_t convertor_flags; +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// dt_stack_t* pStack; + uint8_t alignment, orig_alignment; +// int32_t orig_stack_index; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count; + size_t iov_len; + int iov_start_pos, iov_end_pos; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +/* description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pElem = &(description[pStack->index]); + printf("size elem %d, size %lu\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); +*/ + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + source = (unsigned char*)iov[0].iov_base; + free_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + } + source = pConvertor->gpu_buffer_ptr; + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } + + 
DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, iov[0].iov_len); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); +#endif + +// cuda_err = cudaEventRecord(current_cuda_device->memcpy_event, current_cuda_device->cuda_streams->opal_cuda_stream[0]); +// opal_cuda_check_error(cuda_err); + + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + buffer_size = iov[0].iov_len; + cuda_iov_count = 1000; + total_unpacked = 0; + total_converted = pConvertor->bConverted; + cuda_streams->current_stream_id = 0; + convertor_flags = pConvertor->flags; +// orig_stack_index = pStack->index; + source_base = source; + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + assert(ddt_iov != NULL); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); +#endif + + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + + iov_start_pos = pConvertor->current_iov_pos; + iov_end_pos = iov_start_pos + 1000; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + destination_base = (unsigned char*)pConvertor->pBaseBuf; + + while (iov_start_pos < iov_end_pos && !buffer_isfull) { + + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_stream_iov = 
cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + for (i = iov_start_pos; i < iov_end_pos; i++) { + if (pConvertor->current_iov_partial_length > 0) { + iov_len = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + iov_len = ddt_iov[i].iov_len; + } + if (buffer_size >= iov_len) { + length_per_iovec = iov_len; + } else { + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + pConvertor->current_iov_partial_length = iov_len - length_per_iovec; + pConvertor->current_iov_pos = i; + } + buffer_size -= length_per_iovec; + total_unpacked += length_per_iovec; + destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; + + /* check alignment */ + if ((uintptr_t)(destination) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)source % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(destination) % ALIGNMENT_FLOAT == 0 && (uintptr_t)source % ALIGNMENT_FLOAT == 0 && length_per_iovec >= ALIGNMENT_FLOAT) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + + //alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + j * thread_per_block * alignment - 
destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block;// * sizeof(double); + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; + } + + /* handle residue */ + if (residue_desc != 0) { + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, 
cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; + } + + if (buffer_isfull) { + break; + } + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + + iov_start_pos = iov_end_pos; + iov_end_pos = iov_start_pos + 1000; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); + + } + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + iov[0].iov_len = total_unpacked; + *max_data = total_unpacked; + *out_size = 1; + pConvertor->bConverted += total_unpacked; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", 
total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; +} + void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index fb8b4d630a4..1ab600cc49b 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -115,6 +115,8 @@ struct opal_convertor_t { unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ size_t gpu_buffer_size; size_t current_cuda_iov_count; + size_t current_iov_pos; + size_t current_iov_partial_length; #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 729e460de1a..3b3fc556ef9 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -95,6 +95,9 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, datatype_tmp->cuda_iov_is_cached = 1; } } + convertor->current_cuda_iov_count = 0; + convertor->current_iov_pos = 0; + convertor->current_iov_partial_length = 0; } diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 1bb91f663c8..50f62ec5839 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1211,12 +1211,12 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 6000; mat_size <= 6000; mat_size +=500) { + for (mat_size = 2000; mat_size <= 2000; mat_size +=500) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { - for 
(i = 1; i <= 2; i++) { - // local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); + for (i = 1; i <= 1; i++) { + local_copy_with_convertor(pdt, 1, 4000001, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -1224,10 +1224,10 @@ int main( int argc, char* argv[] ) ompi_datatype_t *column, *matt; mat_size = 1000; - ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); - ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); - ompi_datatype_commit( &matt ); - local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); + // ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); + // ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); + // ompi_datatype_commit( &matt ); + // local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); int packed_size = 256; @@ -1285,7 +1285,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 1; i++) { - vector_ddt( pdt, 1, pdt, 1, 2000000 , 1000, blk_len, blk_len*2); + // vector_ddt( pdt, 1, pdt, 1, 2000000 , 1000, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } From 4a39f49d2ff5e20a63ccc0c6ac4873bad0cd8d58 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Sun, 8 Nov 2015 15:23:02 -0500 Subject: [PATCH 048/190] check point, split iov into two version, non-cached and cached --- opal/datatype/cuda/opal_datatype_cuda.cu | 12 +- opal/datatype/cuda/opal_datatype_cuda.cuh | 36 +++-- .../cuda/opal_datatype_cuda_internal.cuh | 23 +++- .../cuda/opal_datatype_pack_cuda_kernel.cu | 70 +++++++++- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 125 ++++++------------ .../cuda/opal_datatype_unpack_cuda_kernel.cu | 43 +++++- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 63 +++++---- opal/datatype/opal_datatype_pack.c | 2 +- opal/datatype/opal_datatype_unpack.c | 2 +- 
test/datatype/ddt_benchmark.c | 2 +- 10 files changed, 238 insertions(+), 140 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 6a6e06ff28d..fbafc2bfbe2 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -220,8 +220,10 @@ int32_t opal_ddt_cuda_kernel_init(void) for (j = 0; j < NB_STREAMS; j++) { cudaStreamCreate(&(cuda_streams->opal_cuda_stream[j])); cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_h)), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_d)), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_d)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); @@ -258,8 +260,10 @@ int32_t opal_ddt_cuda_kernel_fini(void) cudaStreamDestroy(cuda_devices[i].cuda_streams->opal_cuda_stream[j]); cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block[j]; if (cuda_iov_pipeline_block != 
NULL) { - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_h); - cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_d); + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); + cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_cached_d); cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index ea3631af67f..73a740a2822 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -12,22 +12,42 @@ int32_t opal_ddt_cuda_kernel_fini(void); int32_t opal_ddt_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, - size_t* max_data ); + size_t* max_data ); +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); - + int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, - size_t* max_data ); - -int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); + size_t* max_data ); + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t 
opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index eff247a15d2..a91dd8e4f1b 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -50,16 +50,25 @@ typedef struct { uint32_t current_stream_id; } ddt_cuda_stream_t; +typedef struct { + unsigned char* src; + unsigned char* dst; + uint32_t nb_elements; + uint8_t element_alignment; +} ddt_cuda_iov_dist_non_cached_t; + typedef struct { size_t src_offset; size_t dst_offset; uint32_t nb_elements; uint8_t element_alignment; -} ddt_cuda_iov_dist_t; +} ddt_cuda_iov_dist_cached_t; typedef struct { - ddt_cuda_iov_dist_t* cuda_iov_dist_h; - ddt_cuda_iov_dist_t* cuda_iov_dist_d; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_h; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_d; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_d; cudaStream_t *cuda_stream; int32_t cuda_stream_id; cudaEvent_t cuda_event; @@ -118,9 +127,13 @@ __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, unsigned char* destination ); -__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); + +__global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( 
ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); + +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); -__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 6b0e18b1078..ccf7d923af7 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -43,10 +43,10 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, } } -__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +__global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used) { uint32_t i, _copy_count; - size_t src_offset, dst_offset; + unsigned char *src, *dst; uint8_t alignment; unsigned char *_source_tmp, *_destination_tmp; @@ -63,8 +63,8 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; + src = cuda_iov_dist[blockIdx.x + i * gridDim.x].src; + dst = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst; _copy_count = cuda_iov_dist[blockIdx.x + i * 
gridDim.x].nb_elements; alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; @@ -73,8 +73,8 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c // } if (threadIdx.x < _copy_count) { - _source_tmp = source_base + src_offset + threadIdx.x * alignment; - _destination_tmp = destination_base + dst_offset + threadIdx.x * alignment; + _source_tmp = src + threadIdx.x * alignment; + _destination_tmp = dst + threadIdx.x * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); @@ -86,4 +86,62 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c #endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ } } +} + +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +{ + uint32_t i, _copy_count; + size_t src_offset, dst_offset; + uint8_t alignment; + unsigned char *_source_tmp, *_destination_tmp; + + __shared__ uint32_t nb_tasks; + __shared__ uint8_t my_alignment; + + if (threadIdx.x == 0) { + //printf("iov pack kernel \n"); + nb_tasks = nb_blocks_used / gridDim.x; + if (blockIdx.x < (nb_blocks_used % gridDim.x)) { + nb_tasks ++; + } + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); + } + __syncthreads(); + + for (i = 0; i < nb_tasks; i++) { + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; + _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; + alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; + + if (threadIdx.x == 0) { + _source_tmp = source_base + src_offset; + _destination_tmp = destination_base + dst_offset; + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && 
(uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0) { + my_alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0) { + my_alignment = ALIGNMENT_FLOAT; + } else { + my_alignment = ALIGNMENT_CHAR; + } + if (my_alignment != alignment) { + printf("my align %d, align %d\n", my_alignment, alignment); + } + } + __syncthreads(); + + if (threadIdx.x < _copy_count) { + _source_tmp = source_base + src_offset + threadIdx.x * alignment; + _destination_tmp = destination_base + dst_offset + threadIdx.x * alignment; +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + if (my_alignment == ALIGNMENT_DOUBLE) { + *((long *)_destination_tmp) = *((long *)_source_tmp); + } else if (my_alignment == ALIGNMENT_FLOAT) { + *((int *)_destination_tmp) = *((int *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } +#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + } + } } \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index b2366b211f4..b8dda932626 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -659,16 +659,24 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, #endif } -int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + return opal_ddt_generic_simple_pack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); +} + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, 
residue_desc; uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec, dst_offset; - unsigned char *destination, *destination_base, *source_base; + unsigned char *destination, *destination_base; size_t total_packed, total_converted; int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; @@ -680,8 +688,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; @@ -691,12 +699,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv long total_time, move_time; #endif -#if OPAL_DATATYPE_CUDA_IOV_CACHE - opal_datatype_t *pDesc = (opal_datatype_t *)pConvertor->pDesc; - ddt_cuda_iov_dist_t *cuda_iov_dist_cache = (ddt_cuda_iov_dist_t *)pDesc->cuda_iov_dist; - cuda_iov_dist_cache += pDesc->cuda_iov_count; -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ - /*description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); @@ -738,16 +740,10 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv free_required = 1; destination = pConvertor->gpu_buffer_ptr; } - } + } -#if OPAL_DATATYPE_CUDA_IOV_CACHE - /* cuda iov is cached */ - if (pDesc->cuda_iov_is_cached == 2) { - pack_iov_cached(pConvertor, destination); - } -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ - - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + destination_base = 
destination; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV non cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); cuda_iov_count = 1000;//CUDA_NB_IOV; total_packed = 0; @@ -755,7 +751,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; - destination_base = destination; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start_total); @@ -776,14 +771,13 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv dst_offset = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - source_base = (unsigned char*)pConvertor->pBaseBuf; while (cuda_iov_count > 0) { nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); @@ -819,8 +813,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].src_offset = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = 
destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; + cuda_iov_dist_h_current[nb_blocks_used].dst = destination; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block; @@ -831,7 +825,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -840,15 +834,15 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv if (residue_desc != 0) { /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src = (unsigned char 
*)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].dst = destination; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -864,13 +858,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); -#if OPAL_DATATYPE_CUDA_IOV_CACHE - cudaMemcpyAsync(cuda_iov_dist_cache, cuda_iov_dist_d_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), 
cudaMemcpyDeviceToDevice, *cuda_stream_iov); - pDesc->cuda_iov_count += nb_blocks_used; - cuda_iov_dist_cache += nb_blocks_used; -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ - opal_generic_simple_pack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_non_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_pack_cuda_iov_non_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -933,18 +922,15 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } -#if OPAL_DATATYPE_CUDA_IOV_CACHE - pDesc->cuda_iov_is_cached = 2; -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ return 1; } return 0; } -int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, residue_desc; @@ -962,8 +948,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; @@ -977,12 +963,6 @@ 
int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve long total_time, move_time; #endif -#if OPAL_DATATYPE_CUDA_IOV_CACHE - opal_datatype_t *pDesc = (opal_datatype_t *)pConvertor->pDesc; - ddt_cuda_iov_dist_t *cuda_iov_dist_cache = (ddt_cuda_iov_dist_t *)pDesc->cuda_iov_dist; - cuda_iov_dist_cache += pDesc->cuda_iov_count; -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ - /*description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); @@ -1024,16 +1004,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve free_required = 1; destination = pConvertor->gpu_buffer_ptr; } - } - -#if OPAL_DATATYPE_CUDA_IOV_CACHE - /* cuda iov is cached */ - if (pDesc->cuda_iov_is_cached == 2) { - pack_iov_cached(pConvertor, destination); - } -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + } - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); cuda_iov_count = 4000;//CUDA_NB_IOV; total_packed = 0; @@ -1074,8 +1047,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_d; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); @@ -1164,14 +1137,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( 
opal_convertor_t* pConve DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); -#if OPAL_DATATYPE_CUDA_IOV_CACHE - cudaMemcpyAsync(cuda_iov_dist_cache, cuda_iov_dist_d_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyDeviceToDevice, *cuda_stream_iov); - pDesc->cuda_iov_count += nb_blocks_used; - cuda_iov_dist_cache += nb_blocks_used; -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -1221,24 +1189,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } -#if OPAL_DATATYPE_CUDA_IOV_CACHE - pDesc->cuda_iov_is_cached = 2; -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ return 1; } return 0; } - -#if OPAL_DATATYPE_CUDA_IOV_CACHE -void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination) -{ - const opal_datatype_t *datatype = pConvertor->pDesc; - DT_CUDA_DEBUG ( opal_cuda_output(2, 
"cuda iov cached %p, count %ld\n", datatype->cuda_iov_dist, datatype->cuda_iov_count ); ); -} -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ - - void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index a23aff7710c..37527bd2071 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -6,7 +6,47 @@ #include -__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +__global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used) +{ + uint32_t i, _copy_count; + unsigned char *src, *dst; + uint8_t alignment; + unsigned char *_source_tmp, *_destination_tmp; + + __shared__ uint32_t nb_tasks; + + if (threadIdx.x == 0) { + nb_tasks = nb_blocks_used / gridDim.x; + if (blockIdx.x < nb_blocks_used % gridDim.x) { + nb_tasks ++; + } + } + __syncthreads(); + + for (i = 0; i < nb_tasks; i++) { + src = cuda_iov_dist[blockIdx.x + i * gridDim.x].src; + dst = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst; + _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; + alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; + + if (threadIdx.x < _copy_count) { + _source_tmp = src + threadIdx.x * alignment; + _destination_tmp = dst + threadIdx.x * alignment; +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + if (alignment == ALIGNMENT_DOUBLE) { + *((long *)_destination_tmp) = *((long *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((int *)_destination_tmp) = *((int *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } + // printf("src %p, %1.f | dst %p, %1.f\n", _source_tmp, *_source_tmp, _destination_tmp, 
*_destination_tmp); +#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + } + } +} + +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, _copy_count; size_t src_offset, dst_offset; @@ -45,6 +85,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* } } } + __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 4b4438fa8e4..fc7b3d28f6a 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -370,16 +370,24 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* return 0; } -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + return opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); +} + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, dst_offset, residue_desc; uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; - unsigned char *source, *source_base, *destination_base; + unsigned char *source, *source_base; size_t total_unpacked, total_converted; int32_t complete_flag = 0; uint8_t buffer_isfull = 0; @@ -392,8 +400,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo // int32_t orig_stack_index; cudaError_t 
cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; @@ -434,7 +442,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo } } - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV, GPU base %p, unpack from buffer %p, total size %ld\n", + source_base = source; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV non cached, GPU base %p, unpack from buffer %p, total size %ld\n", pConvertor->pBaseBuf, source, iov[0].iov_len); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -456,7 +465,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; - source_base = source; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); @@ -469,14 +477,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo dst_offset = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - destination_base = (unsigned char*)pConvertor->pBaseBuf; while (cuda_iov_count > 0) { nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_iov_dist_h_current = 
cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); @@ -515,8 +522,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; + cuda_iov_dist_h_current[nb_blocks_used].src = source; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block;// * sizeof(double); @@ -527,7 +534,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, 
dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; } @@ -535,15 +542,15 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo if (residue_desc != 0) { /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].src = source; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, 
cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; } @@ -558,8 +565,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_non_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_non_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -615,10 +622,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo return 0; } -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, residue_desc; @@ -637,8 +644,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_dist_cached_t* 
cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; @@ -683,7 +690,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon } } - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV, GPU base %p, unpack from buffer %p, total size %ld\n", + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV cached, GPU base %p, unpack from buffer %p, total size %ld\n", pConvertor->pBaseBuf, source, iov[0].iov_len); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -730,8 +737,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_d; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); @@ -821,8 +828,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + 
cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 9812a371a85..c8985db7913 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -416,7 +416,7 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); -// return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); + return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index f5e1e76588f..5f51b3f828b 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -610,7 +610,7 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); -// return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); + return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 50f62ec5839..c8c3fd7db45 100644 --- a/test/datatype/ddt_benchmark.c +++ 
b/test/datatype/ddt_benchmark.c @@ -1216,7 +1216,7 @@ int main( int argc, char* argv[] ) printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { - local_copy_with_convertor(pdt, 1, 4000001, mat_size); + local_copy_with_convertor(pdt, 1, 40000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From d9927f4b992031af880a2fad77391e79c3fc7962 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Sun, 8 Nov 2015 18:26:45 -0500 Subject: [PATCH 049/190] check point iov cache --- opal/datatype/cuda/opal_datatype_cuda.cu | 10 ++-- .../cuda/opal_datatype_cuda_internal.cuh | 3 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 46 +++++++++-------- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 24 ++++----- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 49 +++++++++++++------ .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 24 +++++---- opal/datatype/opal_datatype_create.c | 13 +++++ opal/datatype/opal_datatype_cuda.c | 7 ++- opal/datatype/opal_datatype_destroy.c | 15 +----- test/datatype/ddt_benchmark.c | 2 +- 10 files changed, 106 insertions(+), 87 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index fbafc2bfbe2..f53e006a4fd 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -282,13 +282,13 @@ int32_t opal_ddt_cuda_kernel_fini(void) void* opal_ddt_cuda_iov_dist_init(void) { #if OPAL_DATATYPE_CUDA_IOV_CACHE - ddt_cuda_iov_dist_t *p = NULL; - cudaMalloc((void **)(&p), sizeof(ddt_cuda_iov_dist_t) * NUM_CUDA_IOV_PER_DDT); + ddt_cuda_iov_dist_cached_t *p = NULL; + cudaMalloc((void **)(&p), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); if (p != NULL) { - DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist for ddt is successed %p.\n", p); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed %p.\n", p); ); return p; } else { - DT_CUDA_DEBUG( 
opal_cuda_output( 0, "Malloc cuda_iov_dist for ddt is failed.\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist_cached for ddt is failed.\n"); ); return NULL; } #else @@ -300,7 +300,7 @@ void* opal_ddt_cuda_iov_dist_init(void) void opal_ddt_cuda_iov_dist_fini(void* cuda_iov_dist) { #if OPAL_DATATYPE_CUDA_IOV_CACHE - ddt_cuda_iov_dist_t *p = (ddt_cuda_iov_dist_t *) cuda_iov_dist; + ddt_cuda_iov_dist_cached_t *p = (ddt_cuda_iov_dist_cached_t *) cuda_iov_dist; if (p != NULL) { cudaFree(p); DT_CUDA_DEBUG( opal_cuda_output( 2, "Free cuda_iov_dist for ddt is successed %p.\n", p); ); diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index a91dd8e4f1b..1fa0e17b4c7 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -60,8 +60,7 @@ typedef struct { typedef struct { size_t src_offset; size_t dst_offset; - uint32_t nb_elements; - uint8_t element_alignment; + uint32_t nb_bytes; } ddt_cuda_iov_dist_cached_t; typedef struct { diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index ccf7d923af7..f4a100b969d 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -90,16 +90,15 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { - uint32_t i, _copy_count; + uint32_t i, j; size_t src_offset, dst_offset; - uint8_t alignment; unsigned char *_source_tmp, *_destination_tmp; __shared__ uint32_t nb_tasks; - __shared__ uint8_t my_alignment; + __shared__ uint32_t copy_count; + __shared__ uint8_t alignment; if (threadIdx.x == 0) { - //printf("iov pack kernel \n"); nb_tasks = 
nb_blocks_used / gridDim.x; if (blockIdx.x < (nb_blocks_used % gridDim.x)) { nb_tasks ++; @@ -111,37 +110,36 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di for (i = 0; i < nb_tasks; i++) { src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; - _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; - alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; if (threadIdx.x == 0) { _source_tmp = source_base + src_offset; _destination_tmp = destination_base + dst_offset; - if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0) { - my_alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0) { - my_alignment = ALIGNMENT_FLOAT; + uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes >= ALIGNMENT_DOUBLE) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes >= ALIGNMENT_FLOAT) { + alignment = ALIGNMENT_FLOAT; } else { - my_alignment = ALIGNMENT_CHAR; - } - if (my_alignment != alignment) { - printf("my align %d, align %d\n", my_alignment, alignment); + alignment = ALIGNMENT_CHAR; } + copy_count = _nb_bytes / alignment; } __syncthreads(); - if (threadIdx.x < _copy_count) { - _source_tmp = source_base + src_offset + threadIdx.x * alignment; - _destination_tmp = destination_base + dst_offset + threadIdx.x * alignment; + for (j = threadIdx.x; j < copy_count; j += blockDim.x) { + if (j < copy_count) { + _source_tmp = source_base + src_offset + j * alignment; + _destination_tmp = destination_base + dst_offset + j * alignment; #if !defined 
(OPAL_DATATYPE_CUDA_DRY_RUN) - if (my_alignment == ALIGNMENT_DOUBLE) { - *((long *)_destination_tmp) = *((long *)_source_tmp); - } else if (my_alignment == ALIGNMENT_FLOAT) { - *((int *)_destination_tmp) = *((int *)_source_tmp); - } else { - * _destination_tmp = *_source_tmp; - } + if (alignment == ALIGNMENT_DOUBLE) { + *((long *)_destination_tmp) = *((long *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((int *)_destination_tmp) = *((int *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } #endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + } } } } \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index b8dda932626..36cdcbaf3cd 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -664,7 +664,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_pack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); + return opal_ddt_generic_simple_pack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); } int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, @@ -1086,6 +1086,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* } else { alignment = ALIGNMENT_CHAR; } + + alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; @@ -1094,17 +1096,16 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + j * thread_per_block * alignment - source_base; cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; - 
cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_elements = count_desc - j*thread_per_block; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_elements %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -1115,13 +1116,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* orig_alignment = ALIGNMENT_CHAR; cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + length_per_iovec / alignment * alignment - source_base; cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; - 
cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 37527bd2071..b1e2831f5c1 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -48,12 +48,13 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { - uint32_t i, _copy_count; + uint32_t i, j; size_t src_offset, dst_offset; - 
uint8_t alignment; unsigned char *_source_tmp, *_destination_tmp; __shared__ uint32_t nb_tasks; + __shared__ uint32_t copy_count; + __shared__ uint8_t alignment; if (threadIdx.x == 0) { nb_tasks = nb_blocks_used / gridDim.x; @@ -66,22 +67,40 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ for (i = 0; i < nb_tasks; i++) { src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; - _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; - alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; - if (threadIdx.x < _copy_count) { - _source_tmp = source_base + src_offset + threadIdx.x * alignment; - _destination_tmp = destination_base + dst_offset + threadIdx.x * alignment; + if (threadIdx.x == 0) { + _source_tmp = source_base + src_offset; + _destination_tmp = destination_base + dst_offset; + uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes >= ALIGNMENT_DOUBLE) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_source_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes >= ALIGNMENT_FLOAT) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + copy_count = _nb_bytes / alignment; + } + __syncthreads(); + + for (j = threadIdx.x; j < copy_count; j += blockDim.x) { +/* if (threadIdx.x == 0) { + if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); + }*/ + if (j < copy_count) { + _source_tmp = source_base + src_offset + j * alignment; + _destination_tmp = destination_base + dst_offset + j * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) - if (alignment == ALIGNMENT_DOUBLE) { - *((long *)_destination_tmp) = *((long *)_source_tmp); - } else if (alignment == ALIGNMENT_FLOAT) { - 
*((int *)_destination_tmp) = *((int *)_source_tmp); - } else { - * _destination_tmp = *_source_tmp; - } - // printf("src %p, %1.f | dst %p, %1.f\n", _source_tmp, *_source_tmp, _destination_tmp, *_destination_tmp); + if (alignment == ALIGNMENT_DOUBLE) { + *((long *)_destination_tmp) = *((long *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((int *)_destination_tmp) = *((int *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } + // printf("src %p, %1.f | dst %p, %1.f\n", _source_tmp, *_source_tmp, _destination_tmp, *_destination_tmp); #endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + } } } } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index fc7b3d28f6a..0bdf66638fc 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -375,7 +375,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); + return opal_ddt_generic_simple_unpack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); } int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, @@ -778,7 +778,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ alignment = ALIGNMENT_CHAR; } - //alignment = ALIGNMENT_DOUBLE; + alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; @@ -787,17 +787,16 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + j * thread_per_block * alignment - destination_base; cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; - 
cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block;// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; } @@ -807,13 +806,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ orig_alignment = ALIGNMENT_CHAR; cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + length_per_iovec / alignment * alignment - destination_base; cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; - cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; - 
cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %ld, dst %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; } diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index b97a84f5174..b95e13374d1 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -27,6 +27,10 @@ #include "opal/datatype/opal_datatype_internal.h" #include "limits.h" #include "opal/prefetch.h" +#if OPAL_CUDA_SUPPORT +//#include "opal/datatype/opal_convertor.h" +//#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ static void opal_datatype_construct( opal_datatype_t* pData ) { @@ -90,6 +94,15 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) free(datatype->cached_iovec); datatype->cached_iovec = NULL; } + +#if OPAL_CUDA_SUPPORT + /* free cuda iov */ +/* if 
(opal_datatype_cuda_kernel_support== 1 && datatype->cuda_iov_dist != NULL && datatype->cuda_iov_dist != (void*)0xDEADBEEF) { + opal_cuda_iov_dist_fini(datatype->cuda_iov_dist); + datatype->cuda_iov_dist = NULL; + datatype->cuda_iov_count = 0; + } */ +#endif /* OPAL_CUDA_SUPPORT */ } OBJ_CLASS_INSTANCE(opal_datatype_t, opal_object_t, opal_datatype_construct, opal_datatype_destruct); diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 3b3fc556ef9..e14e58bdb1c 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -84,7 +84,10 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, if (OPAL_SUCCESS != opal_cuda_kernel_support_init()) { opal_cuda_kernel_support_fini(); } - if (opal_datatype_cuda_kernel_support == 1 && datatype->cuda_iov_is_cached == 0) { + +#if 0 + convertor->flags &= ~CONVERTOR_CUDA; + if (opal_datatype_cuda_kernel_support == 1 && datatype->cuda_iov_is_cached == 0 && opal_convertor_need_buffers(convertor) == true) { struct opal_datatype_t* datatype_tmp = (opal_datatype_t *)datatype; datatype_tmp->cuda_iov_dist = opal_cuda_iov_dist_init(); if (datatype_tmp->cuda_iov_dist == (void*)0xDEADBEEF || datatype_tmp->cuda_iov_dist == NULL) { @@ -95,6 +98,8 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, datatype_tmp->cuda_iov_is_cached = 1; } } + convertor->flags |= CONVERTOR_CUDA; +#endif convertor->current_cuda_iov_count = 0; convertor->current_iov_pos = 0; convertor->current_iov_partial_length = 0; diff --git a/opal/datatype/opal_datatype_destroy.c b/opal/datatype/opal_datatype_destroy.c index 8c225e698c0..593d5bfd67a 100644 --- a/opal/datatype/opal_datatype_destroy.c +++ b/opal/datatype/opal_datatype_destroy.c @@ -21,24 +21,11 @@ #include "opal_config.h" #include "opal/constants.h" #include "opal/datatype/opal_datatype.h" -#include "opal/datatype/opal_datatype_internal.h" -#if OPAL_CUDA_SUPPORT -#include 
"opal/datatype/opal_convertor.h" -#include "opal/datatype/opal_datatype_cuda.h" -#endif /* OPAL_CUDA_SUPPORT */ +#include "opal/datatype/opal_datatype_internal.h" int32_t opal_datatype_destroy( opal_datatype_t** dt ) { opal_datatype_t* pData = *dt; - -#if OPAL_CUDA_SUPPORT - /* free cuda iov */ - if (opal_datatype_cuda_kernel_support== 1 && pData->cuda_iov_dist != NULL && pData->cuda_iov_dist != (void*)0xDEADBEEF) { - opal_cuda_iov_dist_fini(pData->cuda_iov_dist); - pData->cuda_iov_dist = NULL; - pData->cuda_iov_count = 0; - } -#endif /* OPAL_CUDA_SUPPORT */ if( (pData->flags & OPAL_DATATYPE_FLAG_PREDEFINED) && (pData->super.obj_reference_count <= 1) ) diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index c8c3fd7db45..50f62ec5839 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1216,7 +1216,7 @@ int main( int argc, char* argv[] ) printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { - local_copy_with_convertor(pdt, 1, 40000000, mat_size); + local_copy_with_convertor(pdt, 1, 4000001, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From 4c192e9e5c0139c3617f0e3ed7e703020d84ea67 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Sun, 8 Nov 2015 22:04:22 -0500 Subject: [PATCH 050/190] another checkpoint --- opal/datatype/cuda/opal_datatype_cuda.cu | 15 +++++++++- opal/datatype/cuda/opal_datatype_cuda.cuh | 4 +++ .../cuda/opal_datatype_cuda_internal.cuh | 2 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 5 ++-- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 19 +++++------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 7 +++-- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 16 ++++------ opal/datatype/opal_convertor.h | 5 ++-- opal/datatype/opal_datatype.h | 6 ++-- opal/datatype/opal_datatype_create.c | 21 ++++++++----- opal/datatype/opal_datatype_cuda.c | 30 ++----------------- opal/datatype/opal_datatype_cuda.h | 1 - 
opal/datatype/opal_datatype_optimize.c | 7 ----- 13 files changed, 63 insertions(+), 75 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index f53e006a4fd..e35fbcffd27 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -293,7 +293,7 @@ void* opal_ddt_cuda_iov_dist_init(void) } #else DT_CUDA_DEBUG( opal_cuda_output( 2, "cuda iov cache is not enabled.\n"); ); - return (void *)0xDEADBEEF; + return NULL; #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } @@ -308,6 +308,19 @@ void opal_ddt_cuda_iov_dist_fini(void* cuda_iov_dist) #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } +void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, + ddt_cuda_iov_dist_cached_t **cuda_iov_dist, + uint32_t* cuda_iov_count, uint8_t *cuda_iov_is_cached) +{ + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + if (datatype->cached_cuda_iov_dist == NULL) { + datatype->cached_cuda_iov_dist = opal_ddt_cuda_iov_dist_init(); + datatype->cached_cuda_iov_count = NUM_CUDA_IOV_PER_DDT; + } + *cuda_iov_dist = (ddt_cuda_iov_dist_cached_t *)datatype->cached_cuda_iov_dist; + *cuda_iov_count = datatype->cached_cuda_iov_count; +} + int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr) { int res; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 73a740a2822..0711b2c067d 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -121,6 +121,10 @@ void opal_ddt_cuda_iov_dist_fini(void *cuda_iov_dist); void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination); +void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, + ddt_cuda_iov_dist_cached_t **cuda_iov_dist, + uint32_t *cuda_iov_count, uint8_t *cuda_iov_is_cached); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git 
a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 1fa0e17b4c7..779db2b385a 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -18,7 +18,7 @@ #define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 #define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 #define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL 1 -#define OPAL_DATATYPE_CUDA_IOV_CACHE 0 +#define OPAL_DATATYPE_CUDA_IOV_CACHE 1 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index f4a100b969d..42acd8c4906 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -115,9 +115,10 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di _source_tmp = source_base + src_offset; _destination_tmp = destination_base + dst_offset; uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; - if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes >= ALIGNMENT_DOUBLE) { + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes >= ALIGNMENT_FLOAT) { + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 36cdcbaf3cd..bae2a714b79 100644 --- 
a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -956,7 +956,10 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count; size_t iov_len; - int iov_start_pos, iov_end_pos; + uint32_t iov_start_pos, iov_end_pos; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; + uint32_t cached_cuda_iov_count; + uint8_t cuda_iov_is_cached; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -1025,6 +1028,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); assert(ddt_iov != NULL); + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov_dist_d, &cached_cuda_iov_count, &cuda_iov_is_cached); + assert(cached_cuda_iov_dist_d != NULL); DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1078,15 +1083,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* total_packed += length_per_iovec; source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + source_base; - /* check alignment */ - if ((uintptr_t)(source) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)destination % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { - alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(source) % ALIGNMENT_FLOAT == 0 && (uintptr_t)destination % ALIGNMENT_FLOAT == 0 && length_per_iovec >= ALIGNMENT_FLOAT) { - alignment = ALIGNMENT_FLOAT; - } else { - alignment = ALIGNMENT_CHAR; - } - + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; @@ -1105,7 +1102,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* 
assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_elements %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index b1e2831f5c1..1fe37218fba 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -72,9 +72,9 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ _source_tmp = source_base + src_offset; _destination_tmp = destination_base + dst_offset; uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; - if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes >= ALIGNMENT_DOUBLE) { + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_source_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes >= ALIGNMENT_FLOAT) { + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_source_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { 
alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -90,6 +90,9 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; _destination_tmp = destination_base + dst_offset + j * alignment; + /* if (threadIdx.x == 0) { + printf("_src %p, dst %p, alignment %d, blk %d, j %d, count %d\n", _source_tmp, _destination_tmp, alignment, blockIdx.x, j, copy_count); + }*/ #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 0bdf66638fc..ed105558f96 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -652,7 +652,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count; size_t iov_len; - int iov_start_pos, iov_end_pos; + uint32_t iov_start_pos, iov_end_pos; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; + uint32_t cached_cuda_iov_count; + uint8_t cuda_iov_is_cached; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -715,6 +718,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ source_base = source; opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); assert(ddt_iov != NULL); + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov_dist_d, &cached_cuda_iov_count, &cuda_iov_is_cached); + assert(cached_cuda_iov_dist_d != NULL); DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -769,15 +774,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( 
opal_convertor_ total_unpacked += length_per_iovec; destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; - /* check alignment */ - if ((uintptr_t)(destination) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)source % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { - alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(destination) % ALIGNMENT_FLOAT == 0 && (uintptr_t)source % ALIGNMENT_FLOAT == 0 && length_per_iovec >= ALIGNMENT_FLOAT) { - alignment = ALIGNMENT_FLOAT; - } else { - alignment = ALIGNMENT_CHAR; - } - alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 1ab600cc49b..b7c0a43a6ed 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -114,9 +114,10 @@ struct opal_convertor_t { unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ size_t gpu_buffer_size; - size_t current_cuda_iov_count; - size_t current_iov_pos; + uint32_t current_cuda_iov_pos; + uint32_t current_iov_pos; size_t current_iov_partial_length; + opal_datatype_count_t current_count; #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index efbf357c7fd..dde50d8313d 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -132,9 +132,9 @@ struct opal_datatype_t { size_t max_data; /* size: 416, cachelines: 7, members: 18 */ #if OPAL_CUDA_SUPPORT - void * cuda_iov_dist; - size_t cuda_iov_count; - int8_t cuda_iov_is_cached; + void * cached_cuda_iov_dist; + uint32_t cached_cuda_iov_count; + uint8_t cuda_iov_is_cached; #endif /* OPAL_CUDA_SUPPORT */ /* last cacheline: 32 bytes */ diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index b95e13374d1..19caffe19ae 100644 --- a/opal/datatype/opal_datatype_create.c +++ 
b/opal/datatype/opal_datatype_create.c @@ -28,8 +28,8 @@ #include "limits.h" #include "opal/prefetch.h" #if OPAL_CUDA_SUPPORT -//#include "opal/datatype/opal_convertor.h" -//#include "opal/datatype/opal_datatype_cuda.h" +#include "opal/datatype/opal_convertor.h" +#include "opal/datatype/opal_datatype_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ static void opal_datatype_construct( opal_datatype_t* pData ) @@ -59,6 +59,12 @@ static void opal_datatype_construct( opal_datatype_t* pData ) pData->cached_iovec = NULL; pData->cached_iovec_count = 0; + +#if OPAL_CUDA_SUPPORT + pData->cached_cuda_iov_dist = NULL; + pData->cached_cuda_iov_count = 0; + pData->cuda_iov_is_cached = 0; +#endif /* OPAL_CUDA_SUPPORT */ for( i = 0; i < OPAL_DATATYPE_MAX_SUPPORTED; i++ ) pData->btypes[i] = 0; @@ -97,11 +103,12 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) #if OPAL_CUDA_SUPPORT /* free cuda iov */ -/* if (opal_datatype_cuda_kernel_support== 1 && datatype->cuda_iov_dist != NULL && datatype->cuda_iov_dist != (void*)0xDEADBEEF) { - opal_cuda_iov_dist_fini(datatype->cuda_iov_dist); - datatype->cuda_iov_dist = NULL; - datatype->cuda_iov_count = 0; - } */ + if (opal_datatype_cuda_kernel_support == 1 && datatype->cached_cuda_iov_dist != NULL) { + opal_cuda_iov_dist_fini(datatype->cached_cuda_iov_dist); + datatype->cached_cuda_iov_dist = NULL; + datatype->cached_cuda_iov_count = 0; + datatype->cuda_iov_is_cached = 0; + } #endif /* OPAL_CUDA_SUPPORT */ } diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index e14e58bdb1c..ddc48444777 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -85,25 +85,10 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, opal_cuda_kernel_support_fini(); } -#if 0 - convertor->flags &= ~CONVERTOR_CUDA; - if (opal_datatype_cuda_kernel_support == 1 && datatype->cuda_iov_is_cached == 0 && opal_convertor_need_buffers(convertor) == true) { - struct 
opal_datatype_t* datatype_tmp = (opal_datatype_t *)datatype; - datatype_tmp->cuda_iov_dist = opal_cuda_iov_dist_init(); - if (datatype_tmp->cuda_iov_dist == (void*)0xDEADBEEF || datatype_tmp->cuda_iov_dist == NULL) { - /* either cuda iov cache is not enabled or cuda_iov_cache malloc is failed, then we do not cache cuda iov */ - datatype_tmp->cuda_iov_is_cached = -1; - } else { - /* cuda iov buffer is ready , the value will be marked to 2 when caching is finished*/ - datatype_tmp->cuda_iov_is_cached = 1; - } - } - convertor->flags |= CONVERTOR_CUDA; -#endif - convertor->current_cuda_iov_count = 0; + convertor->current_cuda_iov_pos = 0; convertor->current_iov_pos = 0; convertor->current_iov_partial_length = 0; - + convertor->current_count = 0; } /* Checks the type of pointer @@ -261,7 +246,6 @@ int32_t opal_cuda_kernel_support_init(void) OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_malloc_gpu_buffer ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy_async ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_iov_dist_init ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_iov_dist_fini ); if (OPAL_SUCCESS != cuda_kernel_table.opal_ddt_cuda_kernel_init_p()) { @@ -378,16 +362,6 @@ void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) } } -void* opal_cuda_iov_dist_init(void) -{ - if (cuda_kernel_table.opal_ddt_cuda_iov_dist_init_p != NULL) { - return cuda_kernel_table.opal_ddt_cuda_iov_dist_init_p(); - } else { - opal_output(0, "opal_ddt_cuda_iov_dist_init function pointer is NULL\n"); - return NULL; - } -} - void opal_cuda_iov_dist_fini(void *cuda_iov_dist) { if (cuda_kernel_table.opal_ddt_cuda_iov_dist_fini_p != NULL) { diff --git 
a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index 24e85f649b9..37af008daa8 100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -28,7 +28,6 @@ struct opal_datatype_cuda_kernel_function_table { void* (*opal_ddt_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); void (*opal_ddt_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); void (*opal_ddt_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); - void* (*opal_ddt_cuda_iov_dist_init_p)(void); void (*opal_ddt_cuda_iov_dist_fini_p)(void *cuda_iov_dist); int32_t (*opal_ddt_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index b492aa9381b..5ccea9ba1d3 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -305,13 +305,6 @@ int32_t opal_datatype_commit( opal_datatype_t * pData ) pLast->size = pData->size; } -#if OPAL_CUDA_SUPPORT - /* cuda iov for caching, it will be malloced latter when init convertor */ - pData->cuda_iov_dist = NULL; - pData->cuda_iov_is_cached = 0; - pData->cuda_iov_count = 0; -#endif /* OPAL_CUDA_SUPPORT */ - /* save a compressed datatype description as a iovec list */ // opal_convertor_t* conv = opal_convertor_create( opal_local_arch, 0 /* unused */); // opal_convertor_prepare_for_send( conv, pData, 1, (void*)0 ); From 0f2444949a5987e7ad53a8140f387db8a3658c42 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 9 Nov 2015 00:17:39 -0500 Subject: [PATCH 051/190] check point, cuda iov is cached, but not used for pack/unpack --- opal/datatype/cuda/opal_datatype_cuda.cu | 30 ++- opal/datatype/cuda/opal_datatype_cuda.cuh | 4 + 
.../cuda/opal_datatype_cuda_internal.cuh | 3 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 185 +++++++++--------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 178 +++++++++-------- test/datatype/ddt_benchmark.c | 4 +- 6 files changed, 226 insertions(+), 178 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index e35fbcffd27..ea77cadbae8 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -279,11 +279,11 @@ int32_t opal_ddt_cuda_kernel_fini(void) return OPAL_SUCCESS; } -void* opal_ddt_cuda_iov_dist_init(void) +void* opal_ddt_cuda_iov_dist_init(uint32_t size) { #if OPAL_DATATYPE_CUDA_IOV_CACHE ddt_cuda_iov_dist_cached_t *p = NULL; - cudaMalloc((void **)(&p), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + cudaMalloc((void **)(&p), sizeof(ddt_cuda_iov_dist_cached_t) * size); if (p != NULL) { DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed %p.\n", p); ); return p; @@ -314,13 +314,37 @@ void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; if (datatype->cached_cuda_iov_dist == NULL) { - datatype->cached_cuda_iov_dist = opal_ddt_cuda_iov_dist_init(); + datatype->cached_cuda_iov_dist = opal_ddt_cuda_iov_dist_init(NUM_CUDA_IOV_PER_DDT); datatype->cached_cuda_iov_count = NUM_CUDA_IOV_PER_DDT; } *cuda_iov_dist = (ddt_cuda_iov_dist_cached_t *)datatype->cached_cuda_iov_dist; *cuda_iov_count = datatype->cached_cuda_iov_count; } +void opal_ddt_set_cuda_iov_is_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) +{ + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + assert(datatype->cached_cuda_iov_dist != NULL); + datatype->cached_cuda_iov_count = cuda_iov_count; +} + +void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) +{ + opal_datatype_t *datatype = 
(opal_datatype_t *)convertor->pDesc; + assert(datatype->cached_cuda_iov_dist != NULL); + if (datatype->cached_cuda_iov_count < cuda_iov_count) { + printf("cuda count %d, new count %d\n", datatype->cached_cuda_iov_count, cuda_iov_count); + // assert(0); + void *old_iov = datatype->cached_cuda_iov_dist; + void *new_iov = opal_ddt_cuda_iov_dist_init(datatype->cached_cuda_iov_count + NUM_CUDA_IOV_PER_DDT); + assert(new_iov != NULL); + cudaMemcpy(new_iov, old_iov, datatype->cached_cuda_iov_count * sizeof(ddt_cuda_iov_dist_cached_t), cudaMemcpyDeviceToDevice); + datatype->cached_cuda_iov_dist = new_iov; + datatype->cached_cuda_iov_count += NUM_CUDA_IOV_PER_DDT; + opal_ddt_cuda_iov_dist_fini(old_iov); + } +} + int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr) { int res; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 0711b2c067d..ea89dda3c53 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -124,6 +124,10 @@ void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination); void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_dist_cached_t **cuda_iov_dist, uint32_t *cuda_iov_count, uint8_t *cuda_iov_is_cached); + +void opal_ddt_set_cuda_iov_is_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); + +void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); } diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 779db2b385a..d34e6039ff3 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -38,7 +38,8 @@ #define ALIGNMENT_DOUBLE 8 #define ALIGNMENT_FLOAT 4 #define ALIGNMENT_CHAR 1 -#define NUM_CUDA_IOV_PER_DDT 100000 +#define NUM_CUDA_IOV_PER_DDT 150000 +#define IOV_PIPELINE_SIZE 1000 #define TIMER_DATA_TYPE struct timeval 
#define GET_TIME(TV) gettimeofday( &(TV), NULL ) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index bae2a714b79..b5155a0e9e1 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -1011,7 +1011,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - cuda_iov_count = 4000;//CUDA_NB_IOV; total_packed = 0; total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; @@ -1041,116 +1040,126 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - iov_start_pos = pConvertor->current_iov_pos; - iov_end_pos = iov_start_pos + 1000; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - source_base = (unsigned char*)pConvertor->pBaseBuf; + while(pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + iov_start_pos = pConvertor->current_iov_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + source_base = (unsigned char*)pConvertor->pBaseBuf; - while (iov_start_pos < iov_end_pos && !buffer_isfull) { + while (iov_start_pos < iov_end_pos && !buffer_isfull) { - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_d; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); + nb_blocks_used = 0; + cuda_iov_pipeline_block 
= current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - for (i = iov_start_pos; i < iov_end_pos; i++) { - if (pConvertor->current_iov_partial_length > 0) { - iov_len = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - iov_len = ddt_iov[i].iov_len; - } - if (buffer_size >= iov_len) { - length_per_iovec = iov_len; - } else { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - length_per_iovec = buffer_size / orig_alignment * orig_alignment; - buffer_isfull = 1; - pConvertor->current_iov_partial_length = iov_len - length_per_iovec; - pConvertor->current_iov_pos = i; - } - buffer_size -= length_per_iovec; - total_packed += length_per_iovec; - source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + source_base; - - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + j * thread_per_block * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = 
destination - destination_base; - if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + iov_len = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + iov_len = ddt_iov[i].iov_len; + } + if (buffer_size >= iov_len) { + length_per_iovec = iov_len; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + pConvertor->current_iov_partial_length = iov_len - length_per_iovec; + pConvertor->current_iov_pos = i; } + buffer_size -= length_per_iovec; + total_packed += length_per_iovec; + source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + source_base; + + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + j * thread_per_block * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = 
(count_desc - j*thread_per_block) * alignment; + } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } - /* handle residue */ - if (residue_desc != 0) { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + /* handle residue */ + if (residue_desc != 0) { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset 
= destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } } - if (buffer_isfull) { - break; + if (!buffer_isfull) { + pConvertor->current_iov_pos = i; } - } #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, 
nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + // opal_ddt_check_cuda_iov_is_full(pConvertor, pConvertor->current_cuda_iov_pos + nb_blocks_used); /* make sure cuda iov has enough space */ + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + pConvertor->current_cuda_iov_pos += nb_blocks_used; -// orig_stack_index = pStack->index; - iov_start_pos = iov_end_pos; - iov_end_pos = iov_start_pos + 1000; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; + // orig_stack_index = pStack->index; + iov_start_pos = iov_end_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos >= ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + /* count = 0 done, iov cached finished */ + if (pConvertor->current_iov_pos == ddt_iov_count) { + 
pConvertor->current_count ++; + opal_ddt_set_cuda_iov_is_cached(pConvertor, pConvertor->current_cuda_iov_pos); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); + } + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); } - DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); } - for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index ed105558f96..7d3dfa404ac 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -709,7 +709,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ GET_TIME(start); #endif buffer_size = iov[0].iov_len; - cuda_iov_count = 1000; total_unpacked = 0; total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; @@ -731,111 +730,122 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - iov_start_pos = pConvertor->current_iov_pos; - iov_end_pos = iov_start_pos + 1000; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - destination_base = (unsigned char*)pConvertor->pBaseBuf; + while(pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + iov_start_pos = pConvertor->current_iov_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + destination_base = (unsigned char*)pConvertor->pBaseBuf; - while (iov_start_pos < iov_end_pos && !buffer_isfull) { + while (iov_start_pos < 
iov_end_pos && !buffer_isfull) { - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_d; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); #if defined (OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - for (i = iov_start_pos; i < iov_end_pos; i++) { - if (pConvertor->current_iov_partial_length > 0) { - iov_len = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - iov_len = ddt_iov[i].iov_len; - } - if (buffer_size >= iov_len) { - length_per_iovec = iov_len; - } else { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - length_per_iovec = buffer_size / orig_alignment * orig_alignment; - buffer_isfull = 1; - pConvertor->current_iov_partial_length = iov_len - length_per_iovec; - pConvertor->current_iov_pos = i; - } - buffer_size -= length_per_iovec; - total_unpacked += length_per_iovec; - destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; - - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - 
nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + j * thread_per_block * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; - if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + iov_len = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + iov_len = ddt_iov[i].iov_len; + } + if (buffer_size >= iov_len) { + length_per_iovec = iov_len; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + pConvertor->current_iov_partial_length = iov_len - length_per_iovec; + pConvertor->current_iov_pos = i; } + buffer_size -= length_per_iovec; + total_unpacked += length_per_iovec; + destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; + + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + 
cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + j * thread_per_block * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; + } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - } + source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + } - /* handle residue */ - if (residue_desc != 0) { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + /* handle residue */ + if (residue_desc != 0) { + /* orig_alignment = 
opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %ld, dst %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; + source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %ld, dst %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + } } - - if (buffer_isfull) { - break; + + if (!buffer_isfull) { + pConvertor->current_iov_pos = i; } - } #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, 
nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + pConvertor->current_cuda_iov_pos += nb_blocks_used; - iov_start_pos = iov_end_pos; - iov_end_pos = iov_start_pos + 1000; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); + iov_start_pos = iov_end_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos >= ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + /* finished */ + if (pConvertor->current_iov_pos == ddt_iov_count) { + pConvertor->current_count ++; + opal_ddt_set_cuda_iov_is_cached(pConvertor, pConvertor->current_cuda_iov_pos); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); + } + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start 
pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); + } + } for (i = 0; i < NB_STREAMS; i++) { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 50f62ec5839..f2112d598b2 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1211,12 +1211,12 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 2000; mat_size <= 2000; mat_size +=500) { + for (mat_size = 1000; mat_size <= 1000; mat_size +=500) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { - local_copy_with_convertor(pdt, 1, 4000001, mat_size); + local_copy_with_convertor(pdt, 1, 4000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From f1f4a7d28b52fbdc8816a6b57ca10443047f68ed Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 9 Nov 2015 22:07:04 -0500 Subject: [PATCH 052/190] check point, ready to use cached cuda iov --- opal/datatype/cuda/opal_datatype_cuda.cu | 72 ++++++++++++------- opal/datatype/cuda/opal_datatype_cuda.cuh | 12 ++-- .../cuda/opal_datatype_cuda_internal.cuh | 17 +++-- .../cuda/opal_datatype_pack_cuda_kernel.cu | 13 ++-- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 37 +++++++--- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 13 ++-- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 38 +++++++--- opal/datatype/opal_datatype.h | 4 +- opal/datatype/opal_datatype_create.c | 12 ++-- opal/datatype/opal_datatype_cuda.c | 11 +-- opal/datatype/opal_datatype_cuda.h | 6 +- 11 files changed, 147 insertions(+), 88 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index ea77cadbae8..3ac7ba0ac5f 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -223,7 +223,8 @@ int32_t 
opal_ddt_cuda_kernel_init(void) cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_d)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_h)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_d)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); @@ -263,7 +264,8 @@ int32_t opal_ddt_cuda_kernel_fini(void) cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); - cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_cached_d); + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_contig_buf_h); + cudaFree(cuda_iov_pipeline_block->cuda_iov_contig_buf_d); cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; @@ -279,14 +281,20 @@ int32_t opal_ddt_cuda_kernel_fini(void) return OPAL_SUCCESS; } -void* opal_ddt_cuda_iov_dist_init(uint32_t size) +void* opal_ddt_cached_cuda_iov_init(uint32_t size) { #if 
OPAL_DATATYPE_CUDA_IOV_CACHE - ddt_cuda_iov_dist_cached_t *p = NULL; - cudaMalloc((void **)(&p), sizeof(ddt_cuda_iov_dist_cached_t) * size); - if (p != NULL) { - DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed %p.\n", p); ); - return p; + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)malloc(sizeof(ddt_cuda_iov_total_cached_t)); + ddt_cuda_iov_dist_cached_t *tmp_cuda_iov_d = NULL; + cudaMalloc((void **)(&tmp_cuda_iov_d), sizeof(ddt_cuda_iov_dist_cached_t) * size); + uint32_t *tmp_nb_bytes = (uint32_t *)malloc(sizeof(uint32_t) * size); + if (tmp != NULL && tmp_cuda_iov_d != NULL && tmp_nb_bytes != NULL) { + tmp->cuda_iov_dist_d = tmp_cuda_iov_d; + tmp->cuda_iov_count = size; + tmp->cuda_iov_is_cached = 0; + tmp->nb_bytes_h = tmp_nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, cuda_iov_d %p, nb_bytes_h %p, size %d.\n", tmp, tmp_cuda_iov_d, tmp_nb_bytes, size); ); + return tmp; } else { DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist_cached for ddt is failed.\n"); ); return NULL; @@ -297,39 +305,54 @@ void* opal_ddt_cuda_iov_dist_init(uint32_t size) #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } -void opal_ddt_cuda_iov_dist_fini(void* cuda_iov_dist) +void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) { #if OPAL_DATATYPE_CUDA_IOV_CACHE - ddt_cuda_iov_dist_cached_t *p = (ddt_cuda_iov_dist_cached_t *) cuda_iov_dist; - if (p != NULL) { - cudaFree(p); - DT_CUDA_DEBUG( opal_cuda_output( 2, "Free cuda_iov_dist for ddt is successed %p.\n", p); ); + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *) cached_cuda_iov; + if (tmp != NULL) { + DT_CUDA_DEBUG( opal_cuda_output( 2, "Free cuda_iov_dist for ddt is successed %p.\n", tmp); ); + if (tmp->cuda_iov_dist_d != NULL) { + cudaFree(tmp->cuda_iov_dist_d); + tmp->cuda_iov_dist_d = NULL; + } + if (tmp->nb_bytes_h != NULL) { + free(tmp->nb_bytes_h); + tmp->nb_bytes_h = NULL; + } 
+ free(tmp); + tmp = NULL; } #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } -void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, - ddt_cuda_iov_dist_cached_t **cuda_iov_dist, - uint32_t* cuda_iov_count, uint8_t *cuda_iov_is_cached) +void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; - if (datatype->cached_cuda_iov_dist == NULL) { - datatype->cached_cuda_iov_dist = opal_ddt_cuda_iov_dist_init(NUM_CUDA_IOV_PER_DDT); - datatype->cached_cuda_iov_count = NUM_CUDA_IOV_PER_DDT; + if (datatype->cached_cuda_iov == NULL) { + datatype->cached_cuda_iov = opal_ddt_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); } - *cuda_iov_dist = (ddt_cuda_iov_dist_cached_t *)datatype->cached_cuda_iov_dist; - *cuda_iov_count = datatype->cached_cuda_iov_count; + *cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; } -void opal_ddt_set_cuda_iov_is_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) +void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; - assert(datatype->cached_cuda_iov_dist != NULL); - datatype->cached_cuda_iov_count = cuda_iov_count; + assert(datatype->cached_cuda_iov != NULL); + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + tmp->cuda_iov_count = cuda_iov_count; +} + +uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) +{ + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + assert(datatype->cached_cuda_iov != NULL); + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + return tmp->cuda_iov_is_cached; } void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) { +#if 0 opal_datatype_t *datatype = (opal_datatype_t 
*)convertor->pDesc; assert(datatype->cached_cuda_iov_dist != NULL); if (datatype->cached_cuda_iov_count < cuda_iov_count) { @@ -343,6 +366,7 @@ void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_ datatype->cached_cuda_iov_count += NUM_CUDA_IOV_PER_DDT; opal_ddt_cuda_iov_dist_fini(old_iov); } +#endif } int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index ea89dda3c53..6c071188c2c 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -115,17 +115,17 @@ void opal_ddt_cuda_d2dcpy(void* dst, const void* src, size_t count); void opal_dump_cuda_list(ddt_cuda_list_t *list); -void* opal_ddt_cuda_iov_dist_init(void); +void* opal_ddt_cached_cuda_iov_init(void); -void opal_ddt_cuda_iov_dist_fini(void *cuda_iov_dist); +void opal_ddt_cached_cuda_iov_fini(void *cached_cuda_iov); void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination); -void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, - ddt_cuda_iov_dist_cached_t **cuda_iov_dist, - uint32_t *cuda_iov_count, uint8_t *cuda_iov_is_cached); +void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov); -void opal_ddt_set_cuda_iov_is_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); +void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); + +uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor); void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index d34e6039ff3..1b47b89f1d0 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -59,16 +59,23 @@ 
typedef struct { } ddt_cuda_iov_dist_non_cached_t; typedef struct { - size_t src_offset; - size_t dst_offset; + size_t ptr_offset; uint32_t nb_bytes; } ddt_cuda_iov_dist_cached_t; +typedef struct { + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d; + uint32_t cuda_iov_count; + uint32_t* nb_bytes_h; + uint8_t cuda_iov_is_cached; +} ddt_cuda_iov_total_cached_t; + typedef struct { ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_h; ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_d; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_d; + uintptr_t *cuda_iov_contig_buf_h; + uintptr_t *cuda_iov_contig_buf_d; cudaStream_t *cuda_stream; int32_t cuda_stream_id; cudaEvent_t cuda_event; @@ -131,9 +138,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 42acd8c4906..e85b83e55b5 100644 --- 
a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,10 +88,11 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base) { uint32_t i, j; - size_t src_offset, dst_offset; + size_t src_offset; + unsigned char *dst; unsigned char *_source_tmp, *_destination_tmp; __shared__ uint32_t nb_tasks; @@ -108,12 +109,12 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; + dst = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; if (threadIdx.x == 0) { _source_tmp = source_base + src_offset; - _destination_tmp = destination_base + dst_offset; + _destination_tmp = dst; uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { @@ -130,7 +131,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; - _destination_tmp = destination_base + dst_offset + j * alignment; + _destination_tmp = dst + j * alignment; #if !defined 
(OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index b5155a0e9e1..e6b9545226f 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -950,6 +950,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + uintptr_t *cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; @@ -957,7 +958,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t ddt_iov_count; size_t iov_len; uint32_t iov_start_pos, iov_end_pos; + ddt_cuda_iov_total_cached_t* cached_cuda_iov; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; + uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; uint32_t cached_cuda_iov_count; uint8_t cuda_iov_is_cached; @@ -1027,8 +1030,13 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); assert(ddt_iov != NULL); - opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov_dist_d, &cached_cuda_iov_count, &cuda_iov_is_cached); + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + cuda_iov_is_cached = 
cached_cuda_iov->cuda_iov_is_cached; DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1040,7 +1048,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - while(pConvertor->current_count < pConvertor->count && !buffer_isfull) { + /* cuda iov is not cached, start to cache iov */ + if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { iov_start_pos = pConvertor->current_iov_pos; iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; @@ -1054,7 +1063,10 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); @@ -1092,8 +1104,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + j * thread_per_block * alignment - 
source_base; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; if ( (j+1) * thread_per_block <= count_desc) { cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; } else { @@ -1102,8 +1114,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -1112,14 +1125,15 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* if (residue_desc != 0) { /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - 
length_per_iovec / alignment * alignment) / orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -1137,8 +1151,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* // opal_ddt_check_cuda_iov_is_full(pConvertor, pConvertor->current_cuda_iov_pos + nb_blocks_used); /* make sure cuda iov has enough space */ cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, 
destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -1154,7 +1169,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* count = 0 done, iov cached finished */ if (pConvertor->current_iov_pos == ddt_iov_count) { pConvertor->current_count ++; - opal_ddt_set_cuda_iov_is_cached(pConvertor, pConvertor->current_cuda_iov_pos); + opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); } DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 1fe37218fba..9ea9414ba77 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,10 +46,11 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base) { uint32_t i, j; - size_t src_offset, dst_offset; + size_t dst_offset; + unsigned char *src; unsigned char 
*_source_tmp, *_destination_tmp; __shared__ uint32_t nb_tasks; @@ -65,11 +66,11 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; + src = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; if (threadIdx.x == 0) { - _source_tmp = source_base + src_offset; + _source_tmp = src; _destination_tmp = destination_base + dst_offset; uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { @@ -88,7 +89,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); }*/ if (j < copy_count) { - _source_tmp = source_base + src_offset + j * alignment; + _source_tmp = src + j * alignment; _destination_tmp = destination_base + dst_offset + j * alignment; /* if (threadIdx.x == 0) { printf("_src %p, dst %p, alignment %d, blk %d, j %d, count %d\n", _source_tmp, _destination_tmp, alignment, blockIdx.x, j, copy_count); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 7d3dfa404ac..49171e5b277 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -646,6 +646,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + uintptr_t 
*cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; @@ -653,7 +654,9 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint32_t ddt_iov_count; size_t iov_len; uint32_t iov_start_pos, iov_end_pos; + ddt_cuda_iov_total_cached_t* cached_cuda_iov; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; + uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; uint32_t cached_cuda_iov_count; uint8_t cuda_iov_is_cached; @@ -717,8 +720,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ source_base = source; opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); assert(ddt_iov != NULL); - opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov_dist_d, &cached_cuda_iov_count, &cuda_iov_is_cached); + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + cuda_iov_is_cached = cached_cuda_iov->cuda_iov_is_cached; DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -730,7 +738,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - while(pConvertor->current_count < pConvertor->count && !buffer_isfull) { + /* cuda iov is not cached, start to cache iov */ + if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { iov_start_pos = pConvertor->current_iov_pos; iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; @@ -744,7 +753,11 @@ int32_t 
opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov pos %d\n", pConvertor->current_cuda_iov_pos);); cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); @@ -782,8 +795,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + j * thread_per_block * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; if ( (j+1) * thread_per_block <= count_desc) { cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; } else { @@ -792,8 +805,9 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes 
> 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; } @@ -801,14 +815,15 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (residue_desc != 0) { /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( 
opal_cuda_output(12, "Unpack \tblock %d, src %ld, dst %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; } } @@ -824,7 +839,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -839,7 +855,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ /* finished */ if (pConvertor->current_iov_pos == ddt_iov_count) { pConvertor->current_count ++; - opal_ddt_set_cuda_iov_is_cached(pConvertor, pConvertor->current_cuda_iov_pos); + opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); } DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, 
cuda_streams->current_stream_id); ); diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index dde50d8313d..6e161e96d76 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -132,9 +132,7 @@ struct opal_datatype_t { size_t max_data; /* size: 416, cachelines: 7, members: 18 */ #if OPAL_CUDA_SUPPORT - void * cached_cuda_iov_dist; - uint32_t cached_cuda_iov_count; - uint8_t cuda_iov_is_cached; + void * cached_cuda_iov; #endif /* OPAL_CUDA_SUPPORT */ /* last cacheline: 32 bytes */ diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index 19caffe19ae..44c0e3020b6 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -61,9 +61,7 @@ static void opal_datatype_construct( opal_datatype_t* pData ) pData->cached_iovec_count = 0; #if OPAL_CUDA_SUPPORT - pData->cached_cuda_iov_dist = NULL; - pData->cached_cuda_iov_count = 0; - pData->cuda_iov_is_cached = 0; + pData->cached_cuda_iov = NULL; #endif /* OPAL_CUDA_SUPPORT */ for( i = 0; i < OPAL_DATATYPE_MAX_SUPPORTED; i++ ) @@ -103,11 +101,9 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) #if OPAL_CUDA_SUPPORT /* free cuda iov */ - if (opal_datatype_cuda_kernel_support == 1 && datatype->cached_cuda_iov_dist != NULL) { - opal_cuda_iov_dist_fini(datatype->cached_cuda_iov_dist); - datatype->cached_cuda_iov_dist = NULL; - datatype->cached_cuda_iov_count = 0; - datatype->cuda_iov_is_cached = 0; + if (opal_datatype_cuda_kernel_support == 1 && datatype->cached_cuda_iov != NULL) { + opal_cached_cuda_iov_fini(datatype->cached_cuda_iov); + datatype->cached_cuda_iov = NULL; } #endif /* OPAL_CUDA_SUPPORT */ } diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index ddc48444777..c65e635a506 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -246,7 +246,7 @@ int32_t opal_cuda_kernel_support_init(void) 
OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_malloc_gpu_buffer ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy_async ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_iov_dist_fini ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cached_cuda_iov_fini ); if (OPAL_SUCCESS != cuda_kernel_table.opal_ddt_cuda_kernel_init_p()) { return OPAL_ERROR; @@ -272,6 +272,7 @@ int32_t opal_cuda_kernel_support_fini(void) cuda_kernel_table.opal_ddt_cuda_malloc_gpu_buffer_p = NULL; cuda_kernel_table.opal_ddt_cuda_d2dcpy_async_p = NULL; cuda_kernel_table.opal_ddt_cuda_d2dcpy_p = NULL; + cuda_kernel_table.opal_ddt_cached_cuda_iov_fini_p = NULL; dlclose(opal_datatype_cuda_kernel_handle); opal_datatype_cuda_kernel_handle = NULL; @@ -362,12 +363,12 @@ void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) } } -void opal_cuda_iov_dist_fini(void *cuda_iov_dist) +void opal_cached_cuda_iov_fini(void *cached_cuda_iov) { - if (cuda_kernel_table.opal_ddt_cuda_iov_dist_fini_p != NULL) { - cuda_kernel_table.opal_ddt_cuda_iov_dist_fini_p(cuda_iov_dist); + if (cuda_kernel_table.opal_ddt_cached_cuda_iov_fini_p != NULL) { + cuda_kernel_table.opal_ddt_cached_cuda_iov_fini_p(cached_cuda_iov); } else { - opal_output(0, "opal_ddt_cuda_iov_dist_fini function pointer is NULL\n"); + opal_output(0, "opal_ddt_cached_cuda_iov_fini function pointer is NULL\n"); } } diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index 37af008daa8..7b613470ab0 100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -28,7 +28,7 @@ struct opal_datatype_cuda_kernel_function_table { void* 
(*opal_ddt_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); void (*opal_ddt_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); void (*opal_ddt_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); - void (*opal_ddt_cuda_iov_dist_fini_p)(void *cuda_iov_dist); + void (*opal_ddt_cached_cuda_iov_fini_p)(void *cached_cuda_iov); int32_t (*opal_ddt_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); @@ -55,7 +55,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); void opal_cuda_d2dcpy(void* dst, const void* src, size_t count); void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count); -void* opal_cuda_iov_dist_init(void); -void opal_cuda_iov_dist_fini(void *cuda_iov_dist); +void* opal_cached_cuda_iov_init(void); +void opal_cached_cuda_iov_fini(void *cached_cuda_iov); #endif From 043fa9c584e33ec100ad1dd658b691adeeb8370c Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 10 Nov 2015 00:33:29 -0500 Subject: [PATCH 053/190] checkpoint, cached cuda iov is working with multiple send, but not for count > 1 --- opal/datatype/cuda/opal_datatype_cuda.cu | 1 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 62 +++++++++++++++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 63 ++++++++++++++++--- test/datatype/ddt_benchmark.c | 2 +- 4 files changed, 112 insertions(+), 16 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 3ac7ba0ac5f..18494bcba70 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ 
b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -340,6 +340,7 @@ void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t c assert(datatype->cached_cuda_iov != NULL); ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; tmp->cuda_iov_count = cuda_iov_count; + tmp->cuda_iov_is_cached = 1; } uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index e6b9545226f..34c1883c2d1 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -937,7 +937,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *destination, *destination_base, *source_base, *source; - size_t total_packed, total_converted; + size_t total_packed, packed_w_cache ,packed_wo_cache; int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; uint32_t convertor_flags; @@ -957,7 +957,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count; size_t iov_len; - uint32_t iov_start_pos, iov_end_pos; + uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; @@ -1015,7 +1015,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); total_packed = 0; - total_converted = pConvertor->bConverted; + packed_wo_cache = 0; + packed_w_cache = 0; 
cuda_streams->current_stream_id = 0; // orig_stack_index = pStack->index; destination_base = destination; @@ -1047,6 +1048,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; + source_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { @@ -1056,7 +1058,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* if (iov_end_pos > ddt_iov_count) { iov_end_pos = ddt_iov_count; } - source_base = (unsigned char*)pConvertor->pBaseBuf; while (iov_start_pos < iov_end_pos && !buffer_isfull) { @@ -1093,7 +1094,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor->current_iov_pos = i; } buffer_size -= length_per_iovec; - total_packed += length_per_iovec; + packed_wo_cache += length_per_iovec; source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + source_base; /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ @@ -1175,11 +1176,60 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); } } + total_packed += packed_wo_cache; + pConvertor->bConverted += packed_wo_cache; + + + /* now we use cached cuda iov */ + if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; + cuda_iov_dist_d_current = 
cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { + packed_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + if (packed_w_cache <= buffer_size) { + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; + destination += cached_cuda_iov_nb_bytes_list_h[i]; + nb_blocks_used ++; + } else { + packed_w_cache -= cached_cuda_iov_nb_bytes_list_h[i]; + buffer_isfull = 1; + break; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + pConvertor->current_cuda_iov_pos += nb_blocks_used; + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + } for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } + total_packed += packed_w_cache; + pConvertor->bConverted += packed_w_cache; + 
DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); + #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -1195,8 +1245,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* iov[0].iov_len = total_packed; *max_data = total_packed; *out_size = 1; - pConvertor->bConverted += total_packed; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 49171e5b277..6689a48a3b4 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -632,7 +632,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *source, *source_base, *destination_base, *destination; - size_t total_unpacked, total_converted; + size_t total_unpacked, unpacked_wo_cache, unpacked_w_cache; int32_t complete_flag = 0; uint8_t buffer_isfull = 0; uint8_t free_required = 0; @@ -653,7 +653,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count; size_t iov_len; - uint32_t iov_start_pos, iov_end_pos; + uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; @@ -713,7 +713,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif buffer_size = iov[0].iov_len; total_unpacked = 0; - total_converted = pConvertor->bConverted; + unpacked_wo_cache = 0; + unpacked_w_cache = 0; 
cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; @@ -737,6 +738,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; + destination_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { @@ -746,7 +748,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (iov_end_pos > ddt_iov_count) { iov_end_pos = ddt_iov_count; } - destination_base = (unsigned char*)pConvertor->pBaseBuf; while (iov_start_pos < iov_end_pos && !buffer_isfull) { @@ -785,7 +786,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ pConvertor->current_iov_pos = i; } buffer_size -= length_per_iovec; - total_unpacked += length_per_iovec; + unpacked_wo_cache += length_per_iovec; destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; alignment = ALIGNMENT_DOUBLE; @@ -861,18 +862,64 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); } - } + total_unpacked += unpacked_wo_cache; + pConvertor->bConverted += unpacked_wo_cache; +#if 1 + /* now we use cached cuda iov */ + if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; + cuda_iov_dist_d_current = cached_cuda_iov_dist_d 
+ pConvertor->current_cuda_iov_pos; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { + unpacked_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + if (unpacked_w_cache <= buffer_size) { + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + source += cached_cuda_iov_nb_bytes_list_h[i]; + nb_blocks_used ++; + } else { + unpacked_w_cache -= cached_cuda_iov_nb_bytes_list_h[i]; + buffer_isfull = 1; + break; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + pConvertor->current_cuda_iov_pos += nb_blocks_used; + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + } +#endif for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } + + total_unpacked += unpacked_w_cache; + pConvertor->bConverted += unpacked_w_cache; + DT_CUDA_DEBUG ( 
opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); iov[0].iov_len = total_unpacked; *max_data = total_unpacked; *out_size = 1; - pConvertor->bConverted += total_unpacked; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index f2112d598b2..6dd3b4cf879 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1211,7 +1211,7 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 1000; mat_size <= 1000; mat_size +=500) { + for (mat_size = 2000; mat_size <= 2000; mat_size +=500) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { From 5e8c77ac2c05961e93c716e541b47de43f8577a1 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 10 Nov 2015 19:26:22 -0500 Subject: [PATCH 054/190] checkpoint, fix a bug for partial unpack --- .../cuda/opal_datatype_cuda_internal.cuh | 2 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 17 +++---- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 16 +++++-- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 44 ++++++++++++++----- test/datatype/ddt_benchmark.c | 2 +- 5 files changed, 58 insertions(+), 23 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 1b47b89f1d0..b7e8e9405f6 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -140,7 +140,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); -__global__ 
void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 34c1883c2d1..55cb955808e 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -955,14 +955,14 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; - uint32_t ddt_iov_count; - size_t iov_len; + uint32_t ddt_iov_count = 0; + size_t iov_len = 0; uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; - uint32_t cached_cuda_iov_count; - uint8_t cuda_iov_is_cached; + uint32_t cached_cuda_iov_count = 0; + uint8_t cuda_iov_is_cached = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -1196,17 +1196,18 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* GET_TIME(start); #endif for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { - packed_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; - if (packed_w_cache <= buffer_size) { + if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { cuda_iov_contig_buf_h_current[nb_blocks_used] = 
(uintptr_t)destination; destination += cached_cuda_iov_nb_bytes_list_h[i]; - nb_blocks_used ++; + packed_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; + nb_blocks_used++; } else { - packed_w_cache -= cached_cuda_iov_nb_bytes_list_h[i]; buffer_isfull = 1; break; } } + printf("nb_blocks_used %d, my %d\n", nb_blocks_used, i - cuda_iov_start_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 9ea9414ba77..c553a7991b0 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,7 +46,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; size_t dst_offset; @@ -68,11 +68,21 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ for (i = 0; i < nb_tasks; i++) { src = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; - + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + // if (threadIdx.x == 0) printf("cuda_iov_partial_length_start %d", cuda_iov_partial_length_start); + dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes - cuda_iov_partial_length_start; + } if (threadIdx.x == 0) { _source_tmp = 
src; _destination_tmp = destination_base + dst_offset; - uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; + uint32_t _nb_bytes = 0; + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + _nb_bytes = cuda_iov_partial_length_start; + } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { + _nb_bytes = cuda_iov_partial_length_end; + } else { + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; + } if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_source_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 6689a48a3b4..66d72995e26 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -651,14 +651,16 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; - uint32_t ddt_iov_count; - size_t iov_len; + uint32_t ddt_iov_count = 0; + size_t iov_len = 0; uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; - uint32_t cached_cuda_iov_count; - uint8_t cuda_iov_is_cached; + uint32_t cached_cuda_iov_count = 0; + uint8_t cuda_iov_is_cached = 0; + size_t cuda_iov_partial_length_start = 0; + size_t cuda_iov_partial_length_end = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, 
end_total; @@ -841,7 +843,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, 0, 0); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -881,14 +883,32 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif + if (pConvertor->current_iov_partial_length > 0) { + cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; + buffer_size -= cuda_iov_partial_length_start; + pConvertor->current_iov_partial_length = 0; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + source += cuda_iov_partial_length_start; + cuda_iov_start_pos ++; + nb_blocks_used ++; + } for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { - unpacked_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; - if (unpacked_w_cache <= buffer_size) { + if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; source += cached_cuda_iov_nb_bytes_list_h[i]; + unpacked_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; } else { - unpacked_w_cache -= cached_cuda_iov_nb_bytes_list_h[i]; + if (buffer_size > 0) { + 
cuda_iov_partial_length_end = buffer_size; + unpacked_w_cache += cuda_iov_partial_length_end; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + source += cuda_iov_partial_length_end; + pConvertor->current_iov_partial_length = cached_cuda_iov_nb_bytes_list_h[i] - cuda_iov_partial_length_end; + nb_blocks_used ++; + } + buffer_size = 0; buffer_isfull = 1; break; } @@ -898,10 +918,14 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - pConvertor->current_cuda_iov_pos += nb_blocks_used; + if (pConvertor->current_iov_partial_length > 0) { + pConvertor->current_cuda_iov_pos += nb_blocks_used - 1; + } else { + pConvertor->current_cuda_iov_pos += nb_blocks_used; + } cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 6dd3b4cf879..c8c3fd7db45 100644 --- a/test/datatype/ddt_benchmark.c +++ 
b/test/datatype/ddt_benchmark.c @@ -1216,7 +1216,7 @@ int main( int argc, char* argv[] ) printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { - local_copy_with_convertor(pdt, 1, 4000000, mat_size); + local_copy_with_convertor(pdt, 1, 40000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From deb67ec0cc05c79a065aeb72729706938f545614 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 11 Nov 2015 00:51:37 -0500 Subject: [PATCH 055/190] checkpoint, fix unpack size --- opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 66d72995e26..84d5bd5ea1d 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -885,6 +885,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; + unpacked_w_cache += cuda_iov_partial_length_start; buffer_size -= cuda_iov_partial_length_start; pConvertor->current_iov_partial_length = 0; cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; From 56c9fa4713a2b3327d8a9075bd24e7e43c0e2815 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 11 Nov 2015 17:49:25 -0500 Subject: [PATCH 056/190] checkpoint, during unpack, cache the entire iov before unpack --- opal/datatype/cuda/opal_datatype_cuda.cu | 14 ++ opal/datatype/cuda/opal_datatype_cuda.cuh | 2 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 3 +- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 172 ++++++------------ 4 files changed, 75 insertions(+), 116 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 18494bcba70..471c6e63709 100644 --- 
a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -351,6 +351,20 @@ uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) return tmp->cuda_iov_is_cached; } +void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count) +{ + int i; + size_t iov_size = 0; + for(i = 0; i < cuda_iov_count; i++) { + iov_size += cached_cuda_iov_nb_bytes_list_h[i]; + if (iov_size > ddt_offset) { + convertor->current_iov_partial_length = iov_size - ddt_offset; + convertor->current_cuda_iov_pos = i; + break; + } + } +} + void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) { #if 0 diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 6c071188c2c..8e30726ace2 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -129,6 +129,8 @@ uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor); void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); +void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 55cb955808e..8236692cad9 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -937,7 +937,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *destination, *destination_base, 
*source_base, *source; - size_t total_packed, packed_w_cache ,packed_wo_cache; + size_t total_packed = 0, packed_w_cache = 0, packed_wo_cache = 0; int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; uint32_t convertor_flags; @@ -1207,7 +1207,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* break; } } - printf("nb_blocks_used %d, my %d\n", nb_blocks_used, i - cuda_iov_start_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 84d5bd5ea1d..549b58a9986 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -632,7 +632,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *source, *source_base, *destination_base, *destination; - size_t total_unpacked, unpacked_wo_cache, unpacked_w_cache; + size_t total_unpacked = 0, unpacked_wo_cache = 0, unpacked_w_cache = 0; int32_t complete_flag = 0; uint8_t buffer_isfull = 0; uint8_t free_required = 0; @@ -744,132 +744,74 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); - iov_start_pos = pConvertor->current_iov_pos; - iov_end_pos = iov_start_pos + 
IOV_PIPELINE_SIZE; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - - while (iov_start_pos < iov_end_pos && !buffer_isfull) { - - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; - cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov pos %d\n", pConvertor->current_cuda_iov_pos);); - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); - #if defined (OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { - if (pConvertor->current_iov_partial_length > 0) { - iov_len = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - iov_len = ddt_iov[i].iov_len; - } - if (buffer_size >= iov_len) { - length_per_iovec = iov_len; - } else { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - length_per_iovec = buffer_size / orig_alignment * orig_alignment; - buffer_isfull = 1; - pConvertor->current_iov_partial_length = iov_len - length_per_iovec; - pConvertor->current_iov_pos = i; - } - buffer_size -= length_per_iovec; - unpacked_wo_cache += length_per_iovec; - destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; + for (i = 0; i < ddt_iov_count; i++) { + length_per_iovec = ddt_iov[i].iov_len; + 
destination = (size_t)(ddt_iov[i].iov_base) + destination_base; - alignment = ALIGNMENT_DOUBLE; + alignment = ALIGNMENT_DOUBLE; - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; - } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; - } + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; + } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = 
cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - } + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + } - /* handle residue */ - if (residue_desc != 0) { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + /* handle residue */ + if (residue_desc != 0) { + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - source += 
cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - } - } - - if (!buffer_isfull) { - pConvertor->current_iov_pos = i; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; } - + } + + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, nb_blocks_used %d\n", source_base, total_time, nb_blocks_used); ); #endif - - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, 
cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, 0, 0); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; - pConvertor->current_cuda_iov_pos += nb_blocks_used; - - iov_start_pos = iov_end_pos; - iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; - if (iov_end_pos >= ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - /* finished */ - if (pConvertor->current_iov_pos == ddt_iov_count) { - pConvertor->current_count ++; - opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); - } - DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); - - } } - total_unpacked += unpacked_wo_cache; - pConvertor->bConverted += unpacked_wo_cache; + #if 1 /* now we use cached cuda iov */ if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { + opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; @@ -878,11 +820,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, 
cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif + printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; unpacked_w_cache += cuda_iov_partial_length_start; @@ -919,11 +860,14 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif + /* if (pConvertor->current_iov_partial_length > 0) { - pConvertor->current_cuda_iov_pos += nb_blocks_used - 1; - } else { - pConvertor->current_cuda_iov_pos += nb_blocks_used; - } + pConvertor->current_cuda_iov_pos += nb_blocks_used - 1; + } else { + pConvertor->current_cuda_iov_pos += nb_blocks_used; + } */ + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); From f3e03bdd547dce11704218ade4cb432264ddce22 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 11 Nov 2015 21:07:45 -0500 Subject: [PATCH 057/190] another checkpoint --- .../cuda/opal_datatype_pack_cuda_wrapper.cu 
| 186 ++++++------------ .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 28 +-- test/datatype/ddt_benchmark.c | 6 +- test/datatype/ddt_lib.h | 4 +- 4 files changed, 86 insertions(+), 138 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 8236692cad9..016b8294b8f 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -948,19 +948,21 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; - uintptr_t *cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; + uintptr_t *cuda_iov_contig_buf_h_current = NULL; + uintptr_t *cuda_iov_contig_buf_d_current = NULL; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; size_t iov_len = 0; uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; - ddt_cuda_iov_total_cached_t* cached_cuda_iov; - ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; - uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; @@ -1036,8 +1038,6 @@ int32_t 
opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; - cuda_iov_is_cached = cached_cuda_iov->cuda_iov_is_cached; DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1052,133 +1052,75 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { - - iov_start_pos = pConvertor->current_iov_pos; - iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - - while (iov_start_pos < iov_end_pos && !buffer_isfull) { - - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; - cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = 
cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { - if (pConvertor->current_iov_partial_length > 0) { - iov_len = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - iov_len = ddt_iov[i].iov_len; - } - if (buffer_size >= iov_len) { - length_per_iovec = iov_len; + for (i = 0; i < ddt_iov_count; i++) { + length_per_iovec = ddt_iov[i].iov_len; + source = (size_t)(ddt_iov[i].iov_base) + source_base; + + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; } else { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - length_per_iovec = buffer_size / orig_alignment * orig_alignment; - buffer_isfull = 1; - pConvertor->current_iov_partial_length = iov_len - length_per_iovec; - pConvertor->current_iov_pos = i; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; } - buffer_size -= length_per_iovec; - packed_wo_cache += length_per_iovec; - source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - 
iov_len) + source_base; - - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; - if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; - } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; - } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - - /* handle residue */ - if (residue_desc != 0) { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * 
alignment - source_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - } - - if (!buffer_isfull) { - pConvertor->current_iov_pos = i; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, 
src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } - + } + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, nb_blocks %d\n", destination_base, total_time, nb_blocks_used); ); #endif - - // opal_ddt_check_cuda_iov_is_full(pConvertor, pConvertor->current_cuda_iov_pos + nb_blocks_used); /* make sure cuda iov has enough space */ - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 
cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; - pConvertor->current_cuda_iov_pos += nb_blocks_used; - - // orig_stack_index = pStack->index; - iov_start_pos = iov_end_pos; - iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; - if (iov_end_pos >= ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - /* count = 0 done, iov cached finished */ - if (pConvertor->current_iov_pos == ddt_iov_count) { - pConvertor->current_count ++; - opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); - } - DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); - } } - total_packed += packed_wo_cache; - pConvertor->bConverted += packed_wo_cache; - + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; /* now we use cached cuda iov */ if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 549b58a9986..f7427dd861e 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -644,19 +644,21 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; - 
uintptr_t *cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; + uintptr_t *cuda_iov_contig_buf_h_current = NULL; + uintptr_t *cuda_iov_contig_buf_d_current = NULL; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; size_t iov_len = 0; uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; - ddt_cuda_iov_total_cached_t* cached_cuda_iov; - ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; - uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; size_t cuda_iov_partial_length_start = 0; @@ -728,8 +730,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; - cuda_iov_is_cached = cached_cuda_iov->cuda_iov_is_cached; DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -796,11 +796,11 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ } } - cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + 
cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -808,6 +808,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif } + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + #if 1 /* now we use cached cuda iov */ if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index c8c3fd7db45..bab37e059c4 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1306,13 +1306,13 @@ int main( int argc, char* argv[] ) OBJ_RELEASE( pdt ); assert( pdt == NULL ); } - for (blk_len = 2000; blk_len <= 2000; blk_len += 500) { + for (blk_len = 20; blk_len <= 20; blk_len += 500) { printf( ">>--------------------------------------------<<\n" ); printf( "Vector data-type (60000 times %d double stride 512)\n", blk_len ); pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 0; i < 4; i++) { - // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); + for (i = 0; i < 1; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } diff --git a/test/datatype/ddt_lib.h b/test/datatype/ddt_lib.h 
index 0f6bbc2cb37..ef462ce0f31 100644 --- a/test/datatype/ddt_lib.h +++ b/test/datatype/ddt_lib.h @@ -34,9 +34,9 @@ #define DUMP_DATA_AFTER_COMMIT 0x00000001 #define CHECK_PACK_UNPACK 0x00000002 -#define TEST_DOUBLE +//#define TEST_DOUBLE //#define TEST_FLOAT -//#define TEST_CHAR +#define TEST_CHAR extern uint32_t outputFlags; From f524be68d886a9ce07f8cb9800378e9deb9d56ec Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 11 Nov 2015 22:58:09 -0500 Subject: [PATCH 058/190] checkpoint , remove unnecessary cuda stream sync --- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 11 ----------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 14 +------------- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 016b8294b8f..3509ac2de6b 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -1106,8 +1106,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* } } cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1116,9 +1114,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, nb_blocks %d\n", destination_base, total_time, nb_blocks_used); ); #endif } - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } cached_cuda_iov_count = 
cached_cuda_iov->cuda_iov_count; @@ -1132,8 +1127,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -1158,10 +1151,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; } for (i = 0; i < NB_STREAMS; i++) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index f7427dd861e..062b75f7224 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -797,8 +797,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ } cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); 
opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -808,13 +806,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif } - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; - -#if 1 + /* now we use cached cuda iov */ if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); @@ -872,17 +865,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ } else { pConvertor->current_cuda_iov_pos += nb_blocks_used; } */ - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; } -#endif for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); From 4b76d89930e26be772bb8eed801edddd73fcfcc5 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 12 Nov 2015 22:56:40 -0500 Subject: [PATCH 059/190] use bit to replace % --- 
.../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 35 +++++++------- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 19 +++----- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 48 +++++++++---------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 25 +++------- test/datatype/ddt_benchmark.c | 17 ++++++- 6 files changed, 70 insertions(+), 78 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index b7e8e9405f6..b1c36b66e14 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -138,9 +138,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int output_id, const char *format, ...); diff --git 
a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index e85b83e55b5..97c6c69aeff 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,16 +88,17 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base) { - uint32_t i, j; + uint32_t i, j, _nb_bytes; size_t src_offset; unsigned char *dst; unsigned char *_source_tmp, *_destination_tmp; + uint32_t current_cuda_iov_pos = cuda_iov_pos; __shared__ uint32_t nb_tasks; - __shared__ uint32_t copy_count; - __shared__ uint8_t alignment; + uint32_t copy_count; + uint8_t alignment; if (threadIdx.x == 0) { nb_tasks = nb_blocks_used / gridDim.x; @@ -109,24 +110,20 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; dst = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; - if (threadIdx.x == 0) { - _source_tmp = source_base + src_offset; - _destination_tmp = dst; - uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes 
% ALIGNMENT_DOUBLE == 0) { - alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { - alignment = ALIGNMENT_FLOAT; - } else { - alignment = ALIGNMENT_CHAR; - } - copy_count = _nb_bytes / alignment; + _source_tmp = source_base + src_offset; + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + if ((uintptr_t)(_source_tmp) & 0x7 == 0 && (uintptr_t)dst & 0x7 == 0 && _nb_bytes & 0x7 == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) & 0x3 == 0 && (uintptr_t)dst & 0x3 == 0 && _nb_bytes & 0x3 == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; } - __syncthreads(); + copy_count = _nb_bytes / alignment; for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 3509ac2de6b..1d14c000977 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -937,7 +937,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *destination, *destination_base, *source_base, *source; - size_t total_packed = 0, packed_w_cache = 0, packed_wo_cache = 0; + size_t total_packed; int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; uint32_t convertor_flags; @@ -1017,8 +1017,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); total_packed = 0; - packed_wo_cache = 0; - packed_w_cache = 0; cuda_streams->current_stream_id = 0; // orig_stack_index = 
pStack->index; destination_base = destination; @@ -1093,7 +1091,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* handle residue */ if (residue_desc != 0) { /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) @@ -1111,7 +1108,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, nb_blocks %d\n", destination_base, total_time, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack cached cuda iov is prepared in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used); ); #endif } @@ -1125,7 +1122,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -1134,7 +1130,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; destination += cached_cuda_iov_nb_bytes_list_h[i]; - packed_w_cache += 
cached_cuda_iov_nb_bytes_list_h[i]; + total_packed += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used++; } else { @@ -1147,18 +1143,17 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - pConvertor->current_cuda_iov_pos += nb_blocks_used; cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + pConvertor->current_cuda_iov_pos += nb_blocks_used; } for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } - - total_packed += packed_w_cache; - pConvertor->bConverted += packed_w_cache; + + pConvertor->bConverted += total_packed; DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index c553a7991b0..7eb179a0a42 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,16 +46,18 @@ __global__ void 
opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; size_t dst_offset; unsigned char *src; unsigned char *_source_tmp, *_destination_tmp; + uint32_t _nb_bytes; + uint32_t current_cuda_iov_pos = cuda_iov_pos; __shared__ uint32_t nb_tasks; - __shared__ uint32_t copy_count; - __shared__ uint8_t alignment; + uint32_t copy_count; + uint8_t alignment; if (threadIdx.x == 0) { nb_tasks = nb_blocks_used / gridDim.x; @@ -67,32 +69,28 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ for (i = 0; i < nb_tasks; i++) { src = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { // if (threadIdx.x == 0) printf("cuda_iov_partial_length_start %d", cuda_iov_partial_length_start); - dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes - cuda_iov_partial_length_start; + dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes - cuda_iov_partial_length_start; } - if (threadIdx.x == 0) { - _source_tmp = src; - _destination_tmp = destination_base + dst_offset; - uint32_t _nb_bytes = 0; - if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { 
- _nb_bytes = cuda_iov_partial_length_start; - } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { - _nb_bytes = cuda_iov_partial_length_end; - } else { - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; - } - if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { - alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_source_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { - alignment = ALIGNMENT_FLOAT; - } else { - alignment = ALIGNMENT_CHAR; - } - copy_count = _nb_bytes / alignment; + _destination_tmp = destination_base + dst_offset; + + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + _nb_bytes = cuda_iov_partial_length_start; + } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { + _nb_bytes = cuda_iov_partial_length_end; + } else { + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; + } + if ((uintptr_t)(_destination_tmp) & 0x7 == 0 && (uintptr_t)src & 0x7 == 0 && _nb_bytes & 0x7 == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_destination_tmp) & 0x3 == 0 && (uintptr_t)src & 0x3 == 0 && _nb_bytes & 0x3 == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; } - __syncthreads(); + copy_count = _nb_bytes / alignment; for (j = threadIdx.x; j < copy_count; j += blockDim.x) { /* if (threadIdx.x == 0) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 062b75f7224..50009710d2d 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -632,7 +632,7 @@ int32_t 
opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *source, *source_base, *destination_base, *destination; - size_t total_unpacked = 0, unpacked_wo_cache = 0, unpacked_w_cache = 0; + size_t total_unpacked; int32_t complete_flag = 0; uint8_t buffer_isfull = 0; uint8_t free_required = 0; @@ -717,8 +717,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif buffer_size = iov[0].iov_len; total_unpacked = 0; - unpacked_wo_cache = 0; - unpacked_w_cache = 0; cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; @@ -802,7 +800,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, nb_blocks_used %d\n", source_base, total_time, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cached cuda iov is prepared in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); #endif } @@ -825,7 +823,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; - unpacked_w_cache += cuda_iov_partial_length_start; + total_unpacked += cuda_iov_partial_length_start; buffer_size -= cuda_iov_partial_length_start; pConvertor->current_iov_partial_length = 0; cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; @@ -837,13 +835,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (buffer_size >= 
cached_cuda_iov_nb_bytes_list_h[i]) { cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; source += cached_cuda_iov_nb_bytes_list_h[i]; - unpacked_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; } else { if (buffer_size > 0) { cuda_iov_partial_length_end = buffer_size; - unpacked_w_cache += cuda_iov_partial_length_end; + total_unpacked += cuda_iov_partial_length_end; cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; source += cuda_iov_partial_length_end; pConvertor->current_iov_partial_length = cached_cuda_iov_nb_bytes_list_h[i] - cuda_iov_partial_length_end; @@ -859,25 +857,16 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - /* - if (pConvertor->current_iov_partial_length > 0) { - pConvertor->current_cuda_iov_pos += nb_blocks_used - 1; - } else { - pConvertor->current_cuda_iov_pos += nb_blocks_used; - } */ cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, 
pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); } for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } - total_unpacked += unpacked_w_cache; - pConvertor->bConverted += unpacked_w_cache; + pConvertor->bConverted += total_unpacked; DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); iov[0].iov_len = total_unpacked; diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index bab37e059c4..d961ef34e4e 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -793,6 +793,8 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk int32_t length = 0, done1 = 0, done2 = 0; TIMER_DATA_TYPE start, end, unpack_start, unpack_end; long total_time, unpack_time = 0; + int j, t_error = 0; + unsigned char *mat_char; dt_length = compute_buffer_length(pdt, count); printf("length %lu\n", dt_length); @@ -890,7 +892,18 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk if( done1 == 0 ) { done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } +#if defined (TEST_CHAR) + /* mat_char = (unsigned char *)ptemp; + for (j = 0; j < max_data; j++) { + if (mat_char[j] != 'a') { + t_error ++; + printf("error %d, %c\n", j, mat_char[j]); + } + } + printf("total error %d\n", t_error);*/ +#endif if( done2 == 0 ) { GET_TIME( unpack_start ); @@ -1306,13 +1319,13 @@ int main( int argc, char* argv[] ) OBJ_RELEASE( pdt ); assert( pdt == NULL ); } - for (blk_len = 20; blk_len <= 20; blk_len += 500) { + for (blk_len = 51; blk_len <= 51; blk_len += 500) { printf( ">>--------------------------------------------<<\n" ); printf( "Vector data-type (60000 times %d double stride 512)\n", blk_len ); pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { 
for (i = 0; i < 1; i++) { - // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } From ff3a896ee3b7f55ca154a4d13dc8070f2597f6ee Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 12 Nov 2015 23:27:59 -0500 Subject: [PATCH 060/190] rollback to use % instead of bitwise tests: due to C operator precedence, `x & 0x7 == 0` parses as `x & (0x7 == 0)`, which is always false, so the bitwise version silently forced byte-wise (ALIGNMENT_CHAR) copies --- opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu | 6 +++--- opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 97c6c69aeff..93fb188ddcd 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -116,9 +116,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di _source_tmp = source_base + src_offset; /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - if ((uintptr_t)(_source_tmp) & 0x7 == 0 && (uintptr_t)dst & 0x7 == 0 && _nb_bytes & 0x7 == 0) { + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)dst % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) & 0x3 == 0 && (uintptr_t)dst & 0x3 == 0 && _nb_bytes & 0x3 == 0) { + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)dst % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -141,4 +141,4 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di } } } -} \ No newline at end of file +} diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 
7eb179a0a42..f98a8c0b2ea 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -83,9 +83,9 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ } else { _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; } - if ((uintptr_t)(_destination_tmp) & 0x7 == 0 && (uintptr_t)src & 0x7 == 0 && _nb_bytes & 0x7 == 0) { + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)src % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_destination_tmp) & 0x3 == 0 && (uintptr_t)src & 0x3 == 0 && _nb_bytes & 0x3 == 0) { + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)src % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; From 141cbbf54cc82f2529780423758c3ea1c9e6e38a Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 13 Nov 2015 16:48:09 -0500 Subject: [PATCH 061/190] now cuda iov is {nc_disp, c_disp} --- .../cuda/opal_datatype_cuda_internal.cuh | 8 ++-- .../cuda/opal_datatype_pack_cuda_kernel.cu | 22 ++++++----- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 35 +++++++++-------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 35 +++++++++-------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 39 +++++++++---------- test/datatype/ddt_benchmark.c | 4 +- 6 files changed, 73 insertions(+), 70 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index b1c36b66e14..ea4afa0b989 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -59,8 +59,8 @@ typedef struct { } ddt_cuda_iov_dist_non_cached_t; typedef struct { - size_t ptr_offset; - uint32_t nb_bytes; + size_t ncontig_disp; + size_t contig_disp; } 
ddt_cuda_iov_dist_cached_t; typedef struct { @@ -138,9 +138,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 93fb188ddcd..ddfd68b0e4c 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,13 +88,14 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, 
unsigned char* source_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { - uint32_t i, j, _nb_bytes; - size_t src_offset; - unsigned char *dst; + uint32_t i, j; + size_t _nb_bytes; + size_t src_offset, dst_offset; unsigned char *_source_tmp, *_destination_tmp; uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -110,15 +111,16 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; - dst = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - destination_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; _source_tmp = source_base + src_offset; + _destination_tmp = destination_base + dst_offset; /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)dst % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)dst % 
ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -128,7 +130,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; - _destination_tmp = dst + j * alignment; + _destination_tmp = destination_base + dst_offset + j * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 1d14c000977..f1ce6dbda7d 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -965,6 +965,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; + size_t destionation_disp = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -1073,17 +1074,18 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + j * 
thread_per_block * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -1091,18 +1093,21 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* handle residue */ if (residue_desc != 0) { /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + 
cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } } - cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1128,8 +1133,6 @@ int32_t 
opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #endif for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; - destination += cached_cuda_iov_nb_bytes_list_h[i]; total_packed += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used++; @@ -1143,9 +1146,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); +// cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base, destination_base); pConvertor->current_cuda_iov_pos += nb_blocks_used; } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index f98a8c0b2ea..9cf705ae7e3 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ 
b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,15 +46,17 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; - size_t dst_offset; + size_t dst_offset, src_offset; unsigned char *src; unsigned char *_source_tmp, *_destination_tmp; - uint32_t _nb_bytes; - uint32_t current_cuda_iov_pos = cuda_iov_pos; - + size_t _nb_bytes; + uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + __shared__ uint32_t nb_tasks; uint32_t copy_count; uint8_t alignment; @@ -68,24 +70,23 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; - if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - // if (threadIdx.x == 0) printf("cuda_iov_partial_length_start %d", cuda_iov_partial_length_start); - dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes - cuda_iov_partial_length_start; - 
} - _destination_tmp = destination_base + dst_offset; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - source_disp - source_partial_disp; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + src_offset = cuda_iov_dist[current_cuda_iov_pos].contig_disp - source_disp; + dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; _nb_bytes = cuda_iov_partial_length_start; } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { _nb_bytes = cuda_iov_partial_length_end; - } else { - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; } - if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)src % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + + _destination_tmp = destination_base + dst_offset; + _source_tmp = source_base + src_offset; + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)src % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -97,7 +98,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); }*/ if (j < copy_count) { - _source_tmp = src + j 
* alignment; + _source_tmp = source_base + src_offset + j * alignment; _destination_tmp = destination_base + dst_offset + j * alignment; /* if (threadIdx.x == 0) { printf("_src %p, dst %p, alignment %d, blk %d, j %d, count %d\n", _source_tmp, _destination_tmp, alignment, blockIdx.x, j, copy_count); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 50009710d2d..dc356d96471 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -663,6 +663,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint8_t cuda_iov_is_cached = 0; size_t cuda_iov_partial_length_start = 0; size_t cuda_iov_partial_length_end = 0; + size_t source_disp = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -765,17 +766,18 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + j * thread_per_block * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; + 
cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; } @@ -783,18 +785,20 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (residue_desc != 0) { /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* 
OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; } } - - cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice); + /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; + cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice); opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -826,15 +830,11 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_unpacked += cuda_iov_partial_length_start; buffer_size -= cuda_iov_partial_length_start; pConvertor->current_iov_partial_length = 0; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - source += cuda_iov_partial_length_start; cuda_iov_start_pos ++; nb_blocks_used ++; } for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - source += 
cached_cuda_iov_nb_bytes_list_h[i]; total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; @@ -842,9 +842,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (buffer_size > 0) { cuda_iov_partial_length_end = buffer_size; total_unpacked += cuda_iov_partial_length_end; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - source += cuda_iov_partial_length_end; - pConvertor->current_iov_partial_length = cached_cuda_iov_nb_bytes_list_h[i] - cuda_iov_partial_length_end; nb_blocks_used ++; } buffer_size = 0; @@ -859,7 +856,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); } for (i = 0; i < NB_STREAMS; i++) { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index d961ef34e4e..e879e5c0192 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -895,14 +895,14 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk } #if defined (TEST_CHAR) - /* mat_char = (unsigned char *)ptemp; + mat_char = (unsigned char *)ptemp; for (j = 0; j < max_data; j++) { 
if (mat_char[j] != 'a') { t_error ++; printf("error %d, %c\n", j, mat_char[j]); } } - printf("total error %d\n", t_error);*/ + printf("total error %d\n", t_error); #endif if( done2 == 0 ) { From b2f66114eb32d25be08825ce1ca5ab866f428a27 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 13 Nov 2015 18:33:48 -0500 Subject: [PATCH 062/190] clean up kernel, put variables used multiple times into registers --- .../datatype/cuda/opal_datatype_pack_cuda_kernel.cu | 8 +++++--- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 13 +++++++------ .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 1 - 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index ddfd68b0e4c..92a96d1cb2b 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -91,11 +91,12 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, j; - size_t _nb_bytes; + uint32_t _nb_bytes; size_t src_offset, dst_offset; unsigned char *_source_tmp, *_destination_tmp; uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -111,9 +112,10 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di __syncthreads(); for (i = 0; i < nb_tasks; i++) { + contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp;
- dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - destination_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; + dst_offset = contig_disp - destination_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; _source_tmp = source_base + src_offset; _destination_tmp = destination_base + dst_offset; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 9cf705ae7e3..f2c337ea682 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -50,12 +50,12 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ { uint32_t i, j; size_t dst_offset, src_offset; - unsigned char *src; unsigned char *_source_tmp, *_destination_tmp; - size_t _nb_bytes; + uint32_t _nb_bytes; uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; - size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + size_t contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -70,12 +70,13 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - source_disp - source_partial_disp; + contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used 
multiple times, so put in in register */ + src_offset = contig_disp - source_disp - source_partial_disp; dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - src_offset = cuda_iov_dist[current_cuda_iov_pos].contig_disp - source_disp; + src_offset = contig_disp - source_disp; dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; _nb_bytes = cuda_iov_partial_length_start; } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index dc356d96471..d400e05efcf 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -854,7 +854,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, 
pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); } From a59950f8bb1fdd21943976394c44bf96ba859b3b Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 13 Nov 2015 22:18:41 -0500 Subject: [PATCH 063/190] another checkpoint --- opal/datatype/cuda/opal_datatype_cuda.cu | 9 ++++ .../cuda/opal_datatype_pack_cuda_wrapper.cu | 26 +++++----- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 6 ++- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 47 ++++++++++--------- 4 files changed, 53 insertions(+), 35 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 471c6e63709..3213a3b43fd 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -355,12 +355,21 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d { int i; size_t iov_size = 0; + convertor->current_iov_partial_length = 0; + convertor->current_cuda_iov_pos = 0; + if (ddt_offset == 0) { + return; + } for(i = 0; i < cuda_iov_count; i++) { iov_size += cached_cuda_iov_nb_bytes_list_h[i]; if (iov_size > ddt_offset) { convertor->current_iov_partial_length = iov_size - ddt_offset; convertor->current_cuda_iov_pos = i; break; + } else if (iov_size == ddt_offset){ + convertor->current_iov_partial_length = 0; + convertor->current_cuda_iov_pos = i+1; + break; } } } diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index f1ce6dbda7d..36ce4e3951d 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -1117,17 +1117,14 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #endif } - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; - /* now we use cached cuda iov */ - if( pConvertor->bConverted != 
pConvertor->local_size && !buffer_isfull) { - cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; - cuda_iov_end_pos = cached_cuda_iov_count; - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + + if( pConvertor->current_count < pConvertor->count && !buffer_isfull) { #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -1141,16 +1138,21 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* break; } } + if (!buffer_isfull) { + pConvertor->current_count ++; + cuda_iov_start_pos = 0; + cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; + } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif -// cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + } DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, 
source_base, destination_base); pConvertor->current_cuda_iov_pos += nb_blocks_used; - } + pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index f2c337ea682..b58cff27cf3 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -54,7 +54,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ uint32_t _nb_bytes; uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; - size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + size_t source_partial_disp = 0; size_t contig_disp; __shared__ uint32_t nb_tasks; @@ -69,6 +69,10 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ } __syncthreads(); + if (cuda_iov_partial_length_start != 0) { + source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + } + for (i = 0; i < nb_tasks; i++) { contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ src_offset = contig_disp - source_disp - source_partial_disp; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index d400e05efcf..70d9d10465e 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -807,32 +807,30 @@ int32_t 
opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cached cuda iov is prepared in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); #endif } - - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; /* now we use cached cuda iov */ - if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { - opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); - cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; - cuda_iov_end_pos = cached_cuda_iov_count; - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + + printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); + if (pConvertor->current_iov_partial_length > 0) { + cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; + total_unpacked += cuda_iov_partial_length_start; + buffer_size -= cuda_iov_partial_length_start; + pConvertor->current_iov_partial_length = 0; + cuda_iov_start_pos ++; + nb_blocks_used ++; + } + + while( 
pConvertor->current_count < pConvertor->count && !buffer_isfull) { #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); - if (pConvertor->current_iov_partial_length > 0) { - cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; - total_unpacked += cuda_iov_partial_length_start; - buffer_size -= cuda_iov_partial_length_start; - pConvertor->current_iov_partial_length = 0; - cuda_iov_start_pos ++; - nb_blocks_used ++; - } for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; @@ -849,14 +847,19 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ break; } } + if (!buffer_isfull) { + pConvertor->current_count ++; + cuda_iov_start_pos = 0; + cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; + } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif + } DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - } for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); From e3cb0eeb4853d9a566e160a533398821e9b5b477 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Sat, 14 Nov 2015 01:40:55 -0500 
Subject: [PATCH 064/190] now convertor->count > 1 is woring --- opal/datatype/cuda/opal_datatype_cuda.cu | 3 +++ .../cuda/opal_datatype_cuda_internal.cuh | 4 +-- .../cuda/opal_datatype_pack_cuda_kernel.cu | 27 ++++++++++++++----- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 27 +++++++++++-------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 27 ++++++++++++++----- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 22 ++++++++------- 6 files changed, 74 insertions(+), 36 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 3213a3b43fd..ec33b5c0e4d 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -355,11 +355,14 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d { int i; size_t iov_size = 0; + size_t ddt_size; convertor->current_iov_partial_length = 0; convertor->current_cuda_iov_pos = 0; if (ddt_offset == 0) { return; } + opal_datatype_type_size(convertor->pDesc, &ddt_size); + ddt_offset = ddt_offset % ddt_size; for(i = 0; i < cuda_iov_count; i++) { iov_size += cached_cuda_iov_nb_bytes_list_h[i]; if (iov_size > ddt_offset) { diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index ea4afa0b989..82a28420580 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -138,9 +138,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void 
opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 92a96d1cb2b..2564fe1393c 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,7 +88,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, j; uint32_t _nb_bytes; @@ -97,6 +97,9 @@ __global__ void 
opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -107,15 +110,20 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di if (blockIdx.x < (nb_blocks_used % gridDim.x)) { nb_tasks ++; } - // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d\n", cuda_iov_count, ddt_extent, current_count); + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); } __syncthreads(); for (i = 0; i < nb_tasks; i++) { - contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; - dst_offset = contig_disp - destination_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + dst_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - destination_disp; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; _source_tmp = source_base + 
src_offset; _destination_tmp = destination_base + dst_offset; @@ -128,7 +136,12 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di alignment = ALIGNMENT_CHAR; } copy_count = _nb_bytes / alignment; - + /* + if (threadIdx.x == 0 && nb_tasks != 0) { + printf("pack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 36ce4e3951d..fc9181e902b 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -966,6 +966,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; size_t destionation_disp = 0; + opal_datatype_count_t convertor_current_count; + OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -1118,16 +1120,18 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* } /* now we use cached cuda iov */ + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; - cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; + cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + convertor_current_count = pConvertor->current_count; - if( pConvertor->current_count < pConvertor->count && !buffer_isfull) { #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); 
#endif + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { total_packed += cached_cuda_iov_nb_bytes_list_h[i]; @@ -1143,16 +1147,17 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_iov_start_pos = 0; cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; } + } #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - } - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base, destination_base); - pConvertor->current_cuda_iov_pos += nb_blocks_used; - pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, 
cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, source_base, destination_base); + pConvertor->current_cuda_iov_pos += nb_blocks_used; + pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index b58cff27cf3..f6ee8e0bfc4 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,7 +46,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; size_t dst_offset, src_offset; @@ -56,6 +56,9 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; size_t source_partial_disp = 0; size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -66,6 +69,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (blockIdx.x < 
nb_blocks_used % gridDim.x) { nb_tasks ++; } + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d, ddt_size %d\n", cuda_iov_count, ddt_extent, current_count, ddt_size); } __syncthreads(); @@ -74,13 +78,17 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ } for (i = 0; i < nb_tasks; i++) { - contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ - src_offset = contig_disp - source_disp - source_partial_disp; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp - source_partial_disp; + dst_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - src_offset = contig_disp - source_disp; + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp; dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; _nb_bytes = cuda_iov_partial_length_start; } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { @@ -97,7 +105,12 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ alignment = ALIGNMENT_CHAR; } copy_count = _nb_bytes / alignment; - + /* + if (threadIdx.x == 0 && 
nb_tasks != 0) { + printf("unpack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ for (j = threadIdx.x; j < copy_count; j += blockDim.x) { /* if (threadIdx.x == 0) { if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 70d9d10465e..49355e8e017 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -664,6 +664,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ size_t cuda_iov_partial_length_start = 0; size_t cuda_iov_partial_length_end = 0; size_t source_disp = 0; + opal_datatype_count_t convertor_current_count; + OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -816,6 +818,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + convertor_current_count = pConvertor->current_count; printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); if (pConvertor->current_iov_partial_length > 0) { @@ -827,10 +830,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ nb_blocks_used ++; } - while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if 
(buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; @@ -850,16 +853,17 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (!buffer_isfull) { pConvertor->current_count ++; cuda_iov_start_pos = 0; - cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; + cuda_iov_end_pos = cached_cuda_iov_count; } + } #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - } - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); for (i = 0; i < NB_STREAMS; i++) { 
cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); From 30d493b741732838d487f85764e45e38fec03883 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 16:02:11 -0500 Subject: [PATCH 065/190] move the cuda iov caching into a seperate function --- opal/datatype/cuda/opal_datatype_cuda.cu | 79 +++++++++++++ opal/datatype/cuda/opal_datatype_cuda.cuh | 2 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 110 +++--------------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 107 ++--------------- 4 files changed, 105 insertions(+), 193 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index ec33b5c0e4d..5747eb2b3a5 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -325,6 +325,85 @@ void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } +/* cached_cuda_iov_d is not ready until explicitlt sync with cuda stream 0 +*/ +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t thread_per_block, nb_blocks_used; + size_t length_per_iovec; + uint8_t alignment; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; + ddt_cuda_iov_dist_cached_t *cuda_iov_h = NULL; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t ncontig_disp_base; + size_t contig_disp = 0; + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; + cuda_iov_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_stream_iov = 
cuda_iov_pipeline_block->cuda_stream; + + for (i = 0; i < ddt_iov_count; i++) { + length_per_iovec = ddt_iov[i].iov_len; + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; + } else { + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / 
alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + nb_blocks_used ++; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + } + } + /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ + cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cudaMemcpyAsync(cached_cuda_iov_d, cuda_iov_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + *cuda_iov_count = nb_blocks_used; + return OPAL_SUCCESS; +} + void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 8e30726ace2..4a71ab37d63 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -131,6 +131,8 @@ void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file 
diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index fc9181e902b..ddc2ec08a89 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -932,40 +932,21 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t* out_size, size_t* max_data ) { - uint32_t i, j; - uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t length, buffer_size, length_per_iovec; - unsigned char *destination, *destination_base, *source_base, *source; + size_t buffer_size; + unsigned char *destination, *destination_base, *source_base; size_t total_packed; - int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; - uint32_t convertor_flags; -// dt_elem_desc_t* description; -// dt_elem_desc_t* pElem; -// dt_stack_t* pStack; - uint8_t alignment, orig_alignment; -// int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; - uintptr_t *cuda_iov_contig_buf_h_current = NULL; - uintptr_t *cuda_iov_contig_buf_d_current = NULL; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; - const struct iovec *ddt_iov = NULL; - uint32_t ddt_iov_count = 0; - size_t iov_len = 0; - uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; + uint32_t cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; - uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; - uint8_t 
cuda_iov_is_cached = 0; - size_t destionation_disp = 0; opal_datatype_count_t convertor_current_count; OPAL_PTRDIFF_TYPE ddt_extent; @@ -973,14 +954,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* TIMER_DATA_TYPE start, end, start_total, end_total; long total_time, move_time; #endif - - /*description = pConvertor->use_desc->desc; - pStack = pConvertor->pStack + pConvertor->stack_pos; - pElem = &(description[pStack->index]); - printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); - */ - -// assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { @@ -1021,7 +994,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* total_packed = 0; cuda_streams->current_stream_id = 0; - // orig_stack_index = pStack->index; destination_base = destination; #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1032,14 +1004,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* GET_TIME(start); #endif - opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); - assert(ddt_iov != NULL); opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1053,69 +1022,20 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { - 
nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); - #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - - for (i = 0; i < ddt_iov_count; i++) { - length_per_iovec = ddt_iov[i].iov_len; - source = (size_t)(ddt_iov[i].iov_base) + source_base; - - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + j * thread_per_block * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; - if ( (j+1) * thread_per_block <= count_desc) { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; - } else { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, 
cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - - /* handle residue */ - if (residue_desc != 0) { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } + if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); + } else { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Pack cache cuda iov is failed\n");); + return OPAL_ERROR; } - /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; - cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); - 
DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack cached cuda iov is prepared in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack cuda iov is cached in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used); ); #endif } @@ -1124,7 +1044,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; convertor_current_count = pConvertor->current_count; @@ -1154,14 +1074,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, source_base, destination_base); 
pConvertor->current_cuda_iov_pos += nb_blocks_used; pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); pConvertor->bConverted += total_packed; DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 49355e8e017..fe8475a201a 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -627,43 +627,24 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint32_t* out_size, size_t* max_data ) { - uint32_t i, j; - uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t length, buffer_size, length_per_iovec; + size_t buffer_size; unsigned char *source, *source_base, *destination_base, *destination; size_t total_unpacked; - int32_t complete_flag = 0; uint8_t buffer_isfull = 0; uint8_t free_required = 0; - uint32_t convertor_flags; -// dt_elem_desc_t* description; -// dt_elem_desc_t* pElem; -// dt_stack_t* pStack; - uint8_t alignment, orig_alignment; -// int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; - uintptr_t *cuda_iov_contig_buf_h_current = NULL; - uintptr_t *cuda_iov_contig_buf_d_current = NULL; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; - const struct iovec *ddt_iov = NULL; - uint32_t ddt_iov_count = 0; - size_t iov_len = 0; - uint32_t 
iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; + uint32_t cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; - uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; - uint8_t cuda_iov_is_cached = 0; size_t cuda_iov_partial_length_start = 0; size_t cuda_iov_partial_length_end = 0; - size_t source_disp = 0; opal_datatype_count_t convertor_current_count; OPAL_PTRDIFF_TYPE ddt_extent; @@ -676,12 +657,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ GET_TIME(start_total); #endif -/* description = pConvertor->use_desc->desc; - pStack = pConvertor->pStack + pConvertor->stack_pos; - pElem = &(description[pStack->index]); - printf("size elem %d, size %lu\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); -*/ - #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -710,9 +685,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ move_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); #endif - -// cuda_err = cudaEventRecord(current_cuda_device->memcpy_event, current_cuda_device->cuda_streams->opal_cuda_stream[0]); -// opal_cuda_check_error(cuda_err); #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -721,17 +693,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ buffer_size = iov[0].iov_len; total_unpacked = 0; cuda_streams->current_stream_id = 0; - convertor_flags = pConvertor->flags; -// orig_stack_index = pStack->index; source_base = source; - opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); - assert(ddt_iov != NULL); opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); cached_cuda_iov_dist_d = 
cached_cuda_iov->cuda_iov_dist_d; assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -745,68 +712,17 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); - - #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - - for (i = 0; i < ddt_iov_count; i++) { - length_per_iovec = ddt_iov[i].iov_len; - destination = (size_t)(ddt_iov[i].iov_base) + destination_base; - - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + j * thread_per_block * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; - if ( (j+1) * thread_per_block <= count_desc) { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; - } else { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = 
(thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - } - - /* handle residue */ - if (residue_desc != 0) { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - } + if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); } - /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ - 
cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; - cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice); - opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cached cuda iov is prepared in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cuda iov is cached in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); #endif } @@ -816,11 +732,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; convertor_current_count = pConvertor->current_count; - printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; total_unpacked += cuda_iov_partial_length_start; @@ -862,12 +777,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); - 
DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); pConvertor->bConverted += total_unpacked; DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); From 8c830a67e24336fead3b5ac053205777bb050788 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 16:30:27 -0500 Subject: [PATCH 066/190] these two variables are useless now --- opal/datatype/cuda/opal_datatype_cuda.cu | 4 ---- opal/datatype/cuda/opal_datatype_cuda_internal.cuh | 2 -- 2 files changed, 6 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 5747eb2b3a5..a71099c41a3 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -223,8 +223,6 @@ int32_t opal_ddt_cuda_kernel_init(void) cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * 
CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_h)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_d)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); @@ -264,8 +262,6 @@ int32_t opal_ddt_cuda_kernel_fini(void) cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_contig_buf_h); - cudaFree(cuda_iov_pipeline_block->cuda_iov_contig_buf_d); cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 82a28420580..5e7bb41d0dc 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -74,8 +74,6 @@ typedef struct { ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_h; ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_d; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; - uintptr_t *cuda_iov_contig_buf_h; - uintptr_t *cuda_iov_contig_buf_d; cudaStream_t *cuda_stream; int32_t cuda_stream_id; cudaEvent_t cuda_event; From 8d1db8a2064f7d26fc6fae40c934bb6ed4e0464d Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 16:49:55 -0500 Subject: [PATCH 067/190] fix a bug for ib, current count of convertor should be set in set_cuda_iov_position --- opal/datatype/cuda/opal_datatype_cuda.cu | 2 ++ 1 file changed, 2 insertions(+) diff 
--git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index a71099c41a3..3129c320068 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -433,10 +433,12 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d size_t ddt_size; convertor->current_iov_partial_length = 0; convertor->current_cuda_iov_pos = 0; + convertor->current_count = 0; if (ddt_offset == 0) { return; } opal_datatype_type_size(convertor->pDesc, &ddt_size); + convertor->current_count = ddt_offset / ddt_size; ddt_offset = ddt_offset % ddt_size; for(i = 0; i < cuda_iov_count; i++) { iov_size += cached_cuda_iov_nb_bytes_list_h[i]; From 754b0d034f1369687d82f99cfb738be3a330e43d Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 21:00:09 -0500 Subject: [PATCH 068/190] cleanup, move cudamalloc into cache cuda iov --- opal/datatype/cuda/opal_datatype_cuda.cu | 61 +++++++++++++------ opal/datatype/cuda/opal_datatype_cuda.cuh | 2 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 14 ++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 13 ++-- 4 files changed, 56 insertions(+), 34 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 3129c320068..d0927dc4162 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -281,15 +281,13 @@ void* opal_ddt_cached_cuda_iov_init(uint32_t size) { #if OPAL_DATATYPE_CUDA_IOV_CACHE ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)malloc(sizeof(ddt_cuda_iov_total_cached_t)); - ddt_cuda_iov_dist_cached_t *tmp_cuda_iov_d = NULL; - cudaMalloc((void **)(&tmp_cuda_iov_d), sizeof(ddt_cuda_iov_dist_cached_t) * size); uint32_t *tmp_nb_bytes = (uint32_t *)malloc(sizeof(uint32_t) * size); - if (tmp != NULL && tmp_cuda_iov_d != NULL && tmp_nb_bytes != NULL) { - tmp->cuda_iov_dist_d = tmp_cuda_iov_d; + if (tmp != NULL && tmp_nb_bytes != 
NULL) { + tmp->cuda_iov_dist_d = NULL; tmp->cuda_iov_count = size; tmp->cuda_iov_is_cached = 0; tmp->nb_bytes_h = tmp_nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, cuda_iov_d %p, nb_bytes_h %p, size %d.\n", tmp, tmp_cuda_iov_d, tmp_nb_bytes, size); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, nb_bytes_h %p, size %d.\n", tmp, tmp_nb_bytes, size); ); return tmp; } else { DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist_cached for ddt is failed.\n"); ); @@ -323,7 +321,7 @@ void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) /* cached_cuda_iov_d is not ready until explicitlt sync with cuda stream 0 */ -int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count) +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, residue_desc; @@ -331,12 +329,17 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ size_t length_per_iovec; uint8_t alignment; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - ddt_cuda_iov_dist_cached_t *cuda_iov_h = NULL; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t *cached_cuda_iov_dist_d = NULL; + ddt_cuda_iov_dist_cached_t *cuda_iov_dist_h = NULL; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; size_t ncontig_disp_base; size_t contig_disp = 0; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + + opal_datatype_t *datatype = (opal_datatype_t *)pConvertor->pDesc; opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); if (ddt_iov == NULL) { @@ -344,10 +347,18 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, 
ddt_cuda_iov_dist_ return OPAL_ERROR; } + + cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)opal_ddt_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); + if (cached_cuda_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not init cuda iov\n");); + return OPAL_ERROR; + } + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; - cuda_iov_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_dist_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + thread_per_block = CUDA_WARP_SIZE * 5; for (i = 0; i < ddt_iov_count; i++) { length_per_iovec = ddt_iov[i].iov_len; @@ -361,8 +372,8 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; - cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; if ( (j+1) * thread_per_block <= count_desc) { cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; } else { @@ -372,21 +383,21 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, 
nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } /* handle residue */ if (residue_desc != 0) { - cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; - cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); @@ -394,8 +405,15 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, 
ddt_cuda_iov_dist_ } } /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ - cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; - cudaMemcpyAsync(cached_cuda_iov_d, cuda_iov_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; + cudaMalloc((void **)(&cached_cuda_iov_dist_d), sizeof(ddt_cuda_iov_dist_cached_t) * (nb_blocks_used+1)); + if (cached_cuda_iov_dist_d == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not malloc cuda iov in GPU\n");); + return OPAL_ERROR; + } + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + cached_cuda_iov->cuda_iov_dist_d = cached_cuda_iov_dist_d; + datatype->cached_cuda_iov = cached_cuda_iov; *cuda_iov_count = nb_blocks_used; return OPAL_SUCCESS; } @@ -404,9 +422,10 @@ void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_i { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; if (datatype->cached_cuda_iov == NULL) { - datatype->cached_cuda_iov = opal_ddt_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); - } - *cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + *cached_cuda_iov = NULL; + } else { + *cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + } } void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) @@ -421,7 +440,9 @@ void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t c uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; - assert(datatype->cached_cuda_iov != NULL); + if (datatype->cached_cuda_iov == NULL) { + return 0; + } ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; return 
tmp->cuda_iov_is_cached; } diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 4a71ab37d63..8ad9b3ec658 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -131,7 +131,7 @@ void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); -int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count); +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count); } diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index ddc2ec08a89..c98d540e54e 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -1003,12 +1003,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - - opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); - cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; - assert(cached_cuda_iov_dist_d != NULL); - cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; - assert(cached_cuda_iov_nb_bytes_list_h != NULL); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1025,7 +1019,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + if (opal_ddt_cache_cuda_iov(pConvertor, &nb_blocks_used) == OPAL_SUCCESS) { 
opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); } else { @@ -1040,6 +1034,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* } /* now we use cached cuda iov */ + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index fe8475a201a..6808ab56fed 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -694,11 +694,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_unpacked = 0; cuda_streams->current_stream_id = 0; source_base = source; - opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); - cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; - assert(cached_cuda_iov_dist_d != NULL); - cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; - assert(cached_cuda_iov_nb_bytes_list_h != NULL); #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -715,7 +710,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + if (opal_ddt_cache_cuda_iov(pConvertor, &nb_blocks_used) == OPAL_SUCCESS) { opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( 
opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); } @@ -727,6 +722,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ } /* now we use cached cuda iov */ + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; From 27e44f5f95973958ca1787f74fdc9d62bf02b342 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 17 Nov 2015 02:53:38 -0500 Subject: [PATCH 069/190] rearrange varibles --- opal/datatype/cuda/opal_datatype_cuda.cu | 2 +- opal/datatype/opal_datatype.h | 7 ++++--- opal/datatype/opal_datatype_create.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index d0927dc4162..f79e4e5ed0d 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -413,7 +413,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov } cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); cached_cuda_iov->cuda_iov_dist_d = cached_cuda_iov_dist_d; - datatype->cached_cuda_iov = cached_cuda_iov; + datatype->cached_cuda_iov = (unsigned char*)cached_cuda_iov; *cuda_iov_count = nb_blocks_used; return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index 6e161e96d76..1287cdb1410 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ 
-131,13 +131,14 @@ struct opal_datatype_t { int iov_count; size_t max_data; /* size: 416, cachelines: 7, members: 18 */ -#if OPAL_CUDA_SUPPORT - void * cached_cuda_iov; -#endif /* OPAL_CUDA_SUPPORT */ /* last cacheline: 32 bytes */ struct iovec* cached_iovec; uint32_t cached_iovec_count; + +#if OPAL_CUDA_SUPPORT + unsigned char * cached_cuda_iov; +#endif /* OPAL_CUDA_SUPPORT */ }; typedef struct opal_datatype_t opal_datatype_t; diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index 44c0e3020b6..e57a7d6c668 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -102,7 +102,7 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) #if OPAL_CUDA_SUPPORT /* free cuda iov */ if (opal_datatype_cuda_kernel_support == 1 && datatype->cached_cuda_iov != NULL) { - opal_cached_cuda_iov_fini(datatype->cached_cuda_iov); + opal_cached_cuda_iov_fini((void*)datatype->cached_cuda_iov); datatype->cached_cuda_iov = NULL; } #endif /* OPAL_CUDA_SUPPORT */ From 7a663f47f215fd22f442d81aee59e250c216f846 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 17 Nov 2015 18:13:00 -0500 Subject: [PATCH 070/190] if cuda_iov is not big enough, use realloc. 
However, cudaMallocHost does not work with realloc, so use malloc instead --- opal/datatype/cuda/opal_datatype_cuda.cu | 35 +++++++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index f79e4e5ed0d..cd74a081693 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -222,7 +222,10 @@ int32_t opal_ddt_cuda_kernel_init(void) cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + if (j == 0) { + // cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + } cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); @@ -261,7 +264,8 @@ int32_t opal_ddt_cuda_kernel_fini(void) if (cuda_iov_pipeline_block != NULL) { cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + 
free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; @@ -319,6 +323,22 @@ void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } +static inline int32_t opal_ddt_cached_cuda_iov_isfull(ddt_cuda_iov_total_cached_t *cached_cuda_iov, ddt_cuda_iov_dist_cached_t **cuda_iov_dist_h, uint32_t nb_blocks_used) +{ + if (nb_blocks_used < cached_cuda_iov->cuda_iov_count) { + return 0; + } else { +realloc_cuda_iov: + cached_cuda_iov->nb_bytes_h = (uint32_t *)realloc(cached_cuda_iov->nb_bytes_h, sizeof(uint32_t)*cached_cuda_iov->cuda_iov_count*2); + assert(cached_cuda_iov->nb_bytes_h != NULL); + cached_cuda_iov->cuda_iov_count *= 2; + if (nb_blocks_used >= cached_cuda_iov->cuda_iov_count) { + goto realloc_cuda_iov; + } + return 1; + } +} + /* cached_cuda_iov_d is not ready until explicitlt sync with cuda stream 0 */ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count) @@ -371,6 +391,13 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + if (opal_ddt_cached_cuda_iov_isfull(cached_cuda_iov, &(cuda_iov_pipeline_block->cuda_iov_dist_cached_h), nb_blocks_used + nb_blocks_per_description + 1)) { + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + cuda_iov_dist_h = (ddt_cuda_iov_dist_cached_t *)realloc(cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*cached_cuda_iov->cuda_iov_count); + assert(cuda_iov_dist_h != NULL); + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = cuda_iov_dist_h; + } 
+ for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; @@ -385,7 +412,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + // assert (nb_blocks_used < NUM_CUDA_IOV_PER_DDT); } /* handle residue */ @@ -400,7 +427,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + //assert (nb_blocks_used < NUM_CUDA_IOV_PER_DDT); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ } } From 150ba7a9957e8618159f4a3408f5ef894bc77551 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 18 Nov 2015 15:26:31 -0500 Subject: [PATCH 071/190] make sure check pointer is not NULL before free it --- opal/datatype/cuda/opal_datatype_cuda.cu | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index cd74a081693..2df143f2c61 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -225,6 +225,8 @@ 
int32_t opal_ddt_cuda_kernel_init(void) if (j == 0) { // cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + } else { + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; } cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; @@ -262,10 +264,19 @@ int32_t opal_ddt_cuda_kernel_fini(void) cudaStreamDestroy(cuda_devices[i].cuda_streams->opal_cuda_stream[j]); cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block[j]; if (cuda_iov_pipeline_block != NULL) { - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); - cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); - //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); - free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h != NULL) { + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); + cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h = NULL; + } + if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d != NULL) { + cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); + cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d = NULL; + } + if (cuda_iov_pipeline_block->cuda_iov_dist_cached_h != NULL) { + //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; + } cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; From 38ca646a82b9cd3f158a07de104bdadaeb3e0398 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 24 Nov 2015 20:18:17 -0500 Subject: [PATCH 072/190] checkpoint, 
rewrite non-cached version --- opal/datatype/cuda/opal_datatype_cuda.cu | 6 +- .../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 251 +++++++++++++++++- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 220 ++++++++++++++- 4 files changed, 463 insertions(+), 18 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 2df143f2c61..5ba8e1361c0 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -220,10 +220,9 @@ int32_t opal_ddt_cuda_kernel_init(void) for (j = 0; j < NB_STREAMS; j++) { cudaStreamCreate(&(cuda_streams->opal_cuda_stream[j])); cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); if (j == 0) { - // cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); } else { cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; @@ -273,7 +272,6 @@ int32_t opal_ddt_cuda_kernel_fini(void) cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d = NULL; } if 
(cuda_iov_pipeline_block->cuda_iov_dist_cached_h != NULL) { - //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; } diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 5e7bb41d0dc..99dc76f1e05 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -71,8 +71,8 @@ typedef struct { } ddt_cuda_iov_total_cached_t; typedef struct { - ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_h; - ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_d; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_h; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_d; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; cudaStream_t *cuda_stream; int32_t cuda_stream_id; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index c98d540e54e..5bdfa88fbdb 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -664,9 +664,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_pack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); + return opal_ddt_generic_simple_pack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); } +#if 0 + int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -927,6 +929,243 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto return 0; } +#endif + + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data 
) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + size_t buffer_size, length_per_iovec; + unsigned char *destination, *destination_base, *source_base; + size_t total_packed; + uint8_t buffer_isfull = 0, transfer_required, free_required; + uint8_t alignment; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t current_cuda_iov_length = 0; + size_t ncontig_disp_base; + size_t contig_disp = 0; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos; + OPAL_PTRDIFF_TYPE ddt_extent; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + + + // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (iov[0].iov_len == 0) { + buffer_size = DT_CUDA_BUFFER_SIZE; + } else { + buffer_size = iov[0].iov_len; + } + + if (iov[0].iov_base == NULL) { + iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char *)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; + free_required = 1; + } else { + destination = (unsigned char *)iov[0].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + 
pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + } + transfer_required = 1; + free_required = 1; + destination = pConvertor->gpu_buffer_ptr; + } + } + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV non cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + total_packed = 0; + cuda_streams->current_stream_id = 0; + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + source_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + destination_base = destination; + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + nb_blocks_used = 0; + ddt_iov_start_pos = pConvertor->current_iov_pos; + ddt_iov_end_pos = ddt_iov_start_pos + IOV_PIPELINE_SIZE; + if (ddt_iov_end_pos > ddt_iov_count) { + ddt_iov_end_pos = ddt_iov_count; + } + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base) 
+ ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; + length_per_iovec = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + length_per_iovec = ddt_iov[i].iov_len; + } + if (buffer_size < length_per_iovec) { + pConvertor->current_iov_pos = i; + pConvertor->current_iov_partial_length = length_per_iovec - buffer_size; + length_per_iovec = buffer_size; + buffer_isfull = 1; + } + buffer_size -= length_per_iovec; + total_packed += length_per_iovec; + + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + current_cuda_iov_length = thread_per_block * alignment; + } else { + current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + 
cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; + current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + destination_base += contig_disp; + contig_disp = 0; + if (!buffer_isfull) { + pConvertor->current_iov_pos = i; + if (i == ddt_iov_count) { + pConvertor->current_count ++; + pConvertor->current_iov_pos = 0; + source_base += 
ddt_extent; + } + } + + } + + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); +#endif + + pConvertor->bConverted += total_packed; + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; +} + int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -1000,16 +1239,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* GET_TIME(start_total); #endif -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); -#endif - thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; source_base = (unsigned 
char*)pConvertor->pBaseBuf; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 6808ab56fed..6d0b906c0b0 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -370,14 +370,16 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* return 0; } + int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_unpack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); + return opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); } +#if 0 int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -622,6 +624,222 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver return 0; } +#endif + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, dst_offset, residue_desc; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + size_t buffer_size, length_per_iovec; + unsigned char *source, *source_base, *destination_base; + size_t total_unpacked; + uint8_t buffer_isfull = 0; + uint8_t free_required = 0; + uint8_t alignment; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t 
current_cuda_iov_length = 0; + size_t ncontig_disp_base; + size_t contig_disp = 0; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos; + OPAL_PTRDIFF_TYPE ddt_extent; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + source = (unsigned char*)iov[0].iov_base; + free_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + } + source = pConvertor->gpu_buffer_ptr; + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV non cached, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, iov[0].iov_len); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); +#endif + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + buffer_size = iov[0].iov_len; + total_unpacked = 0; + cuda_streams->current_stream_id = 0; + + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + source_base = source; + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + + 
for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + nb_blocks_used = 0; + ddt_iov_start_pos = pConvertor->current_iov_pos; + ddt_iov_end_pos = ddt_iov_start_pos + IOV_PIPELINE_SIZE; + if (ddt_iov_end_pos > ddt_iov_count) { + ddt_iov_end_pos = ddt_iov_count; + } + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base) + ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; + length_per_iovec = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + length_per_iovec = ddt_iov[i].iov_len; + } + if (buffer_size < length_per_iovec) { + pConvertor->current_iov_pos = i; + pConvertor->current_iov_partial_length = length_per_iovec - buffer_size; + length_per_iovec = buffer_size; + buffer_isfull = 1; + } + buffer_size -= length_per_iovec; + total_unpacked += length_per_iovec; + + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, 
alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + current_cuda_iov_length = thread_per_block * alignment; + } else { + current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; + current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + 
DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, destination_base, source_base, 0, 0); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + source_base += contig_disp; + contig_disp = 0; + if (!buffer_isfull) { + pConvertor->current_iov_pos = i; + if (i == ddt_iov_count) { + pConvertor->current_count ++; + pConvertor->current_iov_pos = 0; + destination_base += ddt_extent; + } + } + } + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + iov[0].iov_len = total_unpacked; + *max_data = total_unpacked; + *out_size = 1; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; +} + int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* 
pConvertor, struct iovec* iov, uint32_t* out_size, From ade51ba25df205d2c1cf96a5ac64d9bdacec4e63 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 24 Nov 2015 21:14:23 -0500 Subject: [PATCH 073/190] fix for non cached iov --- opal/datatype/cuda/opal_datatype_cuda.cu | 28 +++++++++++++++++++ opal/datatype/cuda/opal_datatype_cuda.cuh | 4 ++- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 2 ++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 5ba8e1361c0..7f00ef7dd51 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -511,6 +511,34 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d } } +void opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count) +{ + int i; + size_t iov_size = 0; + size_t ddt_size; + convertor->current_iov_partial_length = 0; + convertor->current_iov_pos = 0; + convertor->current_count = 0; + if (ddt_offset == 0) { + return; + } + opal_datatype_type_size(convertor->pDesc, &ddt_size); + convertor->current_count = ddt_offset / ddt_size; + ddt_offset = ddt_offset % ddt_size; + for(i = 0; i < ddt_iov_count; i++) { + iov_size += ddt_iov[i].iov_len; + if (iov_size > ddt_offset) { + convertor->current_iov_partial_length = iov_size - ddt_offset; + convertor->current_iov_pos = i; + break; + } else if (iov_size == ddt_offset){ + convertor->current_iov_partial_length = 0; + convertor->current_iov_pos = i+1; + break; + } + } +} + void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) { #if 0 diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 8ad9b3ec658..64f69c2974c 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -131,8 +131,10 @@ void 
opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); +void opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count); + int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count); } -#endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file +#endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 6d0b906c0b0..b17dee516d2 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -708,6 +708,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver source_base = source; opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + opal_ddt_set_ddt_iov_position(pConvertor, pConvertor->bConverted, ddt_iov, ddt_iov_count); for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); @@ -818,6 +819,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } + pConvertor->bConverted += total_unpacked; iov[0].iov_len = total_unpacked; *max_data = total_unpacked; *out_size = 1; From 897ea1e3b66bcd51b8bfa2295be0cc6a828bd4c6 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Tue, 24 Nov 2015 18:30:18 -0800 Subject: [PATCH 074/190] fix the non cached iov, set position should be put at first --- opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index b17dee516d2..c84f09ca738 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -707,8 +707,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver nb_blocks = 256; source_base = source; opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); - destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; opal_ddt_set_ddt_iov_position(pConvertor, pConvertor->bConverted, ddt_iov, ddt_iov_count); + destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); From e5d34419a1dd399af8750e2a65e114ec9690e9a9 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 25 Nov 2015 15:02:14 -0500 Subject: [PATCH 075/190] move ddt iov to cuda iov into a function --- opal/datatype/cuda/opal_datatype_cuda.cu | 76 +++++++++++++++++++ opal/datatype/cuda/opal_datatype_cuda.cuh | 2 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 73 ++---------------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 72 ++---------------- 4 files changed, 91 insertions(+), 132 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 7f00ef7dd51..02559ed283f 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -454,6 +454,82 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov return OPAL_SUCCESS; } +uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, size_t *buffer_size, uint32_t *nb_blocks_used, size_t *total_converted, size_t 
*contig_disp_out, uint32_t *current_ddt_iov_pos) +{ + size_t ncontig_disp_base; + size_t contig_disp = 0; + size_t current_cuda_iov_length = 0; + uint8_t buffer_isfull = 0; + uint8_t alignment; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t thread_per_block; + size_t length_per_iovec; + uint32_t i, j; + + thread_per_block = CUDA_WARP_SIZE * 5; + + for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base) + ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; + length_per_iovec = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + length_per_iovec = ddt_iov[i].iov_len; + } + if (*buffer_size < length_per_iovec) { + pConvertor->current_iov_pos = i; + pConvertor->current_iov_partial_length = length_per_iovec - *buffer_size; + length_per_iovec = *buffer_size; + buffer_isfull = 1; + } + *buffer_size -= length_per_iovec; + *total_converted += length_per_iovec; + + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "DDT IOV to CUDA IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + current_cuda_iov_length = thread_per_block * alignment; + } else { + current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; + } +#if defined 
(OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "DDT IOV to CUDA IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[*nb_blocks_used].contig_disp, current_cuda_iov_length); ); + (*nb_blocks_used) ++; + assert (*nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "DDT IOV to CUDA IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[*nb_blocks_used].contig_disp, current_cuda_iov_length); ); + (*nb_blocks_used) ++; + assert (*nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + *contig_disp_out = contig_disp; + *current_ddt_iov_pos = i; + return buffer_isfull; + +} + void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 64f69c2974c..8e2a008ce22 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -135,6 
+135,8 @@ void opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t dd int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count); +uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, size_t *buffer_size, uint32_t *nb_blocks_used, size_t *total_packed, size_t *contig_disp_out, uint32_t *current_ddt_iov_pos); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 5bdfa88fbdb..ad0e2d771d5 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -937,14 +937,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto uint32_t* out_size, size_t* max_data ) { - uint32_t i, j; - uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size, length_per_iovec; + size_t buffer_size; unsigned char *destination, *destination_base, *source_base; size_t total_packed; uint8_t buffer_isfull = 0, transfer_required, free_required; - uint8_t alignment; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; @@ -954,10 +952,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; - size_t current_cuda_iov_length = 0; - size_t ncontig_disp_base; size_t contig_disp = 0; - uint32_t ddt_iov_start_pos, ddt_iov_end_pos; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos, current_ddt_iov_pos; OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1044,62 
+1040,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto GET_TIME(start); #endif - for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { - if (pConvertor->current_iov_partial_length > 0) { - ncontig_disp_base = (size_t)(ddt_iov[i].iov_base) + ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; - length_per_iovec = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); - length_per_iovec = ddt_iov[i].iov_len; - } - if (buffer_size < length_per_iovec) { - pConvertor->current_iov_pos = i; - pConvertor->current_iov_partial_length = length_per_iovec - buffer_size; - length_per_iovec = buffer_size; - buffer_isfull = 1; - } - buffer_size -= length_per_iovec; - total_packed += length_per_iovec; - - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; - if ( (j+1) * thread_per_block <= count_desc) { - current_cuda_iov_length = thread_per_block * alignment; - } else { - current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(current_cuda_iov_length > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - contig_disp += current_cuda_iov_length; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, 
cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - - /* handle residue */ - if (residue_desc != 0) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; - current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(current_cuda_iov_length > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - contig_disp += current_cuda_iov_length; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - } - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; + buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, &total_packed, &contig_disp, ¤t_ddt_iov_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1114,10 +1055,10 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto iov_pipeline_block_id ++; iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; destination_base += contig_disp; - contig_disp = 0; + if (!buffer_isfull) { - pConvertor->current_iov_pos = i; - if (i == ddt_iov_count) { + pConvertor->current_iov_pos = current_ddt_iov_pos; + if (current_ddt_iov_pos == ddt_iov_count) { pConvertor->current_count ++; pConvertor->current_iov_pos = 0; source_base += ddt_extent; diff --git 
a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index c84f09ca738..648036b1bb1 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -631,15 +631,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver uint32_t* out_size, size_t* max_data ) { - uint32_t i, j; - uint32_t count_desc, nb_blocks_per_description, dst_offset, residue_desc; + uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size, length_per_iovec; + size_t buffer_size; unsigned char *source, *source_base, *destination_base; size_t total_unpacked; uint8_t buffer_isfull = 0; uint8_t free_required = 0; - uint8_t alignment; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; @@ -649,10 +647,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; - size_t current_cuda_iov_length = 0; - size_t ncontig_disp_base; size_t contig_disp = 0; - uint32_t ddt_iov_start_pos, ddt_iov_end_pos; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos, current_ddt_iov_pos; OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -734,62 +730,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver GET_TIME(start); #endif - for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { - if (pConvertor->current_iov_partial_length > 0) { - ncontig_disp_base = (size_t)(ddt_iov[i].iov_base) + ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; - length_per_iovec = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); - length_per_iovec = 
ddt_iov[i].iov_len; - } - if (buffer_size < length_per_iovec) { - pConvertor->current_iov_pos = i; - pConvertor->current_iov_partial_length = length_per_iovec - buffer_size; - length_per_iovec = buffer_size; - buffer_isfull = 1; - } - buffer_size -= length_per_iovec; - total_unpacked += length_per_iovec; - - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; - if ( (j+1) * thread_per_block <= count_desc) { - current_cuda_iov_length = thread_per_block * alignment; - } else { - current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(current_cuda_iov_length > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - contig_disp += current_cuda_iov_length; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - - /* handle residue */ - if (residue_desc != 0) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; - current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - 
assert(current_cuda_iov_length > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - contig_disp += current_cuda_iov_length; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; - } + buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, &total_unpacked, &contig_disp, ¤t_ddt_iov_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -804,10 +745,9 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver iov_pipeline_block_id ++; iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; source_base += contig_disp; - contig_disp = 0; if (!buffer_isfull) { - pConvertor->current_iov_pos = i; - if (i == ddt_iov_count) { + pConvertor->current_iov_pos = current_ddt_iov_pos; + if (current_ddt_iov_pos == ddt_iov_count) { pConvertor->current_count ++; pConvertor->current_iov_pos = 0; destination_base += ddt_extent; From d61f4246ee3651832b4384b19028566591771c79 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 30 Nov 2015 17:43:02 -0500 Subject: [PATCH 076/190] merge iov cached and non-cached --- opal/datatype/cuda/opal_datatype_cuda.cu | 2 + opal/datatype/cuda/opal_datatype_cuda.cuh | 24 +- .../cuda/opal_datatype_cuda_internal.cuh | 1 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 281 +++++++----------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 238 ++++++--------- 5 files changed, 209 insertions(+), 337 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 02559ed283f..b488ac4ab6c 100644 --- 
a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -15,6 +15,7 @@ ddt_cuda_device_t *cuda_devices; ddt_cuda_device_t *current_cuda_device; struct iovec cuda_iov[CUDA_NB_IOV]; uint32_t cuda_iov_count; +uint32_t cuda_iov_cache_enabled; //uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; @@ -239,6 +240,7 @@ int32_t opal_ddt_cuda_kernel_init(void) current_cuda_device = &(cuda_devices[0]); /* init cuda_iov */ + cuda_iov_cache_enabled = 1; cuda_iov_count = CUDA_NB_IOV; // /* init size for double, float, char */ diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 8e2a008ce22..c33ff606bd9 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -29,25 +29,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon uint32_t* out_size, size_t* max_data ); -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked); -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( 
opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked); void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 99dc76f1e05..72edcb3d8a3 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -109,6 +109,7 @@ extern ddt_cuda_device_t *cuda_devices; extern ddt_cuda_device_t *current_cuda_device; extern struct iovec cuda_iov[CUDA_NB_IOV]; extern uint32_t cuda_iov_count; +extern uint32_t cuda_iov_cache_enabled; //extern uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index ad0e2d771d5..0137601bf70 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -664,7 +664,98 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_pack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); + size_t buffer_size; + unsigned char *destination; + size_t total_packed; + uint8_t transfer_required, free_required; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + + // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (iov[0].iov_len == 0) { + buffer_size = DT_CUDA_BUFFER_SIZE; + } else { + buffer_size = iov[0].iov_len; + } + + if (iov[0].iov_base == NULL) { + 
iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char *)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; + free_required = 1; + } else { + destination = (unsigned char *)iov[0].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + } + transfer_required = 1; + free_required = 1; + destination = pConvertor->gpu_buffer_ptr; + } + } + + total_packed = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + + /* start pack */ + if (cuda_iov_cache_enabled) { + opal_ddt_generic_simple_pack_function_cuda_iov_cached(pConvertor, destination, buffer_size, &total_packed); + } else { + opal_ddt_generic_simple_pack_function_cuda_iov_non_cached(pConvertor, destination, buffer_size, &total_packed); + } + + pConvertor->bConverted += total_packed; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); +#endif + + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( 
opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; } #if 0 @@ -932,17 +1023,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto #endif -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed) { uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size; - unsigned char *destination, *destination_base, *source_base; - size_t total_packed; - uint8_t buffer_isfull = 0, transfer_required, free_required; + unsigned char *destination_base, *source_base; + uint8_t buffer_isfull = 0; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; @@ -957,51 +1043,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time, move_time; + TIMER_DATA_TYPE start, end; + long total_time; #endif - - // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); - if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { - if (iov[0].iov_len == 0) { - buffer_size = DT_CUDA_BUFFER_SIZE; - } else { - buffer_size = iov[0].iov_len; - } - - if (iov[0].iov_base == NULL) { - iov[0].iov_base = (unsigned char 
*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); - destination = (unsigned char *)iov[0].iov_base; - pConvertor->gpu_buffer_ptr = destination; - free_required = 1; - } else { - destination = (unsigned char *)iov[0].iov_base; - free_required = 0; - } - transfer_required = 0; - } else { - buffer_size = iov[0].iov_len; - if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { - pConvertor->gpu_buffer_ptr = NULL; - transfer_required = 0; - free_required = 0; - cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); - } else { - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); - } - transfer_required = 1; - free_required = 1; - destination = pConvertor->gpu_buffer_ptr; - } - } - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV non cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); -#endif opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); if (ddt_iov == NULL) { @@ -1009,7 +1055,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto return OPAL_ERROR; } - total_packed = 0; cuda_streams->current_stream_id = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; @@ -1040,7 +1085,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto GET_TIME(start); #endif - buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, &total_packed, &contig_disp, ¤t_ddt_iov_pos); + buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, total_packed, &contig_disp, ¤t_ddt_iov_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1067,57 +1112,19 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( 
opal_converto } - for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); -#endif - if (transfer_required) { - cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); - } -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - move_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); -#endif - - pConvertor->bConverted += total_packed; - iov[0].iov_len = total_packed; - *max_data = total_packed; - *out_size = 1; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end_total ); - total_time = ELAPSED_TIME( start_total, end_total ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); -#endif - - if( pConvertor->bConverted == pConvertor->local_size ) { - pConvertor->flags |= CONVERTOR_COMPLETED; - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); - pConvertor->gpu_buffer_ptr = NULL; - } - return 1; - } - return 0; + + return OPAL_SUCCESS; } -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed) { uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size; - unsigned char *destination, *destination_base, *source_base; - size_t total_packed; - uint8_t buffer_isfull = 0, transfer_required, free_required; + unsigned char *destination_base, *source_base; + uint8_t buffer_isfull = 0; 
cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; @@ -1131,55 +1138,14 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time, move_time; + TIMER_DATA_TYPE start, end; + long total_time; #endif - - // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); - if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { - if (iov[0].iov_len == 0) { - buffer_size = DT_CUDA_BUFFER_SIZE; - } else { - buffer_size = iov[0].iov_len; - } - - if (iov[0].iov_base == NULL) { - iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); - destination = (unsigned char *)iov[0].iov_base; - pConvertor->gpu_buffer_ptr = destination; - free_required = 1; - } else { - destination = (unsigned char *)iov[0].iov_base; - free_required = 0; - } - transfer_required = 0; - } else { - buffer_size = iov[0].iov_len; - if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { - pConvertor->gpu_buffer_ptr = NULL; - transfer_required = 0; - free_required = 0; - cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); - } else { - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); - } - transfer_required = 1; - free_required = 1; - destination = pConvertor->gpu_buffer_ptr; - } - } DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - total_packed = 0; cuda_streams->current_stream_id = 0; destination_base = destination; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); -#endif - thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; source_base = (unsigned char*)pConvertor->pBaseBuf; @@ 
-1224,7 +1190,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - total_packed += cached_cuda_iov_nb_bytes_list_h[i]; + *total_packed += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used++; } else { @@ -1250,41 +1216,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); - - pConvertor->bConverted += total_packed; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); -#endif - if (transfer_required) { - cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); - } -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - move_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); -#endif - - iov[0].iov_len = total_packed; - *max_data = total_packed; - *out_size = 1; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end_total ); - total_time = ELAPSED_TIME( start_total, end_total ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); -#endif - - if( pConvertor->bConverted == pConvertor->local_size ) { - pConvertor->flags |= CONVERTOR_COMPLETED; - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); - pConvertor->gpu_buffer_ptr = NULL; - } - return 1; - } - return 0; + return 
OPAL_SUCCESS; } void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 648036b1bb1..bb54dfeeb0a 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -376,7 +376,80 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); + size_t buffer_size; + unsigned char *source; + size_t total_unpacked; + uint8_t free_required = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + source = (unsigned char*)iov[0].iov_base; + free_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + } + source = pConvertor->gpu_buffer_ptr; + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); +#endif + + + buffer_size = iov[0].iov_len; + total_unpacked = 0; + + /* start unpack */ + if (cuda_iov_cache_enabled) { + 
opal_ddt_generic_simple_unpack_function_cuda_iov_cached(pConvertor, source, buffer_size, &total_unpacked); + } else { + opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached(pConvertor, source, buffer_size, &total_unpacked); + } + + pConvertor->bConverted += total_unpacked; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); + + iov[0].iov_len = total_unpacked; + *max_data = total_unpacked; + *out_size = 1; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; } #if 0 @@ -626,18 +699,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver #endif -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked) { uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size; - unsigned char *source, *source_base, *destination_base; - size_t total_unpacked; + unsigned char *source_base, *destination_base; uint8_t buffer_isfull = 0; - uint8_t free_required = 0; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; @@ -652,42 +719,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( 
opal_conver OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time, move_time; -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + TIMER_DATA_TYPE start, end; + long total_time; #endif - if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { - source = (unsigned char*)iov[0].iov_base; - free_required = 0; - } else { - if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { - cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); - pConvertor->gpu_buffer_ptr = NULL; - free_required = 0; - } else { - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); - } - source = pConvertor->gpu_buffer_ptr; - cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); - free_required = 1; - } - } - + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV non cached, GPU base %p, unpack from buffer %p, total size %ld\n", - pConvertor->pBaseBuf, source, iov[0].iov_len); ); -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - move_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); -#endif + pConvertor->pBaseBuf, source, buffer_size); ); opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); if (ddt_iov == NULL) { @@ -695,10 +732,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver return OPAL_ERROR; } - buffer_size = iov[0].iov_len; - total_unpacked = 0; cuda_streams->current_stream_id = 0; - thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; source_base = source; @@ -730,7 +764,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver GET_TIME(start); #endif - buffer_isfull = 
opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, &total_unpacked, &contig_disp, ¤t_ddt_iov_pos); + buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, total_unpacked, &contig_disp, ¤t_ddt_iov_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -759,41 +793,15 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } - pConvertor->bConverted += total_unpacked; - iov[0].iov_len = total_unpacked; - *max_data = total_unpacked; - *out_size = 1; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end_total ); - total_time = ELAPSED_TIME( start_total, end_total ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); -#endif - - if( pConvertor->bConverted == pConvertor->local_size ) { - pConvertor->flags |= CONVERTOR_COMPLETED; - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); - pConvertor->gpu_buffer_ptr = NULL; - } - return 1; - } - return 0; + return OPAL_SUCCESS; } -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked) { uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size; - unsigned char *source, *source_base, *destination_base, *destination; - size_t total_unpacked; + unsigned char *source_base, *destination_base; uint8_t 
buffer_isfull = 0; - uint8_t free_required = 0; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; @@ -809,58 +817,19 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time, move_time; -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + TIMER_DATA_TYPE start, end; + long total_time; #endif - if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { - source = (unsigned char*)iov[0].iov_base; - free_required = 0; - } else { - if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { - cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); - pConvertor->gpu_buffer_ptr = NULL; - free_required = 0; - } else { - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); - } - source = pConvertor->gpu_buffer_ptr; - cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); - free_required = 1; - } - } - + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV cached, GPU base %p, unpack from buffer %p, total size %ld\n", - pConvertor->pBaseBuf, source, iov[0].iov_len); ); -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - move_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); -#endif - + pConvertor->pBaseBuf, source, buffer_size); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - buffer_size = iov[0].iov_len; - total_unpacked = 0; + cuda_streams->current_stream_id = 0; source_base = source; - -#if defined (OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( 
start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); -#endif - thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; destination_base = (unsigned char*)pConvertor->pBaseBuf; @@ -899,7 +868,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; - total_unpacked += cuda_iov_partial_length_start; + *total_unpacked += cuda_iov_partial_length_start; buffer_size -= cuda_iov_partial_length_start; pConvertor->current_iov_partial_length = 0; cuda_iov_start_pos ++; @@ -912,13 +881,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; + *total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; } else { if (buffer_size > 0) { cuda_iov_partial_length_end = buffer_size; - total_unpacked += cuda_iov_partial_length_end; + *total_unpacked += cuda_iov_partial_length_end; nb_blocks_used ++; } buffer_size = 0; @@ -943,28 +912,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); - pConvertor->bConverted += total_unpacked; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); - - iov[0].iov_len = total_unpacked; - *max_data = total_unpacked; - *out_size = 1; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end_total ); - total_time = ELAPSED_TIME( start_total, end_total ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, 
total_time-move_time ); ); -#endif - - if( pConvertor->bConverted == pConvertor->local_size ) { - pConvertor->flags |= CONVERTOR_COMPLETED; - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); - pConvertor->gpu_buffer_ptr = NULL; - } - return 1; - } - return 0; + return OPAL_SUCCESS; } void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, From 41edca1f71939359fcd2e7b7d0010ccea41a082b Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 1 Dec 2015 16:40:27 -0500 Subject: [PATCH 077/190] for non cached iov, if there is no enough cuda iov space, break --- opal/datatype/cuda/opal_datatype_cuda.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index b488ac4ab6c..2c76a327197 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -493,6 +493,9 @@ uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iove count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + if ((*nb_blocks_used + nb_blocks_per_description + 1) > (CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK)) { + break; + } DT_CUDA_DEBUG ( opal_cuda_output(10, "DDT IOV to CUDA IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; From 5cf6dba1b3294cad3e7ca0d28f7f75271b2199ad Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 6 Nov 2015 23:23:33 -0500 Subject: [PATCH 078/190] cached iov is working for count = 1 check point use raw_cached, but cuda iov caching is not enabled check point, split iov into two version, 
non-cached and cached check point iov cache another checkpoint check point, cuda iov is cached, but not used for pack/unpack check point, ready to use cached cuda iov checkpoint, cached cuda iov is working with multiple send, but not for count > 1 checkpoint, fix a bug for partial unpack checkpoint, fix unpack size --- opal/datatype/cuda/opal_datatype_cuda.cu | 100 ++++- opal/datatype/cuda/opal_datatype_cuda.cuh | 48 ++- .../cuda/opal_datatype_cuda_internal.cuh | 42 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 70 ++- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 408 ++++++++++++++++-- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 86 +++- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 381 +++++++++++++++- opal/datatype/opal_convertor.h | 5 +- opal/datatype/opal_datatype.h | 4 +- opal/datatype/opal_datatype_create.c | 16 + opal/datatype/opal_datatype_cuda.c | 39 +- opal/datatype/opal_datatype_cuda.h | 7 +- opal/datatype/opal_datatype_destroy.c | 15 +- opal/datatype/opal_datatype_optimize.c | 7 - opal/datatype/opal_datatype_pack.c | 2 +- opal/datatype/opal_datatype_unpack.c | 2 +- test/datatype/ddt_benchmark.c | 16 +- 17 files changed, 1070 insertions(+), 178 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 6a6e06ff28d..18494bcba70 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -220,8 +220,11 @@ int32_t opal_ddt_cuda_kernel_init(void) for (j = 0; j < NB_STREAMS; j++) { cudaStreamCreate(&(cuda_streams->opal_cuda_stream[j])); cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_h)), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_d)), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + 
cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_h)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_d)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); @@ -258,8 +261,11 @@ int32_t opal_ddt_cuda_kernel_fini(void) cudaStreamDestroy(cuda_devices[i].cuda_streams->opal_cuda_stream[j]); cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block[j]; if (cuda_iov_pipeline_block != NULL) { - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_h); - cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_d); + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); + cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_contig_buf_h); + cudaFree(cuda_iov_pipeline_block->cuda_iov_contig_buf_d); cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; @@ -275,35 +281,95 @@ int32_t opal_ddt_cuda_kernel_fini(void) return OPAL_SUCCESS; } -void* opal_ddt_cuda_iov_dist_init(void) +void* 
opal_ddt_cached_cuda_iov_init(uint32_t size) { #if OPAL_DATATYPE_CUDA_IOV_CACHE - ddt_cuda_iov_dist_t *p = NULL; - cudaMalloc((void **)(&p), sizeof(ddt_cuda_iov_dist_t) * NUM_CUDA_IOV_PER_DDT); - if (p != NULL) { - DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist for ddt is successed %p.\n", p); ); - return p; + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)malloc(sizeof(ddt_cuda_iov_total_cached_t)); + ddt_cuda_iov_dist_cached_t *tmp_cuda_iov_d = NULL; + cudaMalloc((void **)(&tmp_cuda_iov_d), sizeof(ddt_cuda_iov_dist_cached_t) * size); + uint32_t *tmp_nb_bytes = (uint32_t *)malloc(sizeof(uint32_t) * size); + if (tmp != NULL && tmp_cuda_iov_d != NULL && tmp_nb_bytes != NULL) { + tmp->cuda_iov_dist_d = tmp_cuda_iov_d; + tmp->cuda_iov_count = size; + tmp->cuda_iov_is_cached = 0; + tmp->nb_bytes_h = tmp_nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, cuda_iov_d %p, nb_bytes_h %p, size %d.\n", tmp, tmp_cuda_iov_d, tmp_nb_bytes, size); ); + return tmp; } else { - DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist for ddt is failed.\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist_cached for ddt is failed.\n"); ); return NULL; } #else DT_CUDA_DEBUG( opal_cuda_output( 2, "cuda iov cache is not enabled.\n"); ); - return (void *)0xDEADBEEF; + return NULL; #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } -void opal_ddt_cuda_iov_dist_fini(void* cuda_iov_dist) +void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) { #if OPAL_DATATYPE_CUDA_IOV_CACHE - ddt_cuda_iov_dist_t *p = (ddt_cuda_iov_dist_t *) cuda_iov_dist; - if (p != NULL) { - cudaFree(p); - DT_CUDA_DEBUG( opal_cuda_output( 2, "Free cuda_iov_dist for ddt is successed %p.\n", p); ); + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *) cached_cuda_iov; + if (tmp != NULL) { + DT_CUDA_DEBUG( opal_cuda_output( 2, "Free cuda_iov_dist for ddt is successed %p.\n", tmp); ); + if 
(tmp->cuda_iov_dist_d != NULL) { + cudaFree(tmp->cuda_iov_dist_d); + tmp->cuda_iov_dist_d = NULL; + } + if (tmp->nb_bytes_h != NULL) { + free(tmp->nb_bytes_h); + tmp->nb_bytes_h = NULL; + } + free(tmp); + tmp = NULL; } #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } +void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov) +{ + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + if (datatype->cached_cuda_iov == NULL) { + datatype->cached_cuda_iov = opal_ddt_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); + } + *cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; +} + +void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) +{ + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + assert(datatype->cached_cuda_iov != NULL); + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + tmp->cuda_iov_count = cuda_iov_count; + tmp->cuda_iov_is_cached = 1; +} + +uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) +{ + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + assert(datatype->cached_cuda_iov != NULL); + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + return tmp->cuda_iov_is_cached; +} + +void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) +{ +#if 0 + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + assert(datatype->cached_cuda_iov_dist != NULL); + if (datatype->cached_cuda_iov_count < cuda_iov_count) { + printf("cuda count %d, new count %d\n", datatype->cached_cuda_iov_count, cuda_iov_count); + // assert(0); + void *old_iov = datatype->cached_cuda_iov_dist; + void *new_iov = opal_ddt_cuda_iov_dist_init(datatype->cached_cuda_iov_count + NUM_CUDA_IOV_PER_DDT); + assert(new_iov != NULL); + cudaMemcpy(new_iov, old_iov, 
datatype->cached_cuda_iov_count * sizeof(ddt_cuda_iov_dist_cached_t), cudaMemcpyDeviceToDevice); + datatype->cached_cuda_iov_dist = new_iov; + datatype->cached_cuda_iov_count += NUM_CUDA_IOV_PER_DDT; + opal_ddt_cuda_iov_dist_fini(old_iov); + } +#endif +} + int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr) { int res; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index ea3631af67f..6c071188c2c 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -12,22 +12,42 @@ int32_t opal_ddt_cuda_kernel_fini(void); int32_t opal_ddt_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, - size_t* max_data ); + size_t* max_data ); +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); - + int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, - size_t* max_data ); - -int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); + size_t* max_data ); + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( 
opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -95,12 +115,20 @@ void opal_ddt_cuda_d2dcpy(void* dst, const void* src, size_t count); void opal_dump_cuda_list(ddt_cuda_list_t *list); -void* opal_ddt_cuda_iov_dist_init(void); +void* opal_ddt_cached_cuda_iov_init(void); -void opal_ddt_cuda_iov_dist_fini(void *cuda_iov_dist); +void opal_ddt_cached_cuda_iov_fini(void *cached_cuda_iov); void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination); +void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov); + +void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); + +uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor); + +void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index ca630fc1b93..b7e8e9405f6 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -37,8 +37,9 @@ #define CUDA_IOV_MAX_TASK_PER_BLOCK 400 #define ALIGNMENT_DOUBLE 8 #define ALIGNMENT_FLOAT 4 -#define ALIGNMENT_CHAR 18 -#define NUM_CUDA_IOV_PER_DDT 100000 +#define ALIGNMENT_CHAR 1 +#define NUM_CUDA_IOV_PER_DDT 150000 +#define IOV_PIPELINE_SIZE 1000 #define TIMER_DATA_TYPE struct timeval #define GET_TIME(TV) gettimeofday( &(TV), NULL ) @@ -51,15 +52,30 @@ typedef struct { } ddt_cuda_stream_t; typedef struct { - size_t src_offset; - size_t dst_offset; + unsigned char* src; + unsigned char* dst; uint32_t nb_elements; uint8_t element_alignment; -} ddt_cuda_iov_dist_t; +} ddt_cuda_iov_dist_non_cached_t; typedef struct { - 
ddt_cuda_iov_dist_t* cuda_iov_dist_h; - ddt_cuda_iov_dist_t* cuda_iov_dist_d; + size_t ptr_offset; + uint32_t nb_bytes; +} ddt_cuda_iov_dist_cached_t; + +typedef struct { + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d; + uint32_t cuda_iov_count; + uint32_t* nb_bytes_h; + uint8_t cuda_iov_is_cached; +} ddt_cuda_iov_total_cached_t; + +typedef struct { + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_h; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_d; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; + uintptr_t *cuda_iov_contig_buf_h; + uintptr_t *cuda_iov_contig_buf_d; cudaStream_t *cuda_stream; int32_t cuda_stream_id; cudaEvent_t cuda_event; @@ -118,9 +134,13 @@ __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, unsigned char* destination ); -__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); + +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); + +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int output_id, const char *format, ...); @@ -139,6 +159,10 @@ 
int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor, size_t int32_t opal_convertor_raw( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* iov_count, size_t* length ); + +int opal_convertor_raw_cached(struct opal_convertor_t *convertor, + const struct iovec **iov, + uint32_t* iov_count); } #endif /* OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 6b0e18b1078..e85b83e55b5 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -43,10 +43,10 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, } } -__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +__global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used) { uint32_t i, _copy_count; - size_t src_offset, dst_offset; + unsigned char *src, *dst; uint8_t alignment; unsigned char *_source_tmp, *_destination_tmp; @@ -63,8 +63,8 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; + src = cuda_iov_dist[blockIdx.x + i * gridDim.x].src; + dst = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst; _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; @@ -73,8 +73,8 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c // } if (threadIdx.x < _copy_count) { - _source_tmp = source_base + src_offset + threadIdx.x * alignment; - _destination_tmp = 
destination_base + dst_offset + threadIdx.x * alignment; + _source_tmp = src + threadIdx.x * alignment; + _destination_tmp = dst + threadIdx.x * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); @@ -86,4 +86,62 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c #endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ } } +} + +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base) +{ + uint32_t i, j; + size_t src_offset; + unsigned char *dst; + unsigned char *_source_tmp, *_destination_tmp; + + __shared__ uint32_t nb_tasks; + __shared__ uint32_t copy_count; + __shared__ uint8_t alignment; + + if (threadIdx.x == 0) { + nb_tasks = nb_blocks_used / gridDim.x; + if (blockIdx.x < (nb_blocks_used % gridDim.x)) { + nb_tasks ++; + } + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); + } + __syncthreads(); + + for (i = 0; i < nb_tasks; i++) { + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; + dst = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; + + if (threadIdx.x == 0) { + _source_tmp = source_base + src_offset; + _destination_tmp = dst; + uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + copy_count = _nb_bytes / alignment; 
+ } + __syncthreads(); + + for (j = threadIdx.x; j < copy_count; j += blockDim.x) { + if (j < copy_count) { + _source_tmp = source_base + src_offset + j * alignment; + _destination_tmp = dst + j * alignment; +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + if (alignment == ALIGNMENT_DOUBLE) { + *((long *)_destination_tmp) = *((long *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((int *)_destination_tmp) = *((int *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } +#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + } + } + } } \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index b82888a3f96..55cb955808e 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -660,15 +660,23 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, } int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + return opal_ddt_generic_simple_pack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); +} + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, residue_desc; uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec, dst_offset; - unsigned char *destination, *destination_base, *source_base; + unsigned char *destination, *destination_base; size_t total_packed, total_converted; int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; @@ -680,8 +688,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve // int32_t orig_stack_index; 
cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; @@ -691,12 +699,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve long total_time, move_time; #endif -#if OPAL_DATATYPE_CUDA_IOV_CACHE - opal_datatype_t *pDesc = (opal_datatype_t *)pConvertor->pDesc; - ddt_cuda_iov_dist_t *cuda_iov_dist_cache = (ddt_cuda_iov_dist_t *)pDesc->cuda_iov_dist; - cuda_iov_dist_cache += pDesc->cuda_iov_count; -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ - /*description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); @@ -738,16 +740,10 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve free_required = 1; destination = pConvertor->gpu_buffer_ptr; } - } - -#if OPAL_DATATYPE_CUDA_IOV_CACHE - /* cuda iov is cached */ - if (pDesc->cuda_iov_is_cached == 2) { - pack_iov_cached(pConvertor, destination); - } -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + } - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + destination_base = destination; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV non cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); cuda_iov_count = 1000;//CUDA_NB_IOV; total_packed = 0; @@ -755,7 +751,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; - destination_base = destination; #if defined(OPAL_DATATYPE_CUDA_TIMING) 
GET_TIME(start_total); @@ -781,12 +776,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); - source_base = (unsigned char*)cuda_iov[0].iov_base; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -819,8 +813,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].src_offset = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; + cuda_iov_dist_h_current[nb_blocks_used].dst = destination; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block; @@ -831,7 +825,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve 
assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -840,15 +834,15 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve if (residue_desc != 0) { /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].dst = destination; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += 
cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -864,13 +858,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); -#if OPAL_DATATYPE_CUDA_IOV_CACHE - cudaMemcpyAsync(cuda_iov_dist_cache, cuda_iov_dist_d_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyDeviceToDevice, *cuda_stream_iov); - pDesc->cuda_iov_count += nb_blocks_used; - cuda_iov_dist_cache += nb_blocks_used; -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ - opal_generic_simple_pack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_non_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + 
opal_generic_simple_pack_cuda_iov_non_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -933,22 +922,347 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } -#if OPAL_DATATYPE_CUDA_IOV_CACHE - pDesc->cuda_iov_is_cached = 2; -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ return 1; } return 0; } -#if OPAL_DATATYPE_CUDA_IOV_CACHE -void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination) +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { - const opal_datatype_t *datatype = pConvertor->pDesc; - DT_CUDA_DEBUG ( opal_cuda_output(2, "cuda iov cached %p, count %ld\n", datatype->cuda_iov_dist, datatype->cuda_iov_count ); ); -} -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + size_t length, buffer_size, length_per_iovec; + unsigned char *destination, *destination_base, *source_base, *source; + size_t total_packed, packed_w_cache ,packed_wo_cache; + int32_t complete_flag = 0; + uint8_t buffer_isfull = 0, transfer_required, free_required; + uint32_t convertor_flags; +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// dt_stack_t* pStack; + uint8_t alignment, orig_alignment; +// int32_t orig_stack_index; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + uintptr_t *cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; + ddt_cuda_iov_pipeline_block_t 
*cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t iov_len = 0; + uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; + ddt_cuda_iov_total_cached_t* cached_cuda_iov; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; + uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; + uint32_t cached_cuda_iov_count = 0; + uint8_t cuda_iov_is_cached = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + + /*description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pElem = &(description[pStack->index]); + printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); + */ + +// assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); + + // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (iov[0].iov_len == 0) { + buffer_size = DT_CUDA_BUFFER_SIZE; + } else { + buffer_size = iov[0].iov_len; + } + + if (iov[0].iov_base == NULL) { + iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char *)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; + free_required = 1; + } else { + destination = (unsigned char *)iov[0].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned 
char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + } + transfer_required = 1; + free_required = 1; + destination = pConvertor->gpu_buffer_ptr; + } + } + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + total_packed = 0; + packed_wo_cache = 0; + packed_w_cache = 0; + cuda_streams->current_stream_id = 0; + // orig_stack_index = pStack->index; + destination_base = destination; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + assert(ddt_iov != NULL); + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + cuda_iov_is_cached = cached_cuda_iov->cuda_iov_is_cached; + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); +#endif + + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + source_base = (unsigned char*)pConvertor->pBaseBuf; + + /* cuda iov is not cached, start to cache iov */ + if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { + + iov_start_pos = pConvertor->current_iov_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + + while (iov_start_pos < iov_end_pos && !buffer_isfull) { + + nb_blocks_used = 0; + cuda_iov_pipeline_block = 
current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; + cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + iov_len = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + iov_len = ddt_iov[i].iov_len; + } + if (buffer_size >= iov_len) { + length_per_iovec = iov_len; + } else { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + pConvertor->current_iov_partial_length = iov_len - length_per_iovec; + pConvertor->current_iov_pos = i; + } + buffer_size -= length_per_iovec; + packed_wo_cache += length_per_iovec; + source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + source_base; + + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, 
residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + destination += 
cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + + if (!buffer_isfull) { + pConvertor->current_iov_pos = i; + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + + // opal_ddt_check_cuda_iov_is_full(pConvertor, pConvertor->current_cuda_iov_pos + nb_blocks_used); /* make sure cuda iov has enough space */ + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + pConvertor->current_cuda_iov_pos += nb_blocks_used; + + // orig_stack_index = pStack->index; + iov_start_pos = iov_end_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if 
(iov_end_pos >= ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + /* count = 0 done, iov cached finished */ + if (pConvertor->current_iov_pos == ddt_iov_count) { + pConvertor->current_count ++; + opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); + } + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); + } + } + total_packed += packed_wo_cache; + pConvertor->bConverted += packed_wo_cache; + + + /* now we use cached cuda iov */ + if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; + cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { + if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; + destination += cached_cuda_iov_nb_bytes_list_h[i]; + packed_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; + nb_blocks_used++; + } else { + buffer_isfull = 1; + break; + } + } + printf("nb_blocks_used %d, my %d\n", nb_blocks_used, i - 
cuda_iov_start_pos); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + pConvertor->current_cuda_iov_pos += nb_blocks_used; + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + } + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + total_packed += packed_w_cache; + pConvertor->bConverted += packed_w_cache; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); +#endif + + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( 
start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; +} void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index a23aff7710c..c553a7991b0 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -6,10 +6,10 @@ #include -__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +__global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used) { uint32_t i, _copy_count; - size_t src_offset, dst_offset; + unsigned char *src, *dst; uint8_t alignment; unsigned char *_source_tmp, *_destination_tmp; @@ -24,14 +24,14 @@ __global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; + src = cuda_iov_dist[blockIdx.x + i * gridDim.x].src; + dst = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst; _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; if (threadIdx.x < _copy_count) { - _source_tmp = source_base + src_offset + threadIdx.x * alignment; - _destination_tmp = 
destination_base + dst_offset + threadIdx.x * alignment; + _source_tmp = src + threadIdx.x * alignment; + _destination_tmp = dst + threadIdx.x * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); @@ -45,6 +45,80 @@ __global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* } } } + +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +{ + uint32_t i, j; + size_t dst_offset; + unsigned char *src; + unsigned char *_source_tmp, *_destination_tmp; + + __shared__ uint32_t nb_tasks; + __shared__ uint32_t copy_count; + __shared__ uint8_t alignment; + + if (threadIdx.x == 0) { + nb_tasks = nb_blocks_used / gridDim.x; + if (blockIdx.x < nb_blocks_used % gridDim.x) { + nb_tasks ++; + } + } + __syncthreads(); + + for (i = 0; i < nb_tasks; i++) { + src = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + // if (threadIdx.x == 0) printf("cuda_iov_partial_length_start %d", cuda_iov_partial_length_start); + dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes - cuda_iov_partial_length_start; + } + if (threadIdx.x == 0) { + _source_tmp = src; + _destination_tmp = destination_base + dst_offset; + uint32_t _nb_bytes = 0; + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + _nb_bytes = cuda_iov_partial_length_start; + } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { + _nb_bytes = cuda_iov_partial_length_end; + } else { + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; + } 
+ if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_source_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + copy_count = _nb_bytes / alignment; + } + __syncthreads(); + + for (j = threadIdx.x; j < copy_count; j += blockDim.x) { +/* if (threadIdx.x == 0) { + if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); + }*/ + if (j < copy_count) { + _source_tmp = src + j * alignment; + _destination_tmp = destination_base + dst_offset + j * alignment; + /* if (threadIdx.x == 0) { + printf("_src %p, dst %p, alignment %d, blk %d, j %d, count %d\n", _source_tmp, _destination_tmp, alignment, blockIdx.x, j, copy_count); + }*/ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + if (alignment == ALIGNMENT_DOUBLE) { + *((long *)_destination_tmp) = *((long *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((int *)_destination_tmp) = *((int *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } + // printf("src %p, %1.f | dst %p, %1.f\n", _source_tmp, *_source_tmp, _destination_tmp, *_destination_tmp); +#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ + } + } + } +} + __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index f483d230934..84d5bd5ea1d 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -374,12 +374,20 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + return opal_ddt_generic_simple_unpack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); +} + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, dst_offset, residue_desc; uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; - unsigned char *source, *source_base, *destination_base; + unsigned char *source, *source_base; size_t total_unpacked, total_converted; int32_t complete_flag = 0; uint8_t buffer_isfull = 0; @@ -392,8 +400,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; @@ -434,7 +442,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon } } - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV, GPU 
base %p, unpack from buffer %p, total size %ld\n", + source_base = source; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV non cached, GPU base %p, unpack from buffer %p, total size %ld\n", pConvertor->pBaseBuf, source, iov[0].iov_len); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -456,7 +465,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; - source_base = source; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); @@ -474,12 +482,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); - destination_base = (unsigned char*)cuda_iov[0].iov_base; + #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -514,8 +522,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); for (j = 0; j < 
nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; + cuda_iov_dist_h_current[nb_blocks_used].src = source; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block;// * sizeof(double); @@ -526,7 +534,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; } @@ -534,15 +542,15 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon if (residue_desc != 0) { /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * 
alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].src = source; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; } @@ -557,8 +565,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, 
nb_blocks_used, source_base, destination_base); + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_non_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_non_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -614,6 +622,347 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon return 0; } +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + size_t length, buffer_size, length_per_iovec; + unsigned char *source, *source_base, *destination_base, *destination; + size_t total_unpacked, unpacked_wo_cache, unpacked_w_cache; + int32_t complete_flag = 0; + uint8_t buffer_isfull = 0; + uint8_t free_required = 0; + uint32_t convertor_flags; +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// dt_stack_t* pStack; + uint8_t alignment, orig_alignment; +// int32_t orig_stack_index; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + uintptr_t *cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t iov_len = 0; + uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; + ddt_cuda_iov_total_cached_t* cached_cuda_iov; + ddt_cuda_iov_dist_cached_t* 
cached_cuda_iov_dist_d; + uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; + uint32_t cached_cuda_iov_count = 0; + uint8_t cuda_iov_is_cached = 0; + size_t cuda_iov_partial_length_start = 0; + size_t cuda_iov_partial_length_end = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +/* description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pElem = &(description[pStack->index]); + printf("size elem %d, size %lu\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); +*/ + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + source = (unsigned char*)iov[0].iov_base; + free_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + } + source = pConvertor->gpu_buffer_ptr; + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV cached, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, iov[0].iov_len); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); +#endif + +// cuda_err = cudaEventRecord(current_cuda_device->memcpy_event, current_cuda_device->cuda_streams->opal_cuda_stream[0]); +// opal_cuda_check_error(cuda_err); + + 
+#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + buffer_size = iov[0].iov_len; + total_unpacked = 0; + unpacked_wo_cache = 0; + unpacked_w_cache = 0; + cuda_streams->current_stream_id = 0; + convertor_flags = pConvertor->flags; +// orig_stack_index = pStack->index; + source_base = source; + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + assert(ddt_iov != NULL); + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + cuda_iov_is_cached = cached_cuda_iov->cuda_iov_is_cached; + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); +#endif + + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + destination_base = (unsigned char*)pConvertor->pBaseBuf; + + /* cuda iov is not cached, start to cache iov */ + if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { + + iov_start_pos = pConvertor->current_iov_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + + while (iov_start_pos < iov_end_pos && !buffer_isfull) { + + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; + 
cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov pos %d\n", pConvertor->current_cuda_iov_pos);); + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + iov_len = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + iov_len = ddt_iov[i].iov_len; + } + if (buffer_size >= iov_len) { + length_per_iovec = iov_len; + } else { + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + pConvertor->current_iov_partial_length = iov_len - length_per_iovec; + pConvertor->current_iov_pos = i; + } + buffer_size -= length_per_iovec; + unpacked_wo_cache += length_per_iovec; + destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; + + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + if ( (j+1) * 
thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + } + + /* handle residue */ + if (residue_desc != 0) { + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + } + } + + if (!buffer_isfull) { + 
pConvertor->current_iov_pos = i; + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, 0, 0); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + pConvertor->current_cuda_iov_pos += nb_blocks_used; + + iov_start_pos = iov_end_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos >= ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + /* finished */ + if (pConvertor->current_iov_pos == ddt_iov_count) { + pConvertor->current_count ++; + opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); + } + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); + + } + } + total_unpacked += unpacked_wo_cache; + pConvertor->bConverted += unpacked_wo_cache; +#if 1 + /* now we use cached cuda iov */ + if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { + cuda_iov_start_pos = 
pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; + cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (pConvertor->current_iov_partial_length > 0) { + cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; + unpacked_w_cache += cuda_iov_partial_length_start; + buffer_size -= cuda_iov_partial_length_start; + pConvertor->current_iov_partial_length = 0; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + source += cuda_iov_partial_length_start; + cuda_iov_start_pos ++; + nb_blocks_used ++; + } + for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { + if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + source += cached_cuda_iov_nb_bytes_list_h[i]; + unpacked_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; + nb_blocks_used ++; + } else { + if (buffer_size > 0) { + cuda_iov_partial_length_end = buffer_size; + unpacked_w_cache += cuda_iov_partial_length_end; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + source += cuda_iov_partial_length_end; + pConvertor->current_iov_partial_length = cached_cuda_iov_nb_bytes_list_h[i] - cuda_iov_partial_length_end; + nb_blocks_used ++; + } + buffer_size = 0; + buffer_isfull = 1; + break; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + 
GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + if (pConvertor->current_iov_partial_length > 0) { + pConvertor->current_cuda_iov_pos += nb_blocks_used - 1; + } else { + pConvertor->current_cuda_iov_pos += nb_blocks_used; + } + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + } +#endif + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + total_unpacked += unpacked_w_cache; + pConvertor->bConverted += unpacked_w_cache; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); + + iov[0].iov_len = total_unpacked; + *max_data = total_unpacked; + *out_size = 1; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if 
(pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; +} + void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index fb8b4d630a4..b7c0a43a6ed 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -114,7 +114,10 @@ struct opal_convertor_t { unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ size_t gpu_buffer_size; - size_t current_cuda_iov_count; + uint32_t current_cuda_iov_pos; + uint32_t current_iov_pos; + size_t current_iov_partial_length; + opal_datatype_count_t current_count; #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index efbf357c7fd..6e161e96d76 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -132,9 +132,7 @@ struct opal_datatype_t { size_t max_data; /* size: 416, cachelines: 7, members: 18 */ #if OPAL_CUDA_SUPPORT - void * cuda_iov_dist; - size_t cuda_iov_count; - int8_t cuda_iov_is_cached; + void * cached_cuda_iov; #endif /* OPAL_CUDA_SUPPORT */ /* last cacheline: 32 bytes */ diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index b97a84f5174..44c0e3020b6 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -27,6 +27,10 @@ #include "opal/datatype/opal_datatype_internal.h" #include "limits.h" #include "opal/prefetch.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_convertor.h" +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ static void opal_datatype_construct( opal_datatype_t* pData ) { @@ -55,6 +59,10 @@ static void opal_datatype_construct( opal_datatype_t* pData ) pData->cached_iovec = 
NULL; pData->cached_iovec_count = 0; + +#if OPAL_CUDA_SUPPORT + pData->cached_cuda_iov = NULL; +#endif /* OPAL_CUDA_SUPPORT */ for( i = 0; i < OPAL_DATATYPE_MAX_SUPPORTED; i++ ) pData->btypes[i] = 0; @@ -90,6 +98,14 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) free(datatype->cached_iovec); datatype->cached_iovec = NULL; } + +#if OPAL_CUDA_SUPPORT + /* free cuda iov */ + if (opal_datatype_cuda_kernel_support == 1 && datatype->cached_cuda_iov != NULL) { + opal_cached_cuda_iov_fini(datatype->cached_cuda_iov); + datatype->cached_cuda_iov = NULL; + } +#endif /* OPAL_CUDA_SUPPORT */ } OBJ_CLASS_INSTANCE(opal_datatype_t, opal_object_t, opal_datatype_construct, opal_datatype_destruct); diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 729e460de1a..c65e635a506 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -84,18 +84,11 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, if (OPAL_SUCCESS != opal_cuda_kernel_support_init()) { opal_cuda_kernel_support_fini(); } - if (opal_datatype_cuda_kernel_support == 1 && datatype->cuda_iov_is_cached == 0) { - struct opal_datatype_t* datatype_tmp = (opal_datatype_t *)datatype; - datatype_tmp->cuda_iov_dist = opal_cuda_iov_dist_init(); - if (datatype_tmp->cuda_iov_dist == (void*)0xDEADBEEF || datatype_tmp->cuda_iov_dist == NULL) { - /* either cuda iov cache is not enabled or cuda_iov_cache malloc is failed, then we do not cache cuda iov */ - datatype_tmp->cuda_iov_is_cached = -1; - } else { - /* cuda iov buffer is ready , the value will be marked to 2 when caching is finished*/ - datatype_tmp->cuda_iov_is_cached = 1; - } - } - + + convertor->current_cuda_iov_pos = 0; + convertor->current_iov_pos = 0; + convertor->current_iov_partial_length = 0; + convertor->current_count = 0; } /* Checks the type of pointer @@ -253,8 +246,7 @@ int32_t opal_cuda_kernel_support_init(void) 
OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_malloc_gpu_buffer ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy_async ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_iov_dist_init ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_iov_dist_fini ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cached_cuda_iov_fini ); if (OPAL_SUCCESS != cuda_kernel_table.opal_ddt_cuda_kernel_init_p()) { return OPAL_ERROR; @@ -280,6 +272,7 @@ int32_t opal_cuda_kernel_support_fini(void) cuda_kernel_table.opal_ddt_cuda_malloc_gpu_buffer_p = NULL; cuda_kernel_table.opal_ddt_cuda_d2dcpy_async_p = NULL; cuda_kernel_table.opal_ddt_cuda_d2dcpy_p = NULL; + cuda_kernel_table.opal_ddt_cached_cuda_iov_fini_p = NULL; dlclose(opal_datatype_cuda_kernel_handle); opal_datatype_cuda_kernel_handle = NULL; @@ -370,22 +363,12 @@ void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) } } -void* opal_cuda_iov_dist_init(void) -{ - if (cuda_kernel_table.opal_ddt_cuda_iov_dist_init_p != NULL) { - return cuda_kernel_table.opal_ddt_cuda_iov_dist_init_p(); - } else { - opal_output(0, "opal_ddt_cuda_iov_dist_init function pointer is NULL\n"); - return NULL; - } -} - -void opal_cuda_iov_dist_fini(void *cuda_iov_dist) +void opal_cached_cuda_iov_fini(void *cached_cuda_iov) { - if (cuda_kernel_table.opal_ddt_cuda_iov_dist_fini_p != NULL) { - cuda_kernel_table.opal_ddt_cuda_iov_dist_fini_p(cuda_iov_dist); + if (cuda_kernel_table.opal_ddt_cached_cuda_iov_fini_p != NULL) { + cuda_kernel_table.opal_ddt_cached_cuda_iov_fini_p(cached_cuda_iov); } else { - opal_output(0, "opal_ddt_cuda_iov_dist_fini function pointer is NULL\n"); + 
opal_output(0, "opal_ddt_cached_cuda_iov_fini function pointer is NULL\n"); } } diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index 24e85f649b9..7b613470ab0 100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -28,8 +28,7 @@ struct opal_datatype_cuda_kernel_function_table { void* (*opal_ddt_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); void (*opal_ddt_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); void (*opal_ddt_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); - void* (*opal_ddt_cuda_iov_dist_init_p)(void); - void (*opal_ddt_cuda_iov_dist_fini_p)(void *cuda_iov_dist); + void (*opal_ddt_cached_cuda_iov_fini_p)(void *cached_cuda_iov); int32_t (*opal_ddt_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); @@ -56,7 +55,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); void opal_cuda_d2dcpy(void* dst, const void* src, size_t count); void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count); -void* opal_cuda_iov_dist_init(void); -void opal_cuda_iov_dist_fini(void *cuda_iov_dist); +void* opal_cached_cuda_iov_init(void); +void opal_cached_cuda_iov_fini(void *cached_cuda_iov); #endif diff --git a/opal/datatype/opal_datatype_destroy.c b/opal/datatype/opal_datatype_destroy.c index 8c225e698c0..593d5bfd67a 100644 --- a/opal/datatype/opal_datatype_destroy.c +++ b/opal/datatype/opal_datatype_destroy.c @@ -21,24 +21,11 @@ #include "opal_config.h" #include "opal/constants.h" #include 
"opal/datatype/opal_datatype.h" -#include "opal/datatype/opal_datatype_internal.h" -#if OPAL_CUDA_SUPPORT -#include "opal/datatype/opal_convertor.h" -#include "opal/datatype/opal_datatype_cuda.h" -#endif /* OPAL_CUDA_SUPPORT */ +#include "opal/datatype/opal_datatype_internal.h" int32_t opal_datatype_destroy( opal_datatype_t** dt ) { opal_datatype_t* pData = *dt; - -#if OPAL_CUDA_SUPPORT - /* free cuda iov */ - if (opal_datatype_cuda_kernel_support== 1 && pData->cuda_iov_dist != NULL && pData->cuda_iov_dist != (void*)0xDEADBEEF) { - opal_cuda_iov_dist_fini(pData->cuda_iov_dist); - pData->cuda_iov_dist = NULL; - pData->cuda_iov_count = 0; - } -#endif /* OPAL_CUDA_SUPPORT */ if( (pData->flags & OPAL_DATATYPE_FLAG_PREDEFINED) && (pData->super.obj_reference_count <= 1) ) diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index b492aa9381b..5ccea9ba1d3 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -305,13 +305,6 @@ int32_t opal_datatype_commit( opal_datatype_t * pData ) pLast->size = pData->size; } -#if OPAL_CUDA_SUPPORT - /* cuda iov for caching, it will be malloced latter when init convertor */ - pData->cuda_iov_dist = NULL; - pData->cuda_iov_is_cached = 0; - pData->cuda_iov_count = 0; -#endif /* OPAL_CUDA_SUPPORT */ - /* save a compressed datatype description as a iovec list */ // opal_convertor_t* conv = opal_convertor_create( opal_local_arch, 0 /* unused */); // opal_convertor_prepare_for_send( conv, pData, 1, (void*)0 ); diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 9812a371a85..c8985db7913 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -416,7 +416,7 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); -// return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); + 
return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index f5e1e76588f..5f51b3f828b 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -610,7 +610,7 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); -// return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); + return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 1bb91f663c8..c8c3fd7db45 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1211,12 +1211,12 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 6000; mat_size <= 6000; mat_size +=500) { + for (mat_size = 2000; mat_size <= 2000; mat_size +=500) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 1; i <= 2; i++) { - // local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); + for (i = 1; i <= 1; i++) { + local_copy_with_convertor(pdt, 1, 40000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -1224,10 +1224,10 @@ int main( int argc, char* argv[] ) ompi_datatype_t *column, *matt; mat_size = 1000; - ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); - ompi_datatype_create_hvector( mat_size, 1, 
sizeof(double), column, &matt ); - ompi_datatype_commit( &matt ); - local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); + // ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); + // ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); + // ompi_datatype_commit( &matt ); + // local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); int packed_size = 256; @@ -1285,7 +1285,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 1; i++) { - vector_ddt( pdt, 1, pdt, 1, 2000000 , 1000, blk_len, blk_len*2); + // vector_ddt( pdt, 1, pdt, 1, 2000000 , 1000, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } From 9386ffb8160a786355d4bb4afbb93f576ebc2e2b Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 11 Nov 2015 17:49:25 -0500 Subject: [PATCH 079/190] cache the entire cuda iov checkpoint, during unpack, cache the entire iov before unpack another checkpoint checkpoint , remove unnecessary cuda stream sync use bit to replace % rollback to use %, not bit, since it is faster, not sure why --- opal/datatype/cuda/opal_datatype_cuda.cu | 14 ++ opal/datatype/cuda/opal_datatype_cuda.cuh | 2 + .../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 37 ++-- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 203 ++++++------------ .../cuda/opal_datatype_unpack_cuda_kernel.cu | 48 ++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 203 ++++++------------ test/datatype/ddt_benchmark.c | 19 +- test/datatype/ddt_lib.h | 4 +- 9 files changed, 205 insertions(+), 329 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 18494bcba70..471c6e63709 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -351,6 +351,20 @@ uint8_t 
opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) return tmp->cuda_iov_is_cached; } +void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count) +{ + int i; + size_t iov_size = 0; + for(i = 0; i < cuda_iov_count; i++) { + iov_size += cached_cuda_iov_nb_bytes_list_h[i]; + if (iov_size > ddt_offset) { + convertor->current_iov_partial_length = iov_size - ddt_offset; + convertor->current_cuda_iov_pos = i; + break; + } + } +} + void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) { #if 0 diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 6c071188c2c..8e30726ace2 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -129,6 +129,8 @@ uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor); void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); +void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index b7e8e9405f6..b1c36b66e14 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -138,9 +138,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int 
nb_blocks_used, unsigned char* source_base); +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index e85b83e55b5..93fb188ddcd 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,16 +88,17 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base) { - uint32_t i, j; + uint32_t i, j, _nb_bytes; size_t src_offset; unsigned char *dst; unsigned char *_source_tmp, *_destination_tmp; + uint32_t current_cuda_iov_pos = cuda_iov_pos; __shared__ uint32_t nb_tasks; - __shared__ uint32_t copy_count; - __shared__ uint8_t alignment; + uint32_t copy_count; + uint8_t alignment; 
if (threadIdx.x == 0) { nb_tasks = nb_blocks_used / gridDim.x; @@ -109,24 +110,20 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; dst = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; - if (threadIdx.x == 0) { - _source_tmp = source_base + src_offset; - _destination_tmp = dst; - uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { - alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { - alignment = ALIGNMENT_FLOAT; - } else { - alignment = ALIGNMENT_CHAR; - } - copy_count = _nb_bytes / alignment; + _source_tmp = source_base + src_offset; + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)dst % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)dst % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; } - __syncthreads(); + copy_count = _nb_bytes / alignment; for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { @@ -144,4 +141,4 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di } } } -} \ No newline at end of file 
+} diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 55cb955808e..1d14c000977 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -937,7 +937,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *destination, *destination_base, *source_base, *source; - size_t total_packed, packed_w_cache ,packed_wo_cache; + size_t total_packed; int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; uint32_t convertor_flags; @@ -948,19 +948,21 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; - uintptr_t *cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; + uintptr_t *cuda_iov_contig_buf_h_current = NULL; + uintptr_t *cuda_iov_contig_buf_d_current = NULL; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; size_t iov_len = 0; uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; - ddt_cuda_iov_total_cached_t* cached_cuda_iov; - ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; - uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + 
ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; @@ -1015,8 +1017,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); total_packed = 0; - packed_wo_cache = 0; - packed_w_cache = 0; cuda_streams->current_stream_id = 0; // orig_stack_index = pStack->index; destination_base = destination; @@ -1036,8 +1036,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; - cuda_iov_is_cached = cached_cuda_iov->cuda_iov_is_cached; DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1052,133 +1050,69 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { - - iov_start_pos = pConvertor->current_iov_pos; - iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - - while (iov_start_pos < iov_end_pos && !buffer_isfull) { - - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - 
cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; - cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { - if (pConvertor->current_iov_partial_length > 0) { - iov_len = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - iov_len = ddt_iov[i].iov_len; - } - if (buffer_size >= iov_len) { - length_per_iovec = iov_len; + for (i = 0; i < ddt_iov_count; i++) { + length_per_iovec = ddt_iov[i].iov_len; + source = (size_t)(ddt_iov[i].iov_base) + source_base; + + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; + if ( (j+1) * thread_per_block <= count_desc) { + 
cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; } else { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - length_per_iovec = buffer_size / orig_alignment * orig_alignment; - buffer_isfull = 1; - pConvertor->current_iov_partial_length = iov_len - length_per_iovec; - pConvertor->current_iov_pos = i; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; } - buffer_size -= length_per_iovec; - packed_wo_cache += length_per_iovec; - source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + source_base; - - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; - if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; - } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; - } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - destination += 
cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - - /* handle residue */ - if (residue_desc != 0) { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - destination 
+= cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - } - - if (!buffer_isfull) { - pConvertor->current_iov_pos = i; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } - + } + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack cached cuda iov is prepared in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used); ); #endif - - // opal_ddt_check_cuda_iov_is_full(pConvertor, pConvertor->current_cuda_iov_pos + nb_blocks_used); /* make sure cuda iov has enough space */ - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, 
sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; - pConvertor->current_cuda_iov_pos += nb_blocks_used; - - // orig_stack_index = pStack->index; - iov_start_pos = iov_end_pos; - iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; - if (iov_end_pos >= ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - /* count = 0 done, iov cached finished */ - if (pConvertor->current_iov_pos == ddt_iov_count) { - pConvertor->current_count ++; - opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); - } - DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); - } } - total_packed += packed_wo_cache; - pConvertor->bConverted += packed_wo_cache; - + + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; /* now we use cached cuda iov */ if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { @@ -1188,10 +1122,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; cuda_iov_contig_buf_h_current = 
cuda_iov_pipeline_block->cuda_iov_contig_buf_h; cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -1199,7 +1130,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; destination += cached_cuda_iov_nb_bytes_list_h[i]; - packed_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + total_packed += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used++; } else { @@ -1207,28 +1138,22 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* break; } } - printf("nb_blocks_used %d, my %d\n", nb_blocks_used, i - cuda_iov_start_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - pConvertor->current_cuda_iov_pos += nb_blocks_used; cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); - cuda_err = 
cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + pConvertor->current_cuda_iov_pos += nb_blocks_used; } for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } - - total_packed += packed_w_cache; - pConvertor->bConverted += packed_w_cache; + + pConvertor->bConverted += total_packed; DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index c553a7991b0..f98a8c0b2ea 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,16 +46,18 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; size_t dst_offset; unsigned char *src; unsigned char *_source_tmp, *_destination_tmp; + uint32_t _nb_bytes; + uint32_t current_cuda_iov_pos = cuda_iov_pos; __shared__ uint32_t nb_tasks; - __shared__ uint32_t copy_count; - __shared__ uint8_t alignment; + 
uint32_t copy_count; + uint8_t alignment; if (threadIdx.x == 0) { nb_tasks = nb_blocks_used / gridDim.x; @@ -67,32 +69,28 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ for (i = 0; i < nb_tasks; i++) { src = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { // if (threadIdx.x == 0) printf("cuda_iov_partial_length_start %d", cuda_iov_partial_length_start); - dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes - cuda_iov_partial_length_start; + dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes - cuda_iov_partial_length_start; } - if (threadIdx.x == 0) { - _source_tmp = src; - _destination_tmp = destination_base + dst_offset; - uint32_t _nb_bytes = 0; - if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - _nb_bytes = cuda_iov_partial_length_start; - } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { - _nb_bytes = cuda_iov_partial_length_end; - } else { - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; - } - if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { - alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_source_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { - alignment = ALIGNMENT_FLOAT; - } else { - alignment = ALIGNMENT_CHAR; - } - copy_count = _nb_bytes / alignment; + _destination_tmp = destination_base + dst_offset; + + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + _nb_bytes = 
cuda_iov_partial_length_start; + } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { + _nb_bytes = cuda_iov_partial_length_end; + } else { + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; + } + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)src % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)src % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; } - __syncthreads(); + copy_count = _nb_bytes / alignment; for (j = threadIdx.x; j < copy_count; j += blockDim.x) { /* if (threadIdx.x == 0) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 84d5bd5ea1d..50009710d2d 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -632,7 +632,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *source, *source_base, *destination_base, *destination; - size_t total_unpacked, unpacked_wo_cache, unpacked_w_cache; + size_t total_unpacked; int32_t complete_flag = 0; uint8_t buffer_isfull = 0; uint8_t free_required = 0; @@ -644,19 +644,21 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; - uintptr_t *cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; - ddt_cuda_iov_pipeline_block_t 
*cuda_iov_pipeline_block; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; + uintptr_t *cuda_iov_contig_buf_h_current = NULL; + uintptr_t *cuda_iov_contig_buf_d_current = NULL; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; size_t iov_len = 0; uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; - ddt_cuda_iov_total_cached_t* cached_cuda_iov; - ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; - uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; size_t cuda_iov_partial_length_start = 0; @@ -715,8 +717,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif buffer_size = iov[0].iov_len; total_unpacked = 0; - unpacked_wo_cache = 0; - unpacked_w_cache = 0; cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; @@ -728,8 +728,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; - cuda_iov_is_cached = cached_cuda_iov->cuda_iov_is_cached; DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -744,132 +742,73 @@ int32_t 
opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); - iov_start_pos = pConvertor->current_iov_pos; - iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - - while (iov_start_pos < iov_end_pos && !buffer_isfull) { - - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; - cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov pos %d\n", pConvertor->current_cuda_iov_pos);); - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); - #if defined (OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { - if (pConvertor->current_iov_partial_length > 0) { - iov_len = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - iov_len = ddt_iov[i].iov_len; - } - if (buffer_size >= 
iov_len) { - length_per_iovec = iov_len; - } else { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - length_per_iovec = buffer_size / orig_alignment * orig_alignment; - buffer_isfull = 1; - pConvertor->current_iov_partial_length = iov_len - length_per_iovec; - pConvertor->current_iov_pos = i; - } - buffer_size -= length_per_iovec; - unpacked_wo_cache += length_per_iovec; - destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; + for (i = 0; i < ddt_iov_count; i++) { + length_per_iovec = ddt_iov[i].iov_len; + destination = (size_t)(ddt_iov[i].iov_base) + destination_base; - alignment = ALIGNMENT_DOUBLE; + alignment = ALIGNMENT_DOUBLE; - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; - } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; - } + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + 
cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; + } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - } + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + } - /* handle residue */ - if (residue_desc != 0) { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + /* handle residue */ + if (residue_desc != 0) { + /* orig_alignment = 
opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - } - } - - if (!buffer_isfull) { - pConvertor->current_iov_pos = i; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; } - + } + + cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice); + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, 
cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cached cuda iov is prepared in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); #endif - - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, 0, 0); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; - pConvertor->current_cuda_iov_pos += nb_blocks_used; - - iov_start_pos = iov_end_pos; - iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; - if (iov_end_pos >= ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - /* finished */ - if (pConvertor->current_iov_pos == ddt_iov_count) { - pConvertor->current_count ++; - opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); - } - DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); - - } } - total_unpacked += unpacked_wo_cache; - pConvertor->bConverted += unpacked_wo_cache; -#if 1 + + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + /* now we use cached cuda iov */ if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { + opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, 
cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; @@ -878,14 +817,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif + printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; - unpacked_w_cache += cuda_iov_partial_length_start; + total_unpacked += cuda_iov_partial_length_start; buffer_size -= cuda_iov_partial_length_start; pConvertor->current_iov_partial_length = 0; cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; @@ -897,13 +835,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; source += cached_cuda_iov_nb_bytes_list_h[i]; - unpacked_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; } else { if (buffer_size > 0) { cuda_iov_partial_length_end = buffer_size; - unpacked_w_cache += cuda_iov_partial_length_end; + total_unpacked += cuda_iov_partial_length_end; cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; source += cuda_iov_partial_length_end; pConvertor->current_iov_partial_length = 
cached_cuda_iov_nb_bytes_list_h[i] - cuda_iov_partial_length_end; @@ -919,27 +857,16 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - if (pConvertor->current_iov_partial_length > 0) { - pConvertor->current_cuda_iov_pos += nb_blocks_used - 1; - } else { - pConvertor->current_cuda_iov_pos += nb_blocks_used; - } cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); } -#endif for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } - total_unpacked += unpacked_w_cache; - pConvertor->bConverted += unpacked_w_cache; + pConvertor->bConverted += total_unpacked; DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); iov[0].iov_len = total_unpacked; diff --git a/test/datatype/ddt_benchmark.c 
b/test/datatype/ddt_benchmark.c index c8c3fd7db45..d961ef34e4e 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -793,6 +793,8 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk int32_t length = 0, done1 = 0, done2 = 0; TIMER_DATA_TYPE start, end, unpack_start, unpack_end; long total_time, unpack_time = 0; + int j, t_error = 0; + unsigned char *mat_char; dt_length = compute_buffer_length(pdt, count); printf("length %lu\n", dt_length); @@ -890,7 +892,18 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk if( done1 == 0 ) { done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + + } +#if defined (TEST_CHAR) + /* mat_char = (unsigned char *)ptemp; + for (j = 0; j < max_data; j++) { + if (mat_char[j] != 'a') { + t_error ++; + printf("error %d, %c\n", j, mat_char[j]); + } } + printf("total error %d\n", t_error);*/ +#endif if( done2 == 0 ) { GET_TIME( unpack_start ); @@ -1306,13 +1319,13 @@ int main( int argc, char* argv[] ) OBJ_RELEASE( pdt ); assert( pdt == NULL ); } - for (blk_len = 2000; blk_len <= 2000; blk_len += 500) { + for (blk_len = 51; blk_len <= 51; blk_len += 500) { printf( ">>--------------------------------------------<<\n" ); printf( "Vector data-type (60000 times %d double stride 512)\n", blk_len ); pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 0; i < 4; i++) { - // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); + for (i = 0; i < 1; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } diff --git a/test/datatype/ddt_lib.h b/test/datatype/ddt_lib.h index 0f6bbc2cb37..ef462ce0f31 100644 --- a/test/datatype/ddt_lib.h +++ b/test/datatype/ddt_lib.h @@ -34,9 +34,9 @@ #define DUMP_DATA_AFTER_COMMIT 0x00000001 #define 
CHECK_PACK_UNPACK 0x00000002 -#define TEST_DOUBLE +//#define TEST_DOUBLE //#define TEST_FLOAT -//#define TEST_CHAR +#define TEST_CHAR extern uint32_t outputFlags; From 540e44843c9a08ccd4d854d117edd0fe18e2bb78 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 13 Nov 2015 16:48:09 -0500 Subject: [PATCH 080/190] now cuda iov is {nc_disp, c_disp} --- .../cuda/opal_datatype_cuda_internal.cuh | 8 ++-- .../cuda/opal_datatype_pack_cuda_kernel.cu | 22 ++++++----- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 35 +++++++++-------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 35 +++++++++-------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 39 +++++++++---------- test/datatype/ddt_benchmark.c | 4 +- 6 files changed, 73 insertions(+), 70 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index b1c36b66e14..ea4afa0b989 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -59,8 +59,8 @@ typedef struct { } ddt_cuda_iov_dist_non_cached_t; typedef struct { - size_t ptr_offset; - uint32_t nb_bytes; + size_t ncontig_disp; + size_t contig_disp; } ddt_cuda_iov_dist_cached_t; typedef struct { @@ -138,9 +138,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); -__global__ void 
opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 93fb188ddcd..ddfd68b0e4c 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,13 +88,14 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { - uint32_t i, j, _nb_bytes; - size_t src_offset; - unsigned char *dst; + uint32_t i, j; + size_t _nb_bytes; + size_t src_offset, dst_offset; unsigned char *_source_tmp, *_destination_tmp; uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -110,15 +111,16 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di __syncthreads(); for (i 
= 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; - dst = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - destination_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; _source_tmp = source_base + src_offset; + _destination_tmp = destination_base + dst_offset; /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)dst % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)dst % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -128,7 +130,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; - _destination_tmp = dst + j * alignment; + _destination_tmp = destination_base + dst_offset + j * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); diff --git 
a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 1d14c000977..f1ce6dbda7d 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -965,6 +965,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; + size_t destionation_disp = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -1073,17 +1074,18 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + j * thread_per_block * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - 
cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -1091,18 +1093,21 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* handle residue */ if (residue_desc != 0) { /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, 
cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } } - cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1128,8 +1133,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #endif for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; - destination += cached_cuda_iov_nb_bytes_list_h[i]; total_packed += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used++; @@ -1143,9 +1146,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", 
destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); +// cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base, destination_base); pConvertor->current_cuda_iov_pos += nb_blocks_used; } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index f98a8c0b2ea..9cf705ae7e3 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,15 +46,17 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { 
uint32_t i, j; - size_t dst_offset; + size_t dst_offset, src_offset; unsigned char *src; unsigned char *_source_tmp, *_destination_tmp; - uint32_t _nb_bytes; - uint32_t current_cuda_iov_pos = cuda_iov_pos; - + size_t _nb_bytes; + uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + __shared__ uint32_t nb_tasks; uint32_t copy_count; uint8_t alignment; @@ -68,24 +70,23 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; - if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - // if (threadIdx.x == 0) printf("cuda_iov_partial_length_start %d", cuda_iov_partial_length_start); - dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes - cuda_iov_partial_length_start; - } - _destination_tmp = destination_base + dst_offset; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - source_disp - source_partial_disp; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + src_offset = cuda_iov_dist[current_cuda_iov_pos].contig_disp - source_disp; + dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; _nb_bytes = cuda_iov_partial_length_start; } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % 
gridDim.x) && cuda_iov_partial_length_end != 0) { _nb_bytes = cuda_iov_partial_length_end; - } else { - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; } - if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)src % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + + _destination_tmp = destination_base + dst_offset; + _source_tmp = source_base + src_offset; + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)src % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -97,7 +98,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); }*/ if (j < copy_count) { - _source_tmp = src + j * alignment; + _source_tmp = source_base + src_offset + j * alignment; _destination_tmp = destination_base + dst_offset + j * alignment; /* if (threadIdx.x == 0) { printf("_src %p, dst %p, alignment %d, blk %d, j %d, count %d\n", _source_tmp, _destination_tmp, alignment, blockIdx.x, j, copy_count); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 50009710d2d..dc356d96471 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -663,6 +663,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint8_t cuda_iov_is_cached = 0; size_t cuda_iov_partial_length_start = 0; size_t 
cuda_iov_partial_length_end = 0; + size_t source_disp = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -765,17 +766,18 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + j * thread_per_block * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, 
nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; } @@ -783,18 +785,20 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (residue_desc != 0) { /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; } } - - cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, 
sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice); + /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; + cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice); opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -826,15 +830,11 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_unpacked += cuda_iov_partial_length_start; buffer_size -= cuda_iov_partial_length_start; pConvertor->current_iov_partial_length = 0; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - source += cuda_iov_partial_length_start; cuda_iov_start_pos ++; nb_blocks_used ++; } for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - source += cached_cuda_iov_nb_bytes_list_h[i]; total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; @@ -842,9 +842,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (buffer_size > 0) { cuda_iov_partial_length_end = buffer_size; total_unpacked += cuda_iov_partial_length_end; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - source += cuda_iov_partial_length_end; - pConvertor->current_iov_partial_length = cached_cuda_iov_nb_bytes_list_h[i] - cuda_iov_partial_length_end; nb_blocks_used ++; } buffer_size = 0; @@ -859,7 +856,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif cudaMemcpyAsync(cuda_iov_contig_buf_d_current, 
cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); } for (i = 0; i < NB_STREAMS; i++) { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index d961ef34e4e..e879e5c0192 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -895,14 +895,14 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk } #if defined (TEST_CHAR) - /* mat_char = (unsigned char *)ptemp; + mat_char = (unsigned char *)ptemp; for (j = 0; j < max_data; j++) { if (mat_char[j] != 'a') { t_error ++; printf("error %d, %c\n", j, mat_char[j]); } } - printf("total error %d\n", t_error);*/ + printf("total error %d\n", t_error); #endif if( done2 == 0 ) { From 180382b5d6249c2ec4658d2d8367b440d337c44c Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 13 Nov 2015 18:33:48 -0500 Subject: [PATCH 081/190] clean up kernel, put variables uses multiple times into register --- .../datatype/cuda/opal_datatype_pack_cuda_kernel.cu | 8 +++++--- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 13 +++++++------ .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 1 - 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 
ddfd68b0e4c..92a96d1cb2b 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -91,11 +91,12 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, j; - size_t _nb_bytes; + uint32_t _nb_bytes; size_t src_offset, dst_offset; unsigned char *_source_tmp, *_destination_tmp; uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -111,9 +112,10 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di __syncthreads(); for (i = 0; i < nb_tasks; i++) { + contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - destination_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; + dst_offset = contig_disp - destination_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; _source_tmp = source_base + src_offset; _destination_tmp = destination_base + dst_offset; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 9cf705ae7e3..f2c337ea682 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ 
b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -50,12 +50,12 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ { uint32_t i, j; size_t dst_offset, src_offset; - unsigned char *src; unsigned char *_source_tmp, *_destination_tmp; - size_t _nb_bytes; + uint32_t _nb_bytes; uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; - size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + size_t contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -70,12 +70,13 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - source_disp - source_partial_disp; + contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ + src_offset = contig_disp - source_disp - source_partial_disp; dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - src_offset = cuda_iov_dist[current_cuda_iov_pos].contig_disp - source_disp; + src_offset = contig_disp - source_disp; dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; _nb_bytes = 
cuda_iov_partial_length_start; } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index dc356d96471..d400e05efcf 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -854,7 +854,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); } From 9ba68cab24cf197acf7b0ad6025908faa2350dca Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 13 Nov 2015 22:18:41 -0500 Subject: [PATCH 082/190] cached cuda iov is working for count > 1 another checkpoint now convertor->count > 1 is woring --- opal/datatype/cuda/opal_datatype_cuda.cu | 12 ++++ .../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 27 +++++--- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 43 +++++++------ .../cuda/opal_datatype_unpack_cuda_kernel.cu | 33 +++++++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 63 ++++++++++--------- 6 files 
changed, 119 insertions(+), 63 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 471c6e63709..ec33b5c0e4d 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -355,12 +355,24 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d { int i; size_t iov_size = 0; + size_t ddt_size; + convertor->current_iov_partial_length = 0; + convertor->current_cuda_iov_pos = 0; + if (ddt_offset == 0) { + return; + } + opal_datatype_type_size(convertor->pDesc, &ddt_size); + ddt_offset = ddt_offset % ddt_size; for(i = 0; i < cuda_iov_count; i++) { iov_size += cached_cuda_iov_nb_bytes_list_h[i]; if (iov_size > ddt_offset) { convertor->current_iov_partial_length = iov_size - ddt_offset; convertor->current_cuda_iov_pos = i; break; + } else if (iov_size == ddt_offset){ + convertor->current_iov_partial_length = 0; + convertor->current_cuda_iov_pos = i+1; + break; } } } diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index ea4afa0b989..82a28420580 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -138,9 +138,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* 
source_base, unsigned char* destination_base); -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 92a96d1cb2b..2564fe1393c 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,7 +88,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, j; uint32_t _nb_bytes; @@ -97,6 +97,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; size_t contig_disp; + uint32_t 
_my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -107,15 +110,20 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di if (blockIdx.x < (nb_blocks_used % gridDim.x)) { nb_tasks ++; } - // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d\n", cuda_iov_count, ddt_extent, current_count); + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); } __syncthreads(); for (i = 0; i < nb_tasks; i++) { - contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; - dst_offset = contig_disp - destination_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + dst_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - destination_disp; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; _source_tmp = source_base + src_offset; _destination_tmp = destination_base + dst_offset; @@ -128,7 +136,12 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di alignment = ALIGNMENT_CHAR; } copy_count = _nb_bytes / 
alignment; - + /* + if (threadIdx.x == 0 && nb_tasks != 0) { + printf("pack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index f1ce6dbda7d..fc9181e902b 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -966,6 +966,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; size_t destionation_disp = 0; + opal_datatype_count_t convertor_current_count; + OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -1117,20 +1119,19 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #endif } + /* now we use cached cuda iov */ cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + convertor_current_count = pConvertor->current_count; - /* now we use cached cuda iov */ - if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { - cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; - cuda_iov_end_pos = cached_cuda_iov_count; - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_contig_buf_h_current = 
cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { total_packed += cached_cuda_iov_nb_bytes_list_h[i]; @@ -1141,16 +1142,22 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* break; } } + if (!buffer_isfull) { + pConvertor->current_count ++; + cuda_iov_start_pos = 0; + cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; + } + } #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif -// cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base, destination_base); - 
pConvertor->current_cuda_iov_pos += nb_blocks_used; - } + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, source_base, destination_base); + pConvertor->current_cuda_iov_pos += nb_blocks_used; + pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index f2c337ea682..f6ee8e0bfc4 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,7 +46,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; size_t dst_offset, src_offset; @@ -54,8 +54,11 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ uint32_t 
_nb_bytes; uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; - size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + size_t source_partial_disp = 0; size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -66,17 +69,26 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (blockIdx.x < nb_blocks_used % gridDim.x) { nb_tasks ++; } + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d, ddt_size %d\n", cuda_iov_count, ddt_extent, current_count, ddt_size); } __syncthreads(); + if (cuda_iov_partial_length_start != 0) { + source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + } + for (i = 0; i < nb_tasks; i++) { - contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ - src_offset = contig_disp - source_disp - source_partial_disp; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp - source_partial_disp; + dst_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + 
(_my_cuda_iov_iteration + current_count) * ddt_extent; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - src_offset = contig_disp - source_disp; + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp; dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; _nb_bytes = cuda_iov_partial_length_start; } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { @@ -93,7 +105,12 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ alignment = ALIGNMENT_CHAR; } copy_count = _nb_bytes / alignment; - + /* + if (threadIdx.x == 0 && nb_tasks != 0) { + printf("unpack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ for (j = threadIdx.x; j < copy_count; j += blockDim.x) { /* if (threadIdx.x == 0) { if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index d400e05efcf..49355e8e017 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -664,6 +664,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ size_t cuda_iov_partial_length_start = 0; size_t cuda_iov_partial_length_end = 0; size_t source_disp = 0; + opal_datatype_count_t convertor_current_count; + OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -807,32 +809,31 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cached cuda iov is prepared 
in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); #endif } - - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; /* now we use cached cuda iov */ - if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { - opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); - cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; - cuda_iov_end_pos = cached_cuda_iov_count; - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + convertor_current_count = pConvertor->current_count; + + printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); + if (pConvertor->current_iov_partial_length > 0) { + cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; + total_unpacked += cuda_iov_partial_length_start; + buffer_size -= cuda_iov_partial_length_start; + pConvertor->current_iov_partial_length = 0; + cuda_iov_start_pos ++; + nb_blocks_used ++; + } + #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - printf("[00000] partial_length %ld, pos 
%d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); - if (pConvertor->current_iov_partial_length > 0) { - cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; - total_unpacked += cuda_iov_partial_length_start; - buffer_size -= cuda_iov_partial_length_start; - pConvertor->current_iov_partial_length = 0; - cuda_iov_start_pos ++; - nb_blocks_used ++; - } + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; @@ -849,14 +850,20 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ break; } } + if (!buffer_isfull) { + pConvertor->current_count ++; + cuda_iov_start_pos = 0; + cuda_iov_end_pos = cached_cuda_iov_count; + } + } #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - } + 
opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); From 12a3adeed3c5d00b3074f66dba9e8b0699afac5f Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 16:02:11 -0500 Subject: [PATCH 083/190] move the cuda iov caching into a seperate function --- opal/datatype/cuda/opal_datatype_cuda.cu | 79 +++++++++++++ opal/datatype/cuda/opal_datatype_cuda.cuh | 2 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 110 +++--------------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 107 ++--------------- 4 files changed, 105 insertions(+), 193 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index ec33b5c0e4d..5747eb2b3a5 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -325,6 +325,85 @@ void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } +/* cached_cuda_iov_d is not ready until explicitlt sync with cuda stream 0 +*/ +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t thread_per_block, nb_blocks_used; + size_t length_per_iovec; + uint8_t alignment; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; + ddt_cuda_iov_dist_cached_t *cuda_iov_h = NULL; + 
cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t ncontig_disp_base; + size_t contig_disp = 0; + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; + cuda_iov_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + + for (i = 0; i < ddt_iov_count; i++) { + length_per_iovec = ddt_iov[i].iov_len; + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; + } else { + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, 
cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + nb_blocks_used ++; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + } + } + /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ + cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cudaMemcpyAsync(cached_cuda_iov_d, cuda_iov_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + *cuda_iov_count = nb_blocks_used; + return OPAL_SUCCESS; +} + void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 8e30726ace2..4a71ab37d63 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh 
@@ -131,6 +131,8 @@ void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index fc9181e902b..ddc2ec08a89 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -932,40 +932,21 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t* out_size, size_t* max_data ) { - uint32_t i, j; - uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t length, buffer_size, length_per_iovec; - unsigned char *destination, *destination_base, *source_base, *source; + size_t buffer_size; + unsigned char *destination, *destination_base, *source_base; size_t total_packed; - int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; - uint32_t convertor_flags; -// dt_elem_desc_t* description; -// dt_elem_desc_t* pElem; -// dt_stack_t* pStack; - uint8_t alignment, orig_alignment; -// int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; - uintptr_t *cuda_iov_contig_buf_h_current = NULL; - uintptr_t *cuda_iov_contig_buf_d_current = NULL; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - int 
iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; - const struct iovec *ddt_iov = NULL; - uint32_t ddt_iov_count = 0; - size_t iov_len = 0; - uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; + uint32_t cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; - uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; - uint8_t cuda_iov_is_cached = 0; - size_t destionation_disp = 0; opal_datatype_count_t convertor_current_count; OPAL_PTRDIFF_TYPE ddt_extent; @@ -973,14 +954,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* TIMER_DATA_TYPE start, end, start_total, end_total; long total_time, move_time; #endif - - /*description = pConvertor->use_desc->desc; - pStack = pConvertor->pStack + pConvertor->stack_pos; - pElem = &(description[pStack->index]); - printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); - */ - -// assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { @@ -1021,7 +994,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* total_packed = 0; cuda_streams->current_stream_id = 0; - // orig_stack_index = pStack->index; destination_base = destination; #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1032,14 +1004,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* GET_TIME(start); #endif - opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); - assert(ddt_iov != NULL); opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; 
assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1053,69 +1022,20 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); - #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - - for (i = 0; i < ddt_iov_count; i++) { - length_per_iovec = ddt_iov[i].iov_len; - source = (size_t)(ddt_iov[i].iov_base) + source_base; - - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + j * thread_per_block * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; - if ( (j+1) * thread_per_block <= count_desc) { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; - } else { - 
cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - - /* handle residue */ - if (residue_desc != 0) { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } + if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is 
cached, count %d\n", nb_blocks_used);); + } else { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Pack cache cuda iov is failed\n");); + return OPAL_ERROR; } - /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; - cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack cached cuda iov is prepared in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack cuda iov is cached in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used); ); #endif } @@ -1124,7 +1044,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; convertor_current_count = pConvertor->current_count; @@ -1154,14 +1074,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); - DT_CUDA_DEBUG ( opal_cuda_output(2, 
"kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, source_base, destination_base); pConvertor->current_cuda_iov_pos += nb_blocks_used; pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); pConvertor->bConverted += total_packed; DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 49355e8e017..fe8475a201a 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -627,43 +627,24 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint32_t* out_size, size_t* max_data ) { - uint32_t i, j; - uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t length, buffer_size, length_per_iovec; + size_t buffer_size; unsigned char *source, *source_base, *destination_base, *destination; size_t total_unpacked; - int32_t complete_flag = 0; uint8_t buffer_isfull = 0; uint8_t free_required = 0; - uint32_t convertor_flags; -// dt_elem_desc_t* description; -// dt_elem_desc_t* pElem; -// dt_stack_t* pStack; - uint8_t alignment, orig_alignment; -// int32_t orig_stack_index; cudaError_t cuda_err; 
ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; - uintptr_t *cuda_iov_contig_buf_h_current = NULL; - uintptr_t *cuda_iov_contig_buf_d_current = NULL; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; - const struct iovec *ddt_iov = NULL; - uint32_t ddt_iov_count = 0; - size_t iov_len = 0; - uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; + uint32_t cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; - uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; - uint8_t cuda_iov_is_cached = 0; size_t cuda_iov_partial_length_start = 0; size_t cuda_iov_partial_length_end = 0; - size_t source_disp = 0; opal_datatype_count_t convertor_current_count; OPAL_PTRDIFF_TYPE ddt_extent; @@ -676,12 +657,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ GET_TIME(start_total); #endif -/* description = pConvertor->use_desc->desc; - pStack = pConvertor->pStack + pConvertor->stack_pos; - pElem = &(description[pStack->index]); - printf("size elem %d, size %lu\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); -*/ - #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -710,9 +685,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ move_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); #endif - -// cuda_err = cudaEventRecord(current_cuda_device->memcpy_event, current_cuda_device->cuda_streams->opal_cuda_stream[0]); -// 
opal_cuda_check_error(cuda_err); #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -721,17 +693,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ buffer_size = iov[0].iov_len; total_unpacked = 0; cuda_streams->current_stream_id = 0; - convertor_flags = pConvertor->flags; -// orig_stack_index = pStack->index; source_base = source; - opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); - assert(ddt_iov != NULL); opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -745,68 +712,17 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); - - #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - - for (i = 0; i < ddt_iov_count; i++) { - length_per_iovec = ddt_iov[i].iov_len; - destination = (size_t)(ddt_iov[i].iov_base) + destination_base; - - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description 
%d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + j * thread_per_block * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; - if ( (j+1) * thread_per_block <= count_desc) { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; - } else { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - } - - /* handle residue */ - if (residue_desc != 0) { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, 
cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - } + if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); } - /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; - cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice); - opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cached cuda iov is prepared in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cuda iov is cached in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); #endif } @@ -816,11 +732,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; convertor_current_count = pConvertor->current_count; - printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); if (pConvertor->current_iov_partial_length > 0) { 
cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; total_unpacked += cuda_iov_partial_length_start; @@ -862,12 +777,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); pConvertor->bConverted += total_unpacked; DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); From a39bc35c19369a18eb802b80de0879a72297c0c9 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 16:30:27 -0500 Subject: [PATCH 084/190] these two variables are useless now --- opal/datatype/cuda/opal_datatype_cuda.cu | 4 ---- opal/datatype/cuda/opal_datatype_cuda_internal.cuh | 2 -- 2 files changed, 6 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 5747eb2b3a5..a71099c41a3 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -223,8 +223,6 @@ 
int32_t opal_ddt_cuda_kernel_init(void) cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_h)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_d)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); @@ -264,8 +262,6 @@ int32_t opal_ddt_cuda_kernel_fini(void) cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_contig_buf_h); - cudaFree(cuda_iov_pipeline_block->cuda_iov_contig_buf_d); cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 82a28420580..5e7bb41d0dc 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -74,8 +74,6 @@ typedef struct { ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_h; ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_d; 
ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; - uintptr_t *cuda_iov_contig_buf_h; - uintptr_t *cuda_iov_contig_buf_d; cudaStream_t *cuda_stream; int32_t cuda_stream_id; cudaEvent_t cuda_event; From 1c3fb4554568a9a51ba54ef80c130eb4f1a6684c Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 16:49:55 -0500 Subject: [PATCH 085/190] fix a bug for ib, current count of convertor should be set in set_cuda_iov_position --- opal/datatype/cuda/opal_datatype_cuda.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index a71099c41a3..3129c320068 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -433,10 +433,12 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d size_t ddt_size; convertor->current_iov_partial_length = 0; convertor->current_cuda_iov_pos = 0; + convertor->current_count = 0; if (ddt_offset == 0) { return; } opal_datatype_type_size(convertor->pDesc, &ddt_size); + convertor->current_count = ddt_offset / ddt_size; ddt_offset = ddt_offset % ddt_size; for(i = 0; i < cuda_iov_count; i++) { iov_size += cached_cuda_iov_nb_bytes_list_h[i]; From 02d6560326b353870fea34c97eedd993a3025ace Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 21:00:09 -0500 Subject: [PATCH 086/190] cleanup, move cudamalloc into cache cuda iov --- opal/datatype/cuda/opal_datatype_cuda.cu | 61 +++++++++++++------ opal/datatype/cuda/opal_datatype_cuda.cuh | 2 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 14 ++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 13 ++-- 4 files changed, 56 insertions(+), 34 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 3129c320068..d0927dc4162 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -281,15 +281,13 @@ void* 
opal_ddt_cached_cuda_iov_init(uint32_t size) { #if OPAL_DATATYPE_CUDA_IOV_CACHE ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)malloc(sizeof(ddt_cuda_iov_total_cached_t)); - ddt_cuda_iov_dist_cached_t *tmp_cuda_iov_d = NULL; - cudaMalloc((void **)(&tmp_cuda_iov_d), sizeof(ddt_cuda_iov_dist_cached_t) * size); uint32_t *tmp_nb_bytes = (uint32_t *)malloc(sizeof(uint32_t) * size); - if (tmp != NULL && tmp_cuda_iov_d != NULL && tmp_nb_bytes != NULL) { - tmp->cuda_iov_dist_d = tmp_cuda_iov_d; + if (tmp != NULL && tmp_nb_bytes != NULL) { + tmp->cuda_iov_dist_d = NULL; tmp->cuda_iov_count = size; tmp->cuda_iov_is_cached = 0; tmp->nb_bytes_h = tmp_nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, cuda_iov_d %p, nb_bytes_h %p, size %d.\n", tmp, tmp_cuda_iov_d, tmp_nb_bytes, size); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, nb_bytes_h %p, size %d.\n", tmp, tmp_nb_bytes, size); ); return tmp; } else { DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist_cached for ddt is failed.\n"); ); @@ -323,7 +321,7 @@ void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) /* cached_cuda_iov_d is not ready until explicitlt sync with cuda stream 0 */ -int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count) +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, residue_desc; @@ -331,12 +329,17 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ size_t length_per_iovec; uint8_t alignment; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - ddt_cuda_iov_dist_cached_t *cuda_iov_h = NULL; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t 
*cached_cuda_iov_dist_d = NULL; + ddt_cuda_iov_dist_cached_t *cuda_iov_dist_h = NULL; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; size_t ncontig_disp_base; size_t contig_disp = 0; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + + opal_datatype_t *datatype = (opal_datatype_t *)pConvertor->pDesc; opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); if (ddt_iov == NULL) { @@ -344,10 +347,18 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ return OPAL_ERROR; } + + cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)opal_ddt_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); + if (cached_cuda_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not init cuda iov\n");); + return OPAL_ERROR; + } + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; - cuda_iov_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_dist_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + thread_per_block = CUDA_WARP_SIZE * 5; for (i = 0; i < ddt_iov_count; i++) { length_per_iovec = ddt_iov[i].iov_len; @@ -361,8 +372,8 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; - cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + 
cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; if ( (j+1) * thread_per_block <= count_desc) { cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; } else { @@ -372,21 +383,21 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } /* handle residue */ if (residue_desc != 0) { - cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; - cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, 
cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); @@ -394,8 +405,15 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ } } /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ - cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; - cudaMemcpyAsync(cached_cuda_iov_d, cuda_iov_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; + cudaMalloc((void **)(&cached_cuda_iov_dist_d), sizeof(ddt_cuda_iov_dist_cached_t) * (nb_blocks_used+1)); + if (cached_cuda_iov_dist_d == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not malloc cuda iov in GPU\n");); + return OPAL_ERROR; + } + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + cached_cuda_iov->cuda_iov_dist_d = cached_cuda_iov_dist_d; + datatype->cached_cuda_iov = cached_cuda_iov; *cuda_iov_count = nb_blocks_used; return OPAL_SUCCESS; } @@ -404,9 +422,10 @@ void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_i { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; if (datatype->cached_cuda_iov == NULL) { - datatype->cached_cuda_iov = opal_ddt_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); - } - *cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + *cached_cuda_iov = NULL; + } else { + *cached_cuda_iov = (ddt_cuda_iov_total_cached_t 
*)datatype->cached_cuda_iov; + } } void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) @@ -421,7 +440,9 @@ void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t c uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; - assert(datatype->cached_cuda_iov != NULL); + if (datatype->cached_cuda_iov == NULL) { + return 0; + } ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; return tmp->cuda_iov_is_cached; } diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 4a71ab37d63..8ad9b3ec658 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -131,7 +131,7 @@ void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); -int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count); +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count); } diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index ddc2ec08a89..c98d540e54e 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -1003,12 +1003,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - - opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); - cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; - assert(cached_cuda_iov_dist_d != NULL); - 
cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; - assert(cached_cuda_iov_nb_bytes_list_h != NULL); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1025,7 +1019,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + if (opal_ddt_cache_cuda_iov(pConvertor, &nb_blocks_used) == OPAL_SUCCESS) { opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); } else { @@ -1040,6 +1034,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* } /* now we use cached cuda iov */ + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index fe8475a201a..6808ab56fed 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -694,11 +694,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_unpacked = 0; cuda_streams->current_stream_id = 0; source_base = source; - opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); - cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; - assert(cached_cuda_iov_dist_d != NULL); - cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; - 
assert(cached_cuda_iov_nb_bytes_list_h != NULL); #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -715,7 +710,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + if (opal_ddt_cache_cuda_iov(pConvertor, &nb_blocks_used) == OPAL_SUCCESS) { opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); } @@ -727,6 +722,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ } /* now we use cached cuda iov */ + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; From 47cd9095f8260f921ae922e588307b29c5740617 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 17 Nov 2015 02:53:38 -0500 Subject: [PATCH 087/190] rearrange varibles --- opal/datatype/cuda/opal_datatype_cuda.cu | 2 +- opal/datatype/opal_datatype.h | 7 ++++--- opal/datatype/opal_datatype_create.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index d0927dc4162..f79e4e5ed0d 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -413,7 +413,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov } 
cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); cached_cuda_iov->cuda_iov_dist_d = cached_cuda_iov_dist_d; - datatype->cached_cuda_iov = cached_cuda_iov; + datatype->cached_cuda_iov = (unsigned char*)cached_cuda_iov; *cuda_iov_count = nb_blocks_used; return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index 6e161e96d76..1287cdb1410 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -131,13 +131,14 @@ struct opal_datatype_t { int iov_count; size_t max_data; /* size: 416, cachelines: 7, members: 18 */ -#if OPAL_CUDA_SUPPORT - void * cached_cuda_iov; -#endif /* OPAL_CUDA_SUPPORT */ /* last cacheline: 32 bytes */ struct iovec* cached_iovec; uint32_t cached_iovec_count; + +#if OPAL_CUDA_SUPPORT + unsigned char * cached_cuda_iov; +#endif /* OPAL_CUDA_SUPPORT */ }; typedef struct opal_datatype_t opal_datatype_t; diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index 44c0e3020b6..e57a7d6c668 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -102,7 +102,7 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) #if OPAL_CUDA_SUPPORT /* free cuda iov */ if (opal_datatype_cuda_kernel_support == 1 && datatype->cached_cuda_iov != NULL) { - opal_cached_cuda_iov_fini(datatype->cached_cuda_iov); + opal_cached_cuda_iov_fini((void*)datatype->cached_cuda_iov); datatype->cached_cuda_iov = NULL; } #endif /* OPAL_CUDA_SUPPORT */ From c953c5b590587a96cd863566cd4293a048060615 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 17 Nov 2015 18:13:00 -0500 Subject: [PATCH 088/190] if cuda_iov is not big enough, use realloc. 
However, cudaMallocHost does not work with realloc, so use malloc instead --- opal/datatype/cuda/opal_datatype_cuda.cu | 35 +++++++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index f79e4e5ed0d..cd74a081693 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -222,7 +222,10 @@ int32_t opal_ddt_cuda_kernel_init(void) cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + if (j == 0) { + // cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + } cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); @@ -261,7 +264,8 @@ int32_t opal_ddt_cuda_kernel_fini(void) if (cuda_iov_pipeline_block != NULL) { cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + 
free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; @@ -319,6 +323,22 @@ void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } +static inline int32_t opal_ddt_cached_cuda_iov_isfull(ddt_cuda_iov_total_cached_t *cached_cuda_iov, ddt_cuda_iov_dist_cached_t **cuda_iov_dist_h, uint32_t nb_blocks_used) +{ + if (nb_blocks_used < cached_cuda_iov->cuda_iov_count) { + return 0; + } else { +realloc_cuda_iov: + cached_cuda_iov->nb_bytes_h = (uint32_t *)realloc(cached_cuda_iov->nb_bytes_h, sizeof(uint32_t)*cached_cuda_iov->cuda_iov_count*2); + assert(cached_cuda_iov->nb_bytes_h != NULL); + cached_cuda_iov->cuda_iov_count *= 2; + if (nb_blocks_used >= cached_cuda_iov->cuda_iov_count) { + goto realloc_cuda_iov; + } + return 1; + } +} + /* cached_cuda_iov_d is not ready until explicitlt sync with cuda stream 0 */ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count) @@ -371,6 +391,13 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + if (opal_ddt_cached_cuda_iov_isfull(cached_cuda_iov, &(cuda_iov_pipeline_block->cuda_iov_dist_cached_h), nb_blocks_used + nb_blocks_per_description + 1)) { + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + cuda_iov_dist_h = (ddt_cuda_iov_dist_cached_t *)realloc(cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*cached_cuda_iov->cuda_iov_count); + assert(cuda_iov_dist_h != NULL); + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = cuda_iov_dist_h; + } 
+ for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; @@ -385,7 +412,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + // assert (nb_blocks_used < NUM_CUDA_IOV_PER_DDT); } /* handle residue */ @@ -400,7 +427,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + //assert (nb_blocks_used < NUM_CUDA_IOV_PER_DDT); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ } } From 5d3cca0612a2eff2e1766b8d26bdaae3b0d276d7 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 18 Nov 2015 15:26:31 -0500 Subject: [PATCH 089/190] make sure check pointer is not NULL before free it --- opal/datatype/cuda/opal_datatype_cuda.cu | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index cd74a081693..2df143f2c61 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -225,6 +225,8 @@ 
int32_t opal_ddt_cuda_kernel_init(void) if (j == 0) { // cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + } else { + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; } cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; @@ -262,10 +264,19 @@ int32_t opal_ddt_cuda_kernel_fini(void) cudaStreamDestroy(cuda_devices[i].cuda_streams->opal_cuda_stream[j]); cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block[j]; if (cuda_iov_pipeline_block != NULL) { - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); - cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); - //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); - free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h != NULL) { + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); + cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h = NULL; + } + if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d != NULL) { + cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); + cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d = NULL; + } + if (cuda_iov_pipeline_block->cuda_iov_dist_cached_h != NULL) { + //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; + } cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; From 9517b4dd75e79bb344dcfac0510bcda48dc7b874 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 24 Nov 2015 20:18:17 -0500 Subject: [PATCH 090/190] rewrite non 
cached iov, make it unified with cached iov checkpoint, rewrite non-cached version fix for non cached iov fix the non cached iov, set position should be put at first move ddt iov to cuda iov into a function merge iov cached and non-cached for non cached iov, if there is no enough cuda iov space, break --- opal/datatype/cuda/opal_datatype_cuda.cu | 115 ++++++- opal/datatype/cuda/opal_datatype_cuda.cuh | 28 +- .../cuda/opal_datatype_cuda_internal.cuh | 5 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 297 ++++++++++++------ .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 262 ++++++++++----- 5 files changed, 509 insertions(+), 198 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 2df143f2c61..2c76a327197 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -15,6 +15,7 @@ ddt_cuda_device_t *cuda_devices; ddt_cuda_device_t *current_cuda_device; struct iovec cuda_iov[CUDA_NB_IOV]; uint32_t cuda_iov_count; +uint32_t cuda_iov_cache_enabled; //uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; @@ -220,10 +221,9 @@ int32_t opal_ddt_cuda_kernel_init(void) for (j = 0; j < NB_STREAMS; j++) { cudaStreamCreate(&(cuda_streams->opal_cuda_stream[j])); cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), 
sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); if (j == 0) { - // cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); } else { cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; @@ -240,6 +240,7 @@ int32_t opal_ddt_cuda_kernel_init(void) current_cuda_device = &(cuda_devices[0]); /* init cuda_iov */ + cuda_iov_cache_enabled = 1; cuda_iov_count = CUDA_NB_IOV; // /* init size for double, float, char */ @@ -273,7 +274,6 @@ int32_t opal_ddt_cuda_kernel_fini(void) cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d = NULL; } if (cuda_iov_pipeline_block->cuda_iov_dist_cached_h != NULL) { - //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; } @@ -456,6 +456,85 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov return OPAL_SUCCESS; } +uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, size_t *buffer_size, uint32_t *nb_blocks_used, size_t *total_converted, size_t *contig_disp_out, uint32_t *current_ddt_iov_pos) +{ + size_t ncontig_disp_base; + size_t contig_disp = 0; + size_t current_cuda_iov_length = 0; + uint8_t buffer_isfull = 0; + uint8_t alignment; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t thread_per_block; + size_t length_per_iovec; + uint32_t i, j; + + thread_per_block = CUDA_WARP_SIZE * 5; + + for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + ncontig_disp_base = 
(size_t)(ddt_iov[i].iov_base) + ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; + length_per_iovec = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + length_per_iovec = ddt_iov[i].iov_len; + } + if (*buffer_size < length_per_iovec) { + pConvertor->current_iov_pos = i; + pConvertor->current_iov_partial_length = length_per_iovec - *buffer_size; + length_per_iovec = *buffer_size; + buffer_isfull = 1; + } + *buffer_size -= length_per_iovec; + *total_converted += length_per_iovec; + + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + if ((*nb_blocks_used + nb_blocks_per_description + 1) > (CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK)) { + break; + } + DT_CUDA_DEBUG ( opal_cuda_output(10, "DDT IOV to CUDA IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + current_cuda_iov_length = thread_per_block * alignment; + } else { + current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "DDT IOV to CUDA IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[*nb_blocks_used].contig_disp, 
current_cuda_iov_length); ); + (*nb_blocks_used) ++; + assert (*nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "DDT IOV to CUDA IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[*nb_blocks_used].contig_disp, current_cuda_iov_length); ); + (*nb_blocks_used) ++; + assert (*nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + *contig_disp_out = contig_disp; + *current_ddt_iov_pos = i; + return buffer_isfull; + +} + void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; @@ -513,6 +592,34 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d } } +void opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count) +{ + int i; + size_t iov_size = 0; + size_t ddt_size; + convertor->current_iov_partial_length = 0; + convertor->current_iov_pos = 0; + convertor->current_count = 0; + if (ddt_offset == 0) { + return; + } + opal_datatype_type_size(convertor->pDesc, &ddt_size); + convertor->current_count = ddt_offset / ddt_size; + ddt_offset = ddt_offset % ddt_size; + for(i = 0; i < ddt_iov_count; i++) 
{ + iov_size += ddt_iov[i].iov_len; + if (iov_size > ddt_offset) { + convertor->current_iov_partial_length = iov_size - ddt_offset; + convertor->current_iov_pos = i; + break; + } else if (iov_size == ddt_offset){ + convertor->current_iov_partial_length = 0; + convertor->current_iov_pos = i+1; + break; + } + } +} + void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) { #if 0 diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 8ad9b3ec658..c33ff606bd9 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -29,25 +29,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon uint32_t* out_size, size_t* max_data ); -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked); -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed); -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t 
opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked); void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -131,8 +119,12 @@ void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); +void opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count); + int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count); +uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, size_t *buffer_size, uint32_t *nb_blocks_used, size_t *total_packed, size_t *contig_disp_out, uint32_t *current_ddt_iov_pos); + } -#endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file +#endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 5e7bb41d0dc..72edcb3d8a3 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -71,8 +71,8 @@ typedef struct { } ddt_cuda_iov_total_cached_t; typedef struct { - ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_h; - ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_d; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_h; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_d; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; cudaStream_t *cuda_stream; int32_t cuda_stream_id; @@ -109,6 +109,7 @@ extern ddt_cuda_device_t *cuda_devices; extern ddt_cuda_device_t 
*current_cuda_device; extern struct iovec cuda_iov[CUDA_NB_IOV]; extern uint32_t cuda_iov_count; +extern uint32_t cuda_iov_cache_enabled; //extern uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index c98d540e54e..0137601bf70 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -664,9 +664,102 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_pack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); + size_t buffer_size; + unsigned char *destination; + size_t total_packed; + uint8_t transfer_required, free_required; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + + // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (iov[0].iov_len == 0) { + buffer_size = DT_CUDA_BUFFER_SIZE; + } else { + buffer_size = iov[0].iov_len; + } + + if (iov[0].iov_base == NULL) { + iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char *)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; + free_required = 1; + } else { + destination = (unsigned char *)iov[0].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 
0); + } + transfer_required = 1; + free_required = 1; + destination = pConvertor->gpu_buffer_ptr; + } + } + + total_packed = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + + /* start pack */ + if (cuda_iov_cache_enabled) { + opal_ddt_generic_simple_pack_function_cuda_iov_cached(pConvertor, destination, buffer_size, &total_packed); + } else { + opal_ddt_generic_simple_pack_function_cuda_iov_non_cached(pConvertor, destination, buffer_size, &total_packed); + } + + pConvertor->bConverted += total_packed; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); +#endif + + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; } +#if 0 + int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -927,17 +1020,111 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto return 0; } -int32_t 
opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +#endif + + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed) { uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size; - unsigned char *destination, *destination_base, *source_base; - size_t total_packed; - uint8_t buffer_isfull = 0, transfer_required, free_required; + unsigned char *destination_base, *source_base; + uint8_t buffer_isfull = 0; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t contig_disp = 0; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos, current_ddt_iov_pos; + OPAL_PTRDIFF_TYPE ddt_extent; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end; + long total_time; +#endif + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV non cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + cuda_streams->current_stream_id = 0; + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + source_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + destination_base = destination; + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + 
while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + nb_blocks_used = 0; + ddt_iov_start_pos = pConvertor->current_iov_pos; + ddt_iov_end_pos = ddt_iov_start_pos + IOV_PIPELINE_SIZE; + if (ddt_iov_end_pos > ddt_iov_count) { + ddt_iov_end_pos = ddt_iov_count; + } + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, total_packed, &contig_disp, ¤t_ddt_iov_pos); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + destination_base += contig_disp; + + if (!buffer_isfull) { + pConvertor->current_iov_pos 
= current_ddt_iov_pos; + if (current_ddt_iov_pos == ddt_iov_count) { + pConvertor->current_count ++; + pConvertor->current_iov_pos = 0; + source_base += ddt_extent; + } + } + + } + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + return OPAL_SUCCESS; +} + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed) +{ + uint32_t i; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + unsigned char *destination_base, *source_base; + uint8_t buffer_isfull = 0; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; @@ -951,65 +1138,14 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time, move_time; + TIMER_DATA_TYPE start, end; + long total_time; #endif - - // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); - if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { - if (iov[0].iov_len == 0) { - buffer_size = DT_CUDA_BUFFER_SIZE; - } else { - buffer_size = iov[0].iov_len; - } - - if (iov[0].iov_base == NULL) { - iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); - destination = (unsigned char *)iov[0].iov_base; - pConvertor->gpu_buffer_ptr = destination; - free_required = 1; - } else { - destination = (unsigned char *)iov[0].iov_base; - free_required = 0; - } - transfer_required = 0; - } else { - buffer_size = iov[0].iov_len; - if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { - pConvertor->gpu_buffer_ptr = NULL; - transfer_required = 0; - free_required = 0; - cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); - } else { - if 
(pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); - } - transfer_required = 1; - free_required = 1; - destination = pConvertor->gpu_buffer_ptr; - } - } DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - total_packed = 0; cuda_streams->current_stream_id = 0; destination_base = destination; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); -#endif - thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; source_base = (unsigned char*)pConvertor->pBaseBuf; @@ -1054,7 +1190,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - total_packed += cached_cuda_iov_nb_bytes_list_h[i]; + *total_packed += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used++; } else { @@ -1080,41 +1216,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); - - pConvertor->bConverted += total_packed; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); -#endif - if (transfer_required) { - cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, 
cudaMemcpyDeviceToHost); - } -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - move_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); -#endif - - iov[0].iov_len = total_packed; - *max_data = total_packed; - *out_size = 1; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end_total ); - total_time = ELAPSED_TIME( start_total, end_total ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); -#endif - - if( pConvertor->bConverted == pConvertor->local_size ) { - pConvertor->flags |= CONVERTOR_COMPLETED; - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); - pConvertor->gpu_buffer_ptr = NULL; - } - return 1; - } - return 0; + return OPAL_SUCCESS; } void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 6808ab56fed..bb54dfeeb0a 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -370,14 +370,89 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* return 0; } + int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_unpack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); + size_t buffer_size; + unsigned char *source; + size_t total_unpacked; + uint8_t free_required = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +#if 
defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + source = (unsigned char*)iov[0].iov_base; + free_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + } + source = pConvertor->gpu_buffer_ptr; + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); +#endif + + + buffer_size = iov[0].iov_len; + total_unpacked = 0; + + /* start unpack */ + if (cuda_iov_cache_enabled) { + opal_ddt_generic_simple_unpack_function_cuda_iov_cached(pConvertor, source, buffer_size, &total_unpacked); + } else { + opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached(pConvertor, source, buffer_size, &total_unpacked); + } + + pConvertor->bConverted += total_unpacked; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); + + iov[0].iov_len = total_unpacked; + *max_data = total_unpacked; + *out_size = 1; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + 
pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; } +#if 0 int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -622,85 +697,139 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver return 0; } -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +#endif + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked) { uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size; - unsigned char *source, *source_base, *destination_base, *destination; - size_t total_unpacked; + unsigned char *source_base, *destination_base; uint8_t buffer_isfull = 0; - uint8_t free_required = 0; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; - uint32_t cuda_iov_start_pos, cuda_iov_end_pos; - ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; - ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; - uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; - uint32_t cached_cuda_iov_count = 0; - size_t cuda_iov_partial_length_start = 0; - size_t cuda_iov_partial_length_end = 0; - opal_datatype_count_t convertor_current_count; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t contig_disp = 0; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos, current_ddt_iov_pos; OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, 
end, start_total, end_total; - long total_time, move_time; + TIMER_DATA_TYPE start, end; + long total_time; #endif + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV non cached, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, buffer_size); ); + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + cuda_streams->current_stream_id = 0; + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + source_base = source; + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + opal_ddt_set_ddt_iov_position(pConvertor, pConvertor->bConverted, ddt_iov, ddt_iov_count); + destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + nb_blocks_used = 0; + ddt_iov_start_pos = pConvertor->current_iov_pos; + ddt_iov_end_pos = ddt_iov_start_pos + IOV_PIPELINE_SIZE; + if (ddt_iov_end_pos > ddt_iov_count) { + ddt_iov_end_pos = ddt_iov_count; + } + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); #endif + buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, total_unpacked, 
&contig_disp, ¤t_ddt_iov_pos); + #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { - source = (unsigned char*)iov[0].iov_base; - free_required = 0; - } else { - if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { - cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); - pConvertor->gpu_buffer_ptr = NULL; - free_required = 0; - } else { - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, destination_base, source_base, 0, 0); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + source_base += contig_disp; + if (!buffer_isfull) { + pConvertor->current_iov_pos = current_ddt_iov_pos; + if (current_ddt_iov_pos == ddt_iov_count) { + pConvertor->current_count ++; + pConvertor->current_iov_pos = 0; + destination_base += ddt_extent; } - source = pConvertor->gpu_buffer_ptr; - cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); - free_required = 1; } } - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV cached, GPU base %p, unpack from buffer %p, total size %ld\n", - pConvertor->pBaseBuf, source, iov[0].iov_len); ); + 
for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + return OPAL_SUCCESS; +} + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked) +{ + uint32_t i; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + unsigned char *source_base, *destination_base; + uint8_t buffer_isfull = 0; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; + cudaStream_t *cuda_stream_iov = NULL; + uint32_t cuda_iov_start_pos, cuda_iov_end_pos; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + uint32_t cached_cuda_iov_count = 0; + size_t cuda_iov_partial_length_start = 0; + size_t cuda_iov_partial_length_end = 0; + opal_datatype_count_t convertor_current_count; + OPAL_PTRDIFF_TYPE ddt_extent; + #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - move_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); + TIMER_DATA_TYPE start, end; + long total_time; #endif - + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV cached, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, buffer_size); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - buffer_size = iov[0].iov_len; - total_unpacked = 0; + cuda_streams->current_stream_id = 0; source_base = source; - -#if defined (OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); -#endif - thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; destination_base = 
(unsigned char*)pConvertor->pBaseBuf; @@ -739,7 +868,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; - total_unpacked += cuda_iov_partial_length_start; + *total_unpacked += cuda_iov_partial_length_start; buffer_size -= cuda_iov_partial_length_start; pConvertor->current_iov_partial_length = 0; cuda_iov_start_pos ++; @@ -752,13 +881,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; + *total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; } else { if (buffer_size > 0) { cuda_iov_partial_length_end = buffer_size; - total_unpacked += cuda_iov_partial_length_end; + *total_unpacked += cuda_iov_partial_length_end; nb_blocks_used ++; } buffer_size = 0; @@ -783,28 +912,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); - pConvertor->bConverted += total_unpacked; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); - - iov[0].iov_len = total_unpacked; - *max_data = total_unpacked; - *out_size = 1; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end_total ); - total_time = ELAPSED_TIME( start_total, end_total ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); -#endif - - if( pConvertor->bConverted == pConvertor->local_size ) { - pConvertor->flags |= CONVERTOR_COMPLETED; - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { 
- opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); - pConvertor->gpu_buffer_ptr = NULL; - } - return 1; - } - return 0; + return OPAL_SUCCESS; } void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, From d242b0cfcb580ac1325652e038f31f62a07bf8d9 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Fri, 5 Feb 2016 12:36:36 -0800 Subject: [PATCH 091/190] apply loop unroll on packing kernels --- opal/datatype/cuda/opal_datatype_cuda.cu | 6 +- .../cuda/opal_datatype_cuda_internal.cuh | 6 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 512 +++++++++++++++++- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 17 +- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 13 +- test/datatype/Makefile.am | 2 +- test/datatype/ddt_benchmark.c | 125 +++-- test/datatype/ddt_lib.c | 8 + test/datatype/ddt_lib.h | 4 +- 9 files changed, 635 insertions(+), 58 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 2c76a327197..372edefa96a 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -358,7 +358,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov uint32_t count_desc, nb_blocks_per_description, residue_desc; uint32_t thread_per_block, nb_blocks_used; size_t length_per_iovec; - uint8_t alignment; + uint32_t alignment; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; ddt_cuda_iov_dist_cached_t *cached_cuda_iov_dist_d = NULL; @@ -389,14 +389,14 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; cuda_iov_dist_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - thread_per_block = CUDA_WARP_SIZE * 5; + thread_per_block = CUDA_WARP_SIZE * 32; for (i = 0; i < ddt_iov_count; i++) { length_per_iovec = ddt_iov[i].iov_len; 
ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - alignment = ALIGNMENT_DOUBLE; + alignment = ALIGNMENT_DOUBLE * 1; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 72edcb3d8a3..e6268fadc05 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -17,7 +17,7 @@ #define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H 0 #define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 #define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 -#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL 1 +#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL 0 #define OPAL_DATATYPE_CUDA_IOV_CACHE 1 @@ -40,6 +40,10 @@ #define ALIGNMENT_CHAR 1 #define NUM_CUDA_IOV_PER_DDT 150000 #define IOV_PIPELINE_SIZE 1000 +#define KERNEL_UNROLL 16 +#define UNROLL_16 16 +#define UNROLL_8 8 +#define UNROLL_4 4 #define TIMER_DATA_TYPE struct timeval #define GET_TIME(TV) gettimeofday( &(TV), NULL ) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 2564fe1393c..79138a72f9a 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -5,6 +5,7 @@ #include #include +#if 1 __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, @@ -13,17 +14,17 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, { uint32_t _i, tid, num_threads; uint32_t gap, nb_elements; - double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; + uint64_t *_source_tmp, *_destination_tmp, *_src_disp_tmp;; tid = threadIdx.x + blockIdx.x * blockDim.x; num_threads = gridDim.x * blockDim.x; gap = (extent - size) / 8; nb_elements = size / 8; - _src_disp_tmp = 
(double*)source; - _destination_tmp = (double*)destination; + _src_disp_tmp = (uint64_t*)source; + _destination_tmp = (uint64_t*)destination; _destination_tmp += tid; - +#if 0 for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; #if defined (OPAL_DATATYPE_CUDA_DEBUG) @@ -41,8 +42,225 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, #endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ _destination_tmp += num_threads; } +#else + for (_i = tid; _i < copy_loops*nb_elements; _i+=16*num_threads) { + uint64_t val[16]; + uint32_t _j; + uint32_t u; + uint64_t *mysrc = _src_disp_tmp + tid; + + #pragma unroll + for (u = 0; u < 16; u++) { + _j = _i + u * num_threads; + val[u] = *(mysrc + _j/num_threads*num_threads + _j/nb_elements * gap); + } + + #pragma unroll + for (u = 0; u < 16; u++) { + *_destination_tmp = val[u]; + _destination_tmp += num_threads; + } +/* + _j = _i; + val[0] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[1] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[2] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[3] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[4] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[5] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[6] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[7] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[8] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[9] = 
*(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[10] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[11] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[12] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[13] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[14] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[15] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + *_destination_tmp = val[0]; + _destination_tmp += num_threads; + *_destination_tmp = val[1]; + _destination_tmp += num_threads; + *_destination_tmp = val[2]; + _destination_tmp += num_threads; + *_destination_tmp = val[3]; + _destination_tmp += num_threads; + *_destination_tmp = val[4]; + _destination_tmp += num_threads; + *_destination_tmp = val[5]; + _destination_tmp += num_threads; + *_destination_tmp = val[6]; + _destination_tmp += num_threads; + *_destination_tmp = val[7]; + _destination_tmp += num_threads; + *_destination_tmp = val[8]; + _destination_tmp += num_threads; + *_destination_tmp = val[9]; + _destination_tmp += num_threads; + *_destination_tmp = val[10]; + _destination_tmp += num_threads; + *_destination_tmp = val[11]; + _destination_tmp += num_threads; + *_destination_tmp = val[12]; + _destination_tmp += num_threads; + *_destination_tmp = val[13]; + _destination_tmp += num_threads; + *_destination_tmp = val[14]; + _destination_tmp += num_threads; + *_destination_tmp = val[15]; + _destination_tmp += num_threads; +*/ + } +#endif +} + +#else + +#define SEG_ADD(s) \ + l += s; \ + while (l >= lines) { \ + l -= lines; \ + c += width; \ + } + +__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines, + size_t 
nb_size, + OPAL_PTRDIFF_TYPE nb_extent, + unsigned char * b_source, + unsigned char * b_destination ) +{ + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t num_threads = gridDim.x * blockDim.x; + + //size_t lines = (size_t)lines; + size_t size = nb_size / 8; + size_t extent = nb_extent / 8; + uint64_t * source = (uint64_t *) b_source; + uint64_t *destination = (uint64_t *) b_destination; + uint64_t val[KERNEL_UNROLL]; + + int col = 0; + for (int width = 32; width > 0 && col < size; width >>= 1) { + while (size-col >= width) { + const int warp_id = tid / width; + const int warp_tid = tid & (width-1); + const int warp_nb = num_threads / width; + const int c = col + warp_tid; + int l = warp_id * KERNEL_UNROLL; + uint64_t *src = source + c; + uint64_t *dst = destination + c; + for (int b=0; b= width) { \ + col -= width; \ + off += ext - width; \ + } + +#define ELEMSIZE 32 + +__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t +copy_loops, +size_t size, +OPAL_PTRDIFF_TYPE extent, +unsigned char * source, +unsigned char * destination ) +{ + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x ; + uint32_t num_threads = gridDim.x * blockDim.x; + + int col = 0; + int off = 0; + + COLOFF_INC(tid, size/ELEMSIZE, extent/ELEMSIZE); + + if (ELEMSIZE % 8 == 0) { + volatile uint64_t * __restrict__ dst = (uint64_t*)destination + +tid * ELEMSIZE/8; + for (int offset = tid; offset < copy_loops*size/ELEMSIZE; +offset+=num_threads) { + const volatile uint64_t * __restrict__ src = (uint64_t*)source + off * ELEMSIZE/8; +#if 1 + uint64_t val[ELEMSIZE/8]; + #pragma unroll + for (int i = 0; i < ELEMSIZE/8; i++) { + val[i] = src[i]; + } + #pragma unroll + for (int i = 0; i < ELEMSIZE/8; i++) { + dst[i] = val[i]; + } +#else + #pragma unroll + for (int i = 0; i < ELEMSIZE/8; i++) { + dst[i] = __ldg(src+i); + } +#endif + dst += num_threads*ELEMSIZE/8; + COLOFF_INC(num_threads, size/ELEMSIZE, extent/ELEMSIZE); + } + } +} +*/ +#endif + + __global__ void 
opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used) { uint32_t i, _copy_count; @@ -88,6 +306,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } +#if 0 __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, j; @@ -141,7 +360,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di printf("pack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); } __syncthreads(); - */ + */ for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; @@ -159,3 +378,286 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di } } } + +#else +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +{ + uint32_t i, j; + uint32_t _nb_bytes; + size_t src_offset, dst_offset; + unsigned char *_source_tmp, *_destination_tmp; + uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; + + __shared__ uint32_t nb_tasks_per_block; + __shared__ uint32_t WARP_SIZE; + __shared__ uint32_t nb_warp_per_block; + uint32_t copy_count; + uint8_t alignment; + uint64_t tmp_var_64[KERNEL_UNROLL]; + 
uint32_t tmp_var_32[KERNEL_UNROLL]; + unsigned char tmp_var_8[KERNEL_UNROLL]; + uint32_t u, k; + uint32_t copy_count_16, copy_count_8, copy_count_left; + + if (threadIdx.x == 0) { + nb_tasks_per_block = nb_blocks_used / gridDim.x; + if (blockIdx.x < (nb_blocks_used % gridDim.x)) { + nb_tasks_per_block ++; + } + if (nb_tasks_per_block >= 4) { + WARP_SIZE = 32; + } else if (nb_tasks_per_block == 1) { + WARP_SIZE = blockDim.x; + } else { + WARP_SIZE = 64; + } + nb_warp_per_block = blockDim.x / WARP_SIZE; + // nb_warp_per_block = 1; + // if (nb_tasks_per_block == ) + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d\n", cuda_iov_count, ddt_extent, current_count); + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); + } + __syncthreads(); + + const uint32_t warp_id_per_block = threadIdx.x / WARP_SIZE; + const uint32_t tid_per_warp = threadIdx.x & (WARP_SIZE - 1); + // uint32_t warp_id_per_block = 0; + // uint32_t tid_per_warp = threadIdx.x; + + for (i = warp_id_per_block; i < nb_tasks_per_block; i+= nb_warp_per_block) { + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + dst_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - destination_disp; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; + + _source_tmp = source_base + src_offset; + _destination_tmp = destination_base + dst_offset; + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && 
_nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + + // alignment = ALIGNMENT_DOUBLE; + copy_count = _nb_bytes / alignment; + /* + if (threadIdx.x == 0 && nb_tasks != 0) { + printf("pack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ + /* if (threadIdx.x == 0){ + printf("bytes %d, copy count %d, alignment %d, task %d, nb_block_used %d\n", _nb_bytes, copy_count, alignment, i, nb_blocks_used); + } */ + if (alignment == ALIGNMENT_DOUBLE) { + uint64_t *_source_base_64, *_destination_base_64; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_64 = (uint64_t *)(source_base + src_offset); + _destination_base_64 = (uint64_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_16; + _destination_base_64 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + 
*(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_8; + _destination_base_64 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_64[u] = *(_source_base_64 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_64 + j) = tmp_var_64[u]; + } else { + break; + } + } + } + } else if (alignment == ALIGNMENT_FLOAT) { + uint32_t *_source_base_32, *_destination_base_32; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_32 = (uint32_t *)(source_base + src_offset); + _destination_base_32 = (uint32_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_16; + _destination_base_32 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_8; + _destination_base_32 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - 
copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_32[u] = *(_source_base_32 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_32 + j) = tmp_var_32[u]; + } else { + break; + } + } + } + } else { + unsigned char *_source_base_8, *_destination_base_8; + + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_8 = (unsigned char *)(source_base + src_offset); + _destination_base_8 = (unsigned char *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_16; + _destination_base_8 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_8; + _destination_base_8 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_8[u] = *(_source_base_8 + j); + } else { + break; + } + } + 
#pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_8 + j) = tmp_var_8[u]; + } else { + break; + } + } + } + } + } +} +#endif diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 0137601bf70..dd23aa853ed 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -463,7 +463,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); #else - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + pack_contiguous_loop_cuda_kernel_global<<<1, 8*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -1056,7 +1056,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto } cuda_streams->current_stream_id = 0; - thread_per_block = CUDA_WARP_SIZE * 5; + thread_per_block = CUDA_WARP_SIZE * 4; nb_blocks = 256; opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); source_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; @@ -1146,8 +1146,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_streams->current_stream_id = 0; destination_base = destination; - thread_per_block = CUDA_WARP_SIZE * 5; - nb_blocks = 256; + thread_per_block = CUDA_WARP_SIZE * 8; + nb_blocks = 1; source_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, 
start to cache iov */ @@ -1211,12 +1211,19 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, source_base, destination_base); pConvertor->current_cuda_iov_pos += nb_blocks_used; pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); - +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack kernel %ld microsec\n", total_time); ); +#endif return OPAL_SUCCESS; } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index bb54dfeeb0a..f4e89accefe 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -830,7 +830,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cuda_streams->current_stream_id = 0; source_base = source; - thread_per_block = CUDA_WARP_SIZE * 5; + thread_per_block = CUDA_WARP_SIZE * 4; nb_blocks = 256; destination_base = (unsigned char*)pConvertor->pBaseBuf; @@ -908,10 +908,19 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, 
destination_base, nb_blocks_used ); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); - +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack kernel %ld microsec\n", total_time); ); +#endif + return OPAL_SUCCESS; } diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index e516e08ae6f..85e5559923e 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -35,7 +35,7 @@ ddt_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/ ddt_benchmark_SOURCES = ddt_benchmark.c ddt_lib.c ddt_lib.h ddt_benchmark_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) ddt_benchmark_CFLAGS = -I/mnt/sw/cuda/include -g -O0 -ddt_benchmark_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/mnt/sw/cuda/lib64 -lcudart +ddt_benchmark_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/shared/apps/cuda/CUDA-v7.5.18/lib64 -lcudart #ddt_test_old_SOURCES = ddt_test_old.c ddt_lib.c ddt_lib.h #ddt_test_old_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index e879e5c0192..1ce768900db 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -198,24 +198,27 @@ static void fill_vectors(double* vp, int itera, int contig, int gap) for (i = (itera-1)*gap; i < (itera-1)*gap+contig; i++) { vp[i] = 1.1; } - - // printf("vector generated:\n"); - // for (i = 0; i < (itera-1)*gap+contig; i++) { - // printf("%1.f ", vp[i]); - // } - 
printf("\n"); + /* + printf("vector generated:\n"); + for (i = 0; i < (itera-1)*gap+contig; i++) { + printf("%1.f ", vp[i]); + if ((i+1) % gap == 0) printf("\n"); + } + printf("\n");*/ } static void verify_vectors(double *vp, int itera, int contig, int gap) { int i, j; int error = 0; + int count = 0; for (i = 0; i < itera-1; i++) { for (j = i*gap; j < (i+1)*gap; j++) { if (j >= i*gap && j < i*gap+contig) { if (vp[j] != 1.1) { error ++; } + count ++; } } } @@ -223,15 +226,19 @@ static void verify_vectors(double *vp, int itera, int contig, int gap) if (vp[i] != 1.1) { error ++; } + count ++; } - // printf("vector received:\n"); - // for (i = 0; i < (itera-1)*gap+contig; i++) { - // printf("%1.f ", vp[i]); - // } - if (error != 0) { - printf("%d error is found\n", error); +/* + printf("vector received:\n"); + for (i = 0; i < (itera-1)*gap+contig; i++) { + printf("%1.f ", vp[i]); + if ((i+1) % gap == 0) printf("\n"); + } + */ + if (error != 0) { + printf("%d errors out of %d\n", error, count); } else { - printf("no error is found\n"); + printf("no errors out of %d\n", count); } } @@ -249,9 +256,10 @@ vector_ddt( ompi_datatype_t* send_type, int send_count, TIMER_DATA_TYPE start, end, unpack_start, unpack_end; long total_time, unpack_time = 0, push_time = 0, pop_time = 0, pack_time = 0; size_t slength, rlength; + int shift_n = 0; - rlength = compute_buffer_length(recv_type, recv_count); - slength = compute_buffer_length(send_type, send_count); + rlength = compute_buffer_length(recv_type, recv_count) + sizeof(double)*shift_n; + slength = compute_buffer_length(send_type, send_count) + sizeof(double)*shift_n; cudaSetDevice(0); @@ -261,6 +269,7 @@ vector_ddt( ompi_datatype_t* send_type, int send_count, exit(-1); } cudaMemset(psrc, 0, slength); + psrc += sizeof(double)*shift_n; printf("cudamalloc psrc %p\n", psrc); error = cudaMalloc((void **)&pdst, rlength); @@ -269,6 +278,7 @@ vector_ddt( ompi_datatype_t* send_type, int send_count, exit(-1); } cudaMemset(pdst, 0, rlength); 
+ pdst += sizeof(double)*shift_n; printf("cudamalloc pdst %p\n", pdst); // error = cudaHostAlloc((void **)&ptemp, chunk, cudaHostAllocMapped); @@ -279,6 +289,7 @@ vector_ddt( ompi_datatype_t* send_type, int send_count, exit(-1); } memset(ptemp, 0, chunk); + ptemp += sizeof(double)*shift_n; printf("cudamallochost ptemp %p\n", ptemp); @@ -290,6 +301,10 @@ vector_ddt( ompi_datatype_t* send_type, int send_count, memset(psrc_host, 0, slength); memset(pdst_host, 0, rlength); + pdst_host += sizeof(double)*shift_n; + psrc_host += sizeof(double)*shift_n; + slength -= sizeof(double)*shift_n; + rlength -= sizeof(double)*shift_n; if (itera > 0) { fill_vectors((double *)psrc_host, itera, contig, gap); } @@ -708,6 +723,14 @@ static void fill_upper_matrix(void *matt, int msize) blklens[i] = msize - i; displs[i] = i*msize + i; } + /*int ct = 0; + for (i = 0; i < msize; i++) { + blklens[i] = msize - ct*160; + displs[i] = i*msize + ct*160; + if (i % 160 == 0 && i != 0) { + ct++; + } + }*/ for (i = 0; i < msize; i++) { start = displs[i]; end = start + blklens[i]; @@ -722,13 +745,14 @@ static void fill_upper_matrix(void *matt, int msize) free(blklens); free(displs); - // printf("matrix generate\n"); - // for (i = 0; i < msize; i++) { - // for (j = 0; j < msize; j++) { - // printf(" %1.f ", mat[i*msize+j]); - // } - // printf("\n"); - // } + /* + printf("matrix generate\n"); + for (i = 0; i < msize; i++) { + for (j = 0; j < msize; j++) { + printf(" %1.f ", mat[i*msize+j]); + } + printf("\n"); + }*/ } static void verify_mat_result(void *matt, int msize) @@ -752,6 +776,14 @@ static void verify_mat_result(void *matt, int msize) blklens[i] = msize - i; displs[i] = i*msize + i; } + /*int ct = 0; + for (i = 0; i < msize; i++) { + blklens[i] = msize - ct*160; + displs[i] = i*msize + ct*160; + if (i % 160 == 0 && i != 0) { + ct++; + } + }*/ for (i = 0; i < msize; i++) { start = displs[i]; end = start + blklens[i]; @@ -767,15 +799,15 @@ static void verify_mat_result(void *matt, int msize) } 
free(blklens); free(displs); - - // printf("matrix received\n"); - // for (i = 0; i < msize; i++) { - // for (j = 0; j < msize; j++) { - // printf(" %1.f ", mat[i*msize+j]); - // } - // printf("\n"); - // } - + /* + printf("matrix received\n"); + for (i = 0; i < msize; i++) { + for (j = 0; j < msize; j++) { + printf(" %1.f ", mat[i*msize+j]); + } + printf("\n"); + } + */ if (error != 0) { printf("error is found %d\n", error); } else { @@ -795,8 +827,9 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk long total_time, unpack_time = 0; int j, t_error = 0; unsigned char *mat_char; + int shift_n = 0; - dt_length = compute_buffer_length(pdt, count); + dt_length = compute_buffer_length(pdt, count) + sizeof(double) * shift_n; printf("length %lu\n", dt_length); #if defined (DDT_TEST_CUDA) @@ -809,6 +842,7 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk printf("CUDA error: %s\n", cudaGetErrorString(error)); exit(-1); } + psrc += sizeof(double) * shift_n; cudaMemset(psrc, 0, dt_length); printf("cudamalloc psrc %p\n", psrc); @@ -817,6 +851,7 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk printf("CUDA error: %s\n", cudaGetErrorString(error)); exit(-1); } + pdst += sizeof(double) * shift_n; cudaMemset(pdst, 0, dt_length); printf("cudamalloc pdst %p\n", pdst); @@ -825,6 +860,7 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk printf("CUDA error: %s\n", cudaGetErrorString(error)); exit(-1); } + ptemp += sizeof(double) * shift_n; memset(ptemp, 0, chunk); printf("cudamallochost ptemp %p\n", ptemp); @@ -833,6 +869,7 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk printf("CUDA error: %s\n", cudaGetErrorString(error)); exit(-1); } + phost += sizeof(double) * shift_n; memset(phost, 0, dt_length); printf("cudamallochost phost %p\n", phost); #else @@ -845,6 +882,7 @@ static int local_copy_with_convertor( 
ompi_datatype_t* pdt, int count, int chunk #endif #if defined (DDT_TEST_CUDA) + dt_length -= sizeof(double) * shift_n; if (msize > 0) { fill_upper_matrix(phost, msize); } @@ -904,6 +942,11 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk } printf("total error %d\n", t_error); #endif + /* double *mat_d = (double *)ptemp; + for (j = 0; j < max_data/sizeof(double); j++) { + printf("%1.f ", mat_d[j]); + }*/ + // printf("max data %d, ptemp %p \n", max_data, ptemp); if( done2 == 0 ) { GET_TIME( unpack_start ); @@ -936,6 +979,10 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk if( NULL != recv_convertor ) OBJ_RELEASE( recv_convertor ); #if defined (DDT_TEST_CUDA) + psrc -= sizeof(double) * shift_n; + pdst -= sizeof(double) * shift_n; + ptemp -= sizeof(double) * shift_n; + phost -= sizeof(double) * shift_n; if( NULL != pdst ) cudaFree( pdst ); if( NULL != psrc ) cudaFree( psrc ); if( NULL != ptemp ) cudaFreeHost( ptemp ); @@ -1224,12 +1271,12 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 2000; mat_size <= 2000; mat_size +=500) { + for (mat_size = 1000; mat_size <= 4000; mat_size +=1000) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 1; i <= 1; i++) { - local_copy_with_convertor(pdt, 1, 40000000, mat_size); + for (i = 1; i <= 5; i++) { + // local_copy_with_convertor(pdt, 1, 200000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -1292,13 +1339,13 @@ int main( int argc, char* argv[] ) } - for (blk_len = 1000; blk_len <= 1000; blk_len += 2) { + for (blk_len = 4000; blk_len <= 4000; blk_len += 2000) { printf( ">>--------------------------------------------<<\n" ); printf( "Vector data-type (1024 times %d double stride 512)\n", blk_len ); - pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len*2); + pdt = 
create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 0; i < 1; i++) { - // vector_ddt( pdt, 1, pdt, 1, 2000000 , 1000, blk_len, blk_len*2); + for (i = 0; i < 4; i++) { + vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } diff --git a/test/datatype/ddt_lib.c b/test/datatype/ddt_lib.c index 321a5c4be88..a96ec085ddd 100644 --- a/test/datatype/ddt_lib.c +++ b/test/datatype/ddt_lib.c @@ -363,6 +363,14 @@ ompi_datatype_t* upper_matrix( unsigned int mat_size ) disp[i] = i * mat_size + i; blocklen[i] = mat_size - i; } + /*int ct = 0; + for (i = 0; i < mat_size; i++) { + blocklen[i] = mat_size - ct*160; + disp[i] = i*mat_size + ct*160; + if (i % 160 == 0 && i != 0) { + ct++; + } + }*/ #if defined (TEST_DOUBLE) ompi_datatype_create_indexed( mat_size, blocklen, disp, &ompi_mpi_double.dt, &upper ); diff --git a/test/datatype/ddt_lib.h b/test/datatype/ddt_lib.h index ef462ce0f31..0f6bbc2cb37 100644 --- a/test/datatype/ddt_lib.h +++ b/test/datatype/ddt_lib.h @@ -34,9 +34,9 @@ #define DUMP_DATA_AFTER_COMMIT 0x00000001 #define CHECK_PACK_UNPACK 0x00000002 -//#define TEST_DOUBLE +#define TEST_DOUBLE //#define TEST_FLOAT -#define TEST_CHAR +//#define TEST_CHAR extern uint32_t outputFlags; From 2e8b414a82a453eccf26e0bf93e62f42953b71ba Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Mon, 22 Feb 2016 17:13:30 -0800 Subject: [PATCH 092/190] apply unroll to unpack --- opal/datatype/cuda/opal_datatype_cuda.cu | 2 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 12 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 7 +- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 288 ++++++++++++++++++ .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 5 +- test/datatype/ddt_benchmark.c | 2 +- 6 files changed, 303 insertions(+), 13 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 
372edefa96a..7d12a5d80db 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -389,7 +389,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; cuda_iov_dist_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - thread_per_block = CUDA_WARP_SIZE * 32; + thread_per_block = CUDA_WARP_SIZE * 64; for (i = 0; i < ddt_iov_count; i++) { length_per_iovec = ddt_iov[i].iov_len; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 79138a72f9a..81e7f7c4dcd 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -5,7 +5,7 @@ #include #include -#if 1 +#if 0 __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, @@ -43,20 +43,20 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, _destination_tmp += num_threads; } #else - for (_i = tid; _i < copy_loops*nb_elements; _i+=16*num_threads) { + for (_i = tid; _i < copy_loops*nb_elements; _i+=8*num_threads) { uint64_t val[16]; uint32_t _j; uint32_t u; uint64_t *mysrc = _src_disp_tmp + tid; #pragma unroll - for (u = 0; u < 16; u++) { + for (u = 0; u < 8; u++) { _j = _i + u * num_threads; val[u] = *(mysrc + _j/num_threads*num_threads + _j/nb_elements * gap); } #pragma unroll - for (u = 0; u < 16; u++) { + for (u = 0; u < 8; u++) { *_destination_tmp = val[u]; _destination_tmp += num_threads; } @@ -184,7 +184,7 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines, for (int b=0; bsize, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); #else - pack_contiguous_loop_cuda_kernel_global<<<1, 8*THREAD_PER_BLOCK, 0, 
cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + pack_contiguous_loop_cuda_kernel_global<<<32, 8*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -1056,7 +1056,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto } cuda_streams->current_stream_id = 0; - thread_per_block = CUDA_WARP_SIZE * 4; + thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); source_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; @@ -1095,6 +1095,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, source_base, destination_base); + //cudaStreamSynchronize(*cuda_stream_iov); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -1147,7 +1148,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_streams->current_stream_id = 0; destination_base = destination; thread_per_block = CUDA_WARP_SIZE * 8; - nb_blocks = 1; + nb_blocks = 4; source_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start to cache iov */ diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index f6ee8e0bfc4..4774abf5f38 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ 
b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,6 +46,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } +#if 0 __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; @@ -136,6 +137,293 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ } } +#else +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +{ + uint32_t i, j; + size_t dst_offset, src_offset; + unsigned char *_source_tmp, *_destination_tmp; + uint32_t _nb_bytes; + uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t source_partial_disp = 0; + size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; + + __shared__ uint32_t nb_tasks_per_block; + __shared__ uint32_t WARP_SIZE; + __shared__ uint32_t nb_warp_per_block; + uint32_t copy_count; + uint8_t alignment; + uint64_t tmp_var_64[KERNEL_UNROLL]; + uint32_t tmp_var_32[KERNEL_UNROLL]; + unsigned char tmp_var_8[KERNEL_UNROLL]; + uint32_t u, k; + uint32_t copy_count_16, copy_count_8, copy_count_left; + + if (threadIdx.x == 0) { + nb_tasks_per_block = nb_blocks_used / gridDim.x; + if (blockIdx.x < nb_blocks_used % gridDim.x) { + nb_tasks_per_block ++; + } + if (nb_tasks_per_block >= 4) { + WARP_SIZE = 
32; + } else if (nb_tasks_per_block == 1) { + WARP_SIZE = blockDim.x; + } else { + WARP_SIZE = 64; + } + nb_warp_per_block = blockDim.x / WARP_SIZE; + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d, ddt_size %d\n", cuda_iov_count, ddt_extent, current_count, ddt_size); + } + __syncthreads(); + + const uint32_t warp_id_per_block = threadIdx.x / WARP_SIZE; + const uint32_t tid_per_warp = threadIdx.x & (WARP_SIZE - 1); + + if (cuda_iov_partial_length_start != 0) { + source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + } + + for (i = warp_id_per_block; i < nb_tasks_per_block; i+= nb_warp_per_block) { + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp - source_partial_disp; + dst_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; + + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp; + dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; + _nb_bytes = cuda_iov_partial_length_start; + } else if (i == nb_tasks_per_block-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { + _nb_bytes = cuda_iov_partial_length_end; + } + + _destination_tmp = destination_base + dst_offset; + _source_tmp = source_base + src_offset; + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && 
_nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + + copy_count = _nb_bytes / alignment; + /* + if (threadIdx.x == 0 && nb_tasks != 0) { + printf("unpack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ + if (alignment == ALIGNMENT_DOUBLE) { + uint64_t *_source_base_64, *_destination_base_64; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_64 = (uint64_t *)(source_base + src_offset); + _destination_base_64 = (uint64_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_16; + _destination_base_64 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_8; + _destination_base_64 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left 
> 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_64[u] = *(_source_base_64 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_64 + j) = tmp_var_64[u]; + } else { + break; + } + } + } + } else if (alignment == ALIGNMENT_FLOAT) { + uint32_t *_source_base_32, *_destination_base_32; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_32 = (uint32_t *)(source_base + src_offset); + _destination_base_32 = (uint32_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_16; + _destination_base_32 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_8; + _destination_base_32 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_32[u] = *(_source_base_32 + j); + } else { + break; + } + } + 
#pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_32 + j) = tmp_var_32[u]; + } else { + break; + } + } + } + } else { + unsigned char *_source_base_8, *_destination_base_8; + + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_8 = (unsigned char *)(source_base + src_offset); + _destination_base_8 = (unsigned char *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_16; + _destination_base_8 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_8; + _destination_base_8 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_8[u] = *(_source_base_8 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_8 + j) = tmp_var_8[u]; + } else { + break; + } + } + } + } + } +} + +#endif + __global__ void 
unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index f4e89accefe..7e30f114d06 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -774,6 +774,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, destination_base, source_base, 0, 0); + //cudaStreamSynchronize(*cuda_stream_iov); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -830,8 +831,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cuda_streams->current_stream_id = 0; source_base = source; - thread_per_block = CUDA_WARP_SIZE * 4; - nb_blocks = 256; + thread_per_block = CUDA_WARP_SIZE * 8; + nb_blocks = 2; destination_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start to cache iov */ diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 1ce768900db..8b3c7ce7981 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1276,7 +1276,7 @@ int main( int argc, char* argv[] ) printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 5; i++) { - // local_copy_with_convertor(pdt, 1, 200000000, mat_size); + // local_copy_with_convertor(pdt, 1, 200000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From 0c680c2aef6fd066bf2811a9618d67a8db4fa4fe Mon Sep 17 00:00:00 2001 From: Wei Wu 
Date: Tue, 23 Feb 2016 15:48:40 -0800 Subject: [PATCH 093/190] fix a cuda event bug. cudaStreamWaitEvent is not blocking call. fix cuda stream --- opal/datatype/cuda/opal_datatype_cuda.cu | 43 ++++++++++----- opal/datatype/cuda/opal_datatype_cuda.cuh | 4 ++ .../cuda/opal_datatype_cuda_internal.cuh | 10 ++-- .../cuda/opal_datatype_pack_cuda_kernel.cu | 4 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 52 +++++++++---------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 4 +- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 42 +++++++-------- opal/datatype/opal_datatype_cuda.c | 23 ++++++++ opal/datatype/opal_datatype_cuda.h | 4 ++ opal/mca/btl/smcuda/btl_smcuda.c | 3 +- opal/mca/btl/smcuda/btl_smcuda_component.c | 9 +++- test/datatype/ddt_benchmark.c | 6 +-- 12 files changed, 128 insertions(+), 76 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 7d12a5d80db..0a15fe3ab2b 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -217,9 +217,16 @@ int32_t opal_ddt_cuda_kernel_init(void) /* init cuda stream */ ddt_cuda_stream_t *cuda_streams = (ddt_cuda_stream_t *)malloc(sizeof(ddt_cuda_stream_t)); - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; for (j = 0; j < NB_STREAMS; j++) { - cudaStreamCreate(&(cuda_streams->opal_cuda_stream[j])); + cudaStreamCreate(&(cuda_streams->ddt_cuda_stream[j])); + } + cuda_streams->current_stream_id = 0; + cuda_devices[i].cuda_streams = cuda_streams; + cudaEventCreate(&(cuda_devices[i].memcpy_event), cudaEventDisableTiming); + + /* init iov pipeline blocks */ + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; + for (j = 0; j < NB_PIPELINE_BLOCKS; j++) { cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * 
CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); @@ -228,14 +235,11 @@ int32_t opal_ddt_cuda_kernel_init(void) } else { cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; } - cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); - cuda_iov_pipeline_block->cuda_stream_id = 0; - cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); + // cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); + // cuda_iov_pipeline_block->cuda_stream_id = 0; + cudaEventCreateWithFlags(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); cuda_devices[i].cuda_iov_pipeline_block[j] = cuda_iov_pipeline_block; } - cuda_streams->current_stream_id = 0; - cuda_devices[i].cuda_streams = cuda_streams; - cudaEventCreate(&(cuda_devices[i].memcpy_event), cudaEventDisableTiming); } current_cuda_device = &(cuda_devices[0]); @@ -262,7 +266,7 @@ int32_t opal_ddt_cuda_kernel_fini(void) /* destory cuda stream and iov*/ ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; for (j = 0; j < NB_STREAMS; j++) { - cudaStreamDestroy(cuda_devices[i].cuda_streams->opal_cuda_stream[j]); + cudaStreamDestroy(cuda_devices[i].cuda_streams->ddt_cuda_stream[j]); cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block[j]; if (cuda_iov_pipeline_block != NULL) { if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h != NULL) { @@ -279,7 +283,6 @@ int32_t opal_ddt_cuda_kernel_fini(void) } cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; - cuda_iov_pipeline_block->cuda_stream_id = -1; free(cuda_iov_pipeline_block); cuda_iov_pipeline_block = NULL; } @@ -369,6 +372,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov size_t ncontig_disp_base; size_t contig_disp = 0; uint32_t 
*cached_cuda_iov_nb_bytes_list_h = NULL; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; opal_datatype_t *datatype = (opal_datatype_t *)pConvertor->pDesc; @@ -387,6 +391,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; + cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); cuda_iov_dist_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; thread_per_block = CUDA_WARP_SIZE * 64; @@ -735,13 +740,25 @@ void opal_cuda_check_error(cudaError_t err) void opal_ddt_cuda_d2dcpy_async(void* dst, const void* src, size_t count) { - cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->opal_cuda_stream[0]); + cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->ddt_cuda_stream[current_cuda_device->cuda_streams->current_stream_id]); } void opal_ddt_cuda_d2dcpy(void* dst, const void* src, size_t count) { - cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->opal_cuda_stream[0]); - cudaStreamSynchronize(current_cuda_device->cuda_streams->opal_cuda_stream[0]); + cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->ddt_cuda_stream[current_cuda_device->cuda_streams->current_stream_id]); + cudaStreamSynchronize(current_cuda_device->cuda_streams->ddt_cuda_stream[current_cuda_device->cuda_streams->current_stream_id]); +} + +void opal_ddt_cuda_set_cuda_stream() +{ + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id & (NB_STREAMS-1); +} + +int32_t opal_ddt_cuda_get_cuda_stream() +{ + 
return current_cuda_device->cuda_streams->current_stream_id; } void opal_dump_cuda_list(ddt_cuda_list_t *list) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index c33ff606bd9..cab006e0f3f 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -125,6 +125,10 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, size_t *buffer_size, uint32_t *nb_blocks_used, size_t *total_packed, size_t *contig_disp_out, uint32_t *current_ddt_iov_pos); +void opal_ddt_cuda_set_cuda_stream(); + +int32_t opal_ddt_cuda_get_cuda_stream(); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index e6268fadc05..31be1def712 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -30,7 +30,8 @@ #define THREAD_PER_BLOCK 32 #define CUDA_WARP_SIZE 32 #define TASK_PER_THREAD 2 -#define NB_STREAMS 8 +#define NB_STREAMS 4 +#define NB_PIPELINE_BLOCKS 4 #define CUDA_NB_IOV 1024*20 #define CUDA_IOV_LEN 1024*1204 #define CUDA_MAX_NB_BLOCKS 1024 @@ -51,8 +52,8 @@ typedef struct { - cudaStream_t opal_cuda_stream[NB_STREAMS]; - uint32_t current_stream_id; + cudaStream_t ddt_cuda_stream[NB_STREAMS]; + int32_t current_stream_id; } ddt_cuda_stream_t; typedef struct { @@ -79,7 +80,6 @@ typedef struct { ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_d; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; cudaStream_t *cuda_stream; - int32_t cuda_stream_id; cudaEvent_t cuda_event; } ddt_cuda_iov_pipeline_block_t; @@ -104,7 +104,7 @@ typedef struct { size_t buffer_free_size; size_t 
buffer_used_size; ddt_cuda_stream_t *cuda_streams; - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block[NB_STREAMS]; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block[NB_PIPELINE_BLOCKS]; cudaEvent_t memcpy_event; } ddt_cuda_device_t; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 81e7f7c4dcd..929d1f7de88 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -412,9 +412,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di if (nb_tasks_per_block >= 4) { WARP_SIZE = 32; } else if (nb_tasks_per_block == 1) { - WARP_SIZE = blockDim.x; + WARP_SIZE = 32;//blockDim.x; } else { - WARP_SIZE = 64; + WARP_SIZE = 32; } nb_warp_per_block = blockDim.x / WARP_SIZE; // nb_warp_per_block = 1; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 534c3372d60..882c26a72b4 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -193,7 +193,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector(opal_convertor_t* pCon total_packed += iov[iov_count].iov_len; // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]); } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -461,9 +461,9 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; // printf("extent %ld, size %ld, count %ld\n", _loop->extent, _end_loop->size, _copy_loops); #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, 
cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); #else - pack_contiguous_loop_cuda_kernel_global<<<32, 8*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + pack_contiguous_loop_cuda_kernel_global<<<32, 8*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -473,7 +473,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -525,9 +525,9 @@ void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, pipeline_blocks = 4; cuda_streams->current_stream_id = 0; _copy_loops_per_pipeline = (_copy_loops + pipeline_blocks -1 )/ pipeline_blocks; - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops_per_pipeline, _end_loop->size, _loop->extent, _source, _destination_dev); + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops_per_pipeline, _end_loop->size, _loop->extent, _source, _destination_dev); for (i = 1; i <= pipeline_blocks; i++) { - cudaMemcpyAsync(_destination_host, _destination_dev, _end_loop->size * _copy_loops_per_pipeline, cudaMemcpyDeviceToHost, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); + cudaMemcpyAsync(_destination_host, _destination_dev, _end_loop->size * _copy_loops_per_pipeline, cudaMemcpyDeviceToHost, 
cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); cuda_streams->current_stream_id ++; cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; _source += _loop->extent * _copy_loops_per_pipeline; @@ -536,9 +536,9 @@ void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, if (i == pipeline_blocks) { _copy_loops_per_pipeline = _copy_loops - _copy_loops_per_pipeline * (pipeline_blocks - 1); } - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops_per_pipeline, _end_loop->size, _loop->extent, _source, _destination_dev); + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops_per_pipeline, _end_loop->size, _loop->extent, _source, _destination_dev); } - cudaMemcpyAsync(_destination_host, _destination_dev, _end_loop->size * _copy_loops_per_pipeline, cudaMemcpyDeviceToHost, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); + cudaMemcpyAsync(_destination_host, _destination_dev, _end_loop->size * _copy_loops_per_pipeline, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -584,7 +584,7 @@ void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, GET_TIME(start); #endif - cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->opal_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[0]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -593,7 +593,7 @@ void 
pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -638,9 +638,9 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, printf("can not get dev mem, %s\n", cuda_err); } #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); + cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); #else - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -650,7 +650,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1055,16 +1055,14 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto return OPAL_ERROR; } - cuda_streams->current_stream_id = 0; + // cuda_streams->current_stream_id = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); source_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; destination_base = destination; - for 
(i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { @@ -1075,10 +1073,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto ddt_iov_end_pos = ddt_iov_count; } cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + cuda_err = cudaEventSynchronize(cuda_iov_pipeline_block->cuda_event); opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1090,7 +1089,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); #endif cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); @@ -1113,9 +1112,7 @@ int32_t 
opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto } - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); return OPAL_SUCCESS; } @@ -1145,10 +1142,10 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - cuda_streams->current_stream_id = 0; + // cuda_streams->current_stream_id = 0; destination_base = destination; thread_per_block = CUDA_WARP_SIZE * 8; - nb_blocks = 4; + nb_blocks = 16; source_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start to cache iov */ @@ -1182,6 +1179,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; + cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; convertor_current_count = pConvertor->current_count; @@ -1208,7 +1206,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); #endif 
opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); @@ -1219,7 +1217,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor->current_cuda_iov_pos += nb_blocks_used; pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -1265,7 +1263,7 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, // DBGPRINT("num_blocks %d, thread %d\n", nb_blocks, tasks_per_block); // DBGPRINT( "GPU pack 1. memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); - pack_contiguous_loop_cuda_kernel_global<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); + pack_contiguous_loop_cuda_kernel_global<<ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); cuda_streams->current_stream_id ++; cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 4774abf5f38..fb533d4cfc8 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -171,9 +171,9 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (nb_tasks_per_block >= 4) { WARP_SIZE = 32; } else if (nb_tasks_per_block == 1) { - WARP_SIZE = blockDim.x; + WARP_SIZE = 
32;//blockDim.x; } else { - WARP_SIZE = 64; + WARP_SIZE = 32; } nb_warp_per_block = blockDim.x / WARP_SIZE; // printf("cuda_iov_count %d, ddt_extent %d, current_count %d, ddt_size %d\n", cuda_iov_count, ddt_extent, current_count, ddt_size); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 7e30f114d06..703e52280b5 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -179,7 +179,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* p } complete_conversion: for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]); } *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ @@ -732,7 +732,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver return OPAL_ERROR; } - cuda_streams->current_stream_id = 0; + // cuda_streams->current_stream_id = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; source_base = source; @@ -741,7 +741,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]); } while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { @@ -753,10 +753,11 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver ddt_iov_end_pos = ddt_iov_count; } cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); cuda_iov_dist_h_current = 
cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + cuda_err = cudaEventSynchronize(cuda_iov_pipeline_block->cuda_event); opal_cuda_check_error(cuda_err); @@ -769,7 +770,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); #endif cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); @@ -790,9 +791,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver } } - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); return OPAL_SUCCESS; } @@ -829,10 +828,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ GET_TIME(start); #endif - cuda_streams->current_stream_id = 0; + // cuda_streams->current_stream_id = 0; source_base = source; thread_per_block = CUDA_WARP_SIZE * 8; - nb_blocks = 2; + nb_blocks = 64; destination_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start 
to cache iov */ @@ -864,6 +863,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; + cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; convertor_current_count = pConvertor->current_count; @@ -905,7 +905,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); @@ -915,7 +915,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( 
start, end ); @@ -955,9 +955,9 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); #else - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -967,7 +967,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -1002,7 +1002,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->ddt_cuda_stream[0]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -1011,7 
+1011,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1057,9 +1057,9 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, printf("can not get dev mem, %s\n", cuda_err); } #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); #else - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -1069,7 +1069,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); // cudaHostUnregister(_source); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1115,7 +1115,7 @@ void unpack_predefined_data_cuda( dt_elem_desc_t* ELEM, // DBGPRINT("num_blocks %d, thread %d\n", nb_blocks, tasks_per_block); // DBGPRINT( "GPU pack 1. 
memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); - unpack_contiguous_loop_cuda_kernel_global<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); + unpack_contiguous_loop_cuda_kernel_global<<ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); cuda_streams->current_stream_id ++; cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index c65e635a506..2aa73454724 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -247,6 +247,8 @@ int32_t opal_cuda_kernel_support_init(void) OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy_async ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cached_cuda_iov_fini ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_set_cuda_stream ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_get_cuda_stream ); if (OPAL_SUCCESS != cuda_kernel_table.opal_ddt_cuda_kernel_init_p()) { return OPAL_ERROR; @@ -273,6 +275,8 @@ int32_t opal_cuda_kernel_support_fini(void) cuda_kernel_table.opal_ddt_cuda_d2dcpy_async_p = NULL; cuda_kernel_table.opal_ddt_cuda_d2dcpy_p = NULL; cuda_kernel_table.opal_ddt_cached_cuda_iov_fini_p = NULL; + cuda_kernel_table.opal_ddt_cuda_set_cuda_stream_p = NULL; + cuda_kernel_table.opal_ddt_cuda_get_cuda_stream_p = NULL; dlclose(opal_datatype_cuda_kernel_handle); opal_datatype_cuda_kernel_handle = NULL; @@ -372,3 +376,22 @@ void opal_cached_cuda_iov_fini(void 
*cached_cuda_iov) } } +void opal_cuda_set_cuda_stream(void) +{ + if (cuda_kernel_table.opal_ddt_cuda_set_cuda_stream_p != NULL) { + cuda_kernel_table.opal_ddt_cuda_set_cuda_stream_p(); + } else { + opal_output(0, "opal_ddt_cuda_set_cuda_stream function pointer is NULL\n"); + } +} + +int32_t opal_cuda_get_cuda_stream(void) +{ + if (cuda_kernel_table.opal_ddt_cuda_get_cuda_stream_p != NULL) { + return cuda_kernel_table.opal_ddt_cuda_get_cuda_stream_p(); + } else { + opal_output(0, "opal_ddt_cuda_get_cuda_stream function pointer is NULL\n"); + return -2; + } +} + diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index 7b613470ab0..cb82e93add3 100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -29,6 +29,8 @@ struct opal_datatype_cuda_kernel_function_table { void (*opal_ddt_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); void (*opal_ddt_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); void (*opal_ddt_cached_cuda_iov_fini_p)(void *cached_cuda_iov); + void (*opal_ddt_cuda_set_cuda_stream_p)(void); + int32_t (*opal_ddt_cuda_get_cuda_stream_p)(void); int32_t (*opal_ddt_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); @@ -57,5 +59,7 @@ void opal_cuda_d2dcpy(void* dst, const void* src, size_t count); void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count); void* opal_cached_cuda_iov_init(void); void opal_cached_cuda_iov_fini(void *cached_cuda_iov); +void opal_cuda_set_cuda_stream(void); +int32_t opal_cuda_get_cuda_stream(void); #endif diff --git a/opal/mca/btl/smcuda/btl_smcuda.c 
b/opal/mca/btl/smcuda/btl_smcuda.c index eeafea57fb6..7e1441fd8e1 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1186,11 +1186,12 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, struct iovec iov; uint32_t iov_count = 1; size_t max_data; + opal_cuda_set_cuda_stream(); if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && remote_device != local_device) { unpack_convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(size, 0); opal_cuda_d2dcpy_async(unpack_convertor->gpu_buffer_ptr, remote_memory_address, size); iov.iov_base = unpack_convertor->gpu_buffer_ptr; - opal_output(0, "start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, unpack_convertor->gpu_buffer_ptr, size); + opal_output(0, "start D2D copy src %p, dst %p, size %lu, stream id %d\n", remote_memory_address, unpack_convertor->gpu_buffer_ptr, size, opal_cuda_get_cuda_stream()); } else { iov.iov_base = unpack_convertor->gpu_buffer_ptr; } diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index c4a299ef84a..f8bcb5eb865 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -895,17 +895,19 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, convertor->flags |= CONVERTOR_CUDA; unsigned char *local_address = my_cuda_dt_clone->current_unpack_convertor_pBaseBuf; remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; - opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld\n", local_address, remote_address, packed_size); + opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld, stream id %d\n", local_address, remote_address, packed_size, opal_cuda_get_cuda_stream()); + opal_cuda_set_cuda_stream(); mca_common_cuda_memp2pcpy(local_address, (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); 
my_cuda_dt_clone->current_unpack_convertor_pBaseBuf += packed_size; } else { /* unpack */ convertor->flags |= CONVERTOR_CUDA; + opal_cuda_set_cuda_stream(); if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(packed_size, 0); remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; opal_cuda_d2dcpy_async(convertor->gpu_buffer_ptr, remote_address, packed_size); iov.iov_base = convertor->gpu_buffer_ptr; - opal_output(0, "unpack, start D2D copy src %p, dst %p, size %lu\n", remote_address, convertor->gpu_buffer_ptr, packed_size); + opal_output(0, "unpack, start D2D copy src %p, dst %p, size %lu, stream id %d\n", remote_address, convertor->gpu_buffer_ptr, packed_size, opal_cuda_get_cuda_stream()); } else { iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; } @@ -968,6 +970,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, struct iovec iov; iov.iov_base = convertor->gpu_buffer_ptr + seq * mca_btl_smcuda_component.cuda_ddt_pipeline_size; iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; + opal_cuda_set_cuda_stream(); rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); packed_size = max_data; send_msg.packed_size = packed_size; @@ -985,6 +988,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; seq = 0; while (rv_dt != 1 && convertor->gpu_buffer_size > 0) { + opal_cuda_set_cuda_stream(); rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); iov.iov_base = (void*)((unsigned char*)iov.iov_base + mca_btl_smcuda_component.cuda_ddt_pipeline_size); convertor->gpu_buffer_size -= mca_btl_smcuda_component.cuda_ddt_pipeline_size; @@ -1041,6 +1045,7 @@ static void btl_smcuda_datatype_put(mca_btl_base_module_t* btl, size_t max_data = 0; iov.iov_len = convertor->local_size; 
iov.iov_base = convertor->gpu_buffer_ptr; + opal_cuda_set_cuda_stream(); rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); assert(rv_dt == 1); send_msg.lindex = lindex; diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 8b3c7ce7981..afc33e1075e 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1271,12 +1271,12 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 1000; mat_size <= 4000; mat_size +=1000) { + for (mat_size = 4000; mat_size <= 4000; mat_size +=1000) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 5; i++) { - // local_copy_with_convertor(pdt, 1, 200000000, mat_size); + local_copy_with_convertor(pdt, 1, 200000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -1345,7 +1345,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 4; i++) { - vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2); + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } From b6d56ebb85f9075be65806ea0bae651f48ebe9a2 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Fri, 26 Feb 2016 13:41:42 -0800 Subject: [PATCH 094/190] new vector kernel --- .../cuda/opal_datatype_pack_cuda_kernel.cu | 213 ++++++++++++++---- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 20 +- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 20 +- opal/datatype/opal_datatype_pack.c | 2 +- opal/datatype/opal_datatype_unpack.c | 2 +- test/datatype/ddt_benchmark.c | 6 +- 6 files changed, 186 insertions(+), 77 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu 
b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 929d1f7de88..0f887753bf5 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -148,62 +148,175 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, #else -#define SEG_ADD(s) \ - l += s; \ - while (l >= lines) { \ - l -= lines; \ - c += width; \ - } - -__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines, - size_t nb_size, - OPAL_PTRDIFF_TYPE nb_extent, - unsigned char * b_source, - unsigned char * b_destination ) +__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ) { - uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; - uint32_t num_threads = gridDim.x * blockDim.x; - - //size_t lines = (size_t)lines; - size_t size = nb_size / 8; - size_t extent = nb_extent / 8; - uint64_t * source = (uint64_t *) b_source; - uint64_t *destination = (uint64_t *) b_destination; - uint64_t val[KERNEL_UNROLL]; + uint32_t i, u, tid, num_threads, warp_id, tid_per_warp, nb_warps, nb_warps_x, nb_warps_y, pos_x, pos_y, size_last_y, size_last_x; + uint32_t size_nb, extent_nb; + uint64_t *_source_tmp, *_destination_tmp, *source_64, *destination_64, *_source_left_tmp, *_destination_left_tmp; + uint64_t val[UNROLL_16]; - int col = 0; - for (int width = 32; width > 0 && col < size; width >>= 1) { - while (size-col >= width) { - const int warp_id = tid / width; - const int warp_tid = tid & (width-1); - const int warp_nb = num_threads / width; - const int c = col + warp_tid; - int l = warp_id * KERNEL_UNROLL; - uint64_t *src = source + c; - uint64_t *dst = destination + c; - for (int b=0; b= lines) { \ +// l -= lines; \ +// c += width; \ +// } +// +// __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines, +// size_t nb_size, +// OPAL_PTRDIFF_TYPE nb_extent, +// unsigned 
char * b_source, +// unsigned char * b_destination ) +// { +// uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; +// uint32_t num_threads = gridDim.x * blockDim.x; +// +// //size_t lines = (size_t)lines; +// size_t size = nb_size / 8; +// size_t extent = nb_extent / 8; +// uint64_t * source = (uint64_t *) b_source; +// uint64_t *destination = (uint64_t *) b_destination; +// uint64_t val[KERNEL_UNROLL]; +// +// int col = 0; +// for (int width = 32; width > 0 && col < size; width >>= 1) { +// while (size-col >= width) { +// const int warp_id = tid / width; +// const int warp_tid = tid & (width-1); +// const int warp_nb = num_threads / width; +// const int c = col + warp_tid; +// int l = warp_id * KERNEL_UNROLL; +// uint64_t *src = source + c; +// uint64_t *dst = destination + c; +// for (int b=0; bddt_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -461,9 +459,9 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; // printf("extent %ld, size %ld, count %ld\n", _loop->extent, _end_loop->size, _copy_loops); #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #else - pack_contiguous_loop_cuda_kernel_global<<<32, 8*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + pack_contiguous_loop_cuda_kernel_global<<<16, 8*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, 
_source, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -473,7 +471,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -584,7 +582,7 @@ void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, GET_TIME(start); #endif - cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -593,7 +591,7 @@ void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -638,9 +636,9 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, printf("can not get dev mem, %s\n", cuda_err); } #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); + cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #else - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, 
cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -650,7 +648,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 703e52280b5..9be53d2d5a7 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -178,9 +178,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* p total_unpacked += iov[iov_count].iov_len; } complete_conversion: - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; @@ -955,9 +953,9 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _loop->extent, _source, 
_end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #else - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -967,7 +965,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -1002,7 +1000,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->ddt_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -1011,7 +1009,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1057,9 +1055,9 @@ void 
unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, printf("can not get dev mem, %s\n", cuda_err); } #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #else - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -1069,7 +1067,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); // cudaHostUnregister(_source); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index c8985db7913..1ae08565b73 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -416,7 +416,7 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); - return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); + //return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return 
opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index 5f51b3f828b..815f7b1e4bf 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -610,7 +610,7 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); - return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); + //return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index afc33e1075e..de3f43a8759 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1276,7 +1276,7 @@ int main( int argc, char* argv[] ) printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 5; i++) { - local_copy_with_convertor(pdt, 1, 200000000, mat_size); + // local_copy_with_convertor(pdt, 1, 200000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -1339,13 +1339,13 @@ int main( int argc, char* argv[] ) } - for (blk_len = 4000; blk_len <= 4000; blk_len += 2000) { + for (blk_len = 1000; blk_len <= 4000; blk_len += 2000) { printf( ">>--------------------------------------------<<\n" ); printf( "Vector data-type (1024 times %d double stride 512)\n", blk_len ); pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 4; i++) { - // vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2); + vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, 
blk_len+128); } } From 48b2d06017196b937b88e842a30565566ad1e0d6 Mon Sep 17 00:00:00 2001 From: rolfv Date: Fri, 7 Nov 2014 11:00:45 -0800 Subject: [PATCH 095/190] Add GPU packing and unpacking add cuda stream for submmitting multiple kernels. add suppot for predefined datatypes. Conflicts: opal/datatype/opal_datatype_unpack.c test/datatype/ddt_test.c --- opal/datatype/Makefile.am | 6 +- opal/datatype/cuda/Makefile | 40 ++ opal/datatype/cuda/opal_datatype_cuda.cu | 78 +++ opal/datatype/cuda/opal_datatype_cuda.cuh | 42 ++ .../cuda/opal_datatype_cuda_internal.cuh | 397 ++++++++++++++ .../cuda/opal_datatype_pack_cuda_kernel.cu | 502 ++++++++++++++++++ .../cuda/opal_datatype_pack_cuda_wrapper.cu | 196 +++++++ .../cuda/opal_datatype_unpack_cuda_kernel.cu | 288 ++++++++++ .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 123 +++++ opal/datatype/opal_datatype_gpu.c | 167 ++++++ opal/datatype/opal_datatype_gpu.h | 40 ++ opal/datatype/opal_datatype_module.c | 11 + opal/datatype/opal_datatype_pack.c | 19 +- opal/datatype/opal_datatype_pack.h | 2 + opal/datatype/opal_datatype_unpack.c | 13 +- opal/include/opal_config_top.h | 2 + test/datatype/ddt_test.c | 122 ++++- 17 files changed, 2019 insertions(+), 29 deletions(-) create mode 100644 opal/datatype/cuda/Makefile create mode 100644 opal/datatype/cuda/opal_datatype_cuda.cu create mode 100644 opal/datatype/cuda/opal_datatype_cuda.cuh create mode 100644 opal/datatype/cuda/opal_datatype_cuda_internal.cuh create mode 100644 opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu create mode 100644 opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu create mode 100644 opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu create mode 100644 opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu create mode 100644 opal/datatype/opal_datatype_gpu.c create mode 100644 opal/datatype/opal_datatype_gpu.h diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am index 6002a739f20..7683c2e8786 100644 --- a/opal/datatype/Makefile.am 
+++ b/opal/datatype/Makefile.am @@ -32,7 +32,8 @@ headers = \ opal_datatype_memcpy.h \ opal_datatype_pack.h \ opal_datatype_prototypes.h \ - opal_datatype_unpack.h + opal_datatype_unpack.h \ + opal_datatype_gpu.h noinst_LTLIBRARIES = \ @@ -60,10 +61,11 @@ libdatatype_la_SOURCES = \ opal_datatype_get_count.c \ opal_datatype_module.c \ opal_datatype_optimize.c \ + opal_datatype_gpu.c \ opal_datatype_pack.c \ opal_datatype_position.c \ opal_datatype_resize.c \ - opal_datatype_unpack.c + opal_datatype_unpack.c libdatatype_la_LIBADD = libdatatype_reliable.la diff --git a/opal/datatype/cuda/Makefile b/opal/datatype/cuda/Makefile new file mode 100644 index 00000000000..d42ab556fae --- /dev/null +++ b/opal/datatype/cuda/Makefile @@ -0,0 +1,40 @@ +CC = gcc +NVCC = nvcc +ARCH = ar +ARCHFLAGS = cr +RANLIB = ranlib +STLIB ?= opal_datatype_cuda.a +DYLIB ?= opal_datatype_cuda.so +CFLAGS = -g -G -O0 +EXTLIB = -L/home/wwu12/ompi/ompi-cuda/opal/datatype/.libs -ldatatype +INC = + +SRC := \ + opal_datatype_cuda.cu \ + opal_datatype_pack_cuda_kernel.cu \ + opal_datatype_pack_cuda_wrapper.cu \ + opal_datatype_unpack_cuda_kernel.cu \ + opal_datatype_unpack_cuda_wrapper.cu \ + +OBJ := $(SRC:.cu=.o) + +.PHONY: all clean cleanall + +all: $(STLIB) $(DYLIB) + +$(STLIB): $(OBJ) + $(ARCH) $(ARCHFLAGS) $@ $(OBJ) + $(RANLIB) $@ + +$(DYLIB): $(OBJ) + $(NVCC) $(CFLAGS) $(EXTLIB) -shared --compiler-options '-fPIC' -o $(DYLIB) $(OBJ) + +%.o: %.cu + $(NVCC) $(CFLAGS) $(EXTLIB) -gencode arch=compute_35,code=sm_35 $(INC) -c --compiler-options '-fPIC' $< -o $@ + +clean: + rm -f *.o + +cleanall: clean + rm -f $(STLIB) + rm -f $(DYLIB) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu new file mode 100644 index 00000000000..ea1f3633480 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -0,0 +1,78 @@ +#include "opal_datatype_cuda_internal.cuh" +#include "opal_datatype_cuda.cuh" +#include +#include + +ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; 
+unsigned char *pBaseBuf_GPU, *gpu_src_const, *gpu_dest_const; +ddt_cuda_stream_t* cuda_streams; + +void opal_datatype_cuda_init(void) +{ + uint32_t i; + + int cuda_device = OPAL_GPU_INDEX; + cudaSetDevice(cuda_device); + + cudaMalloc((void **)&cuda_desc_d, sizeof(ddt_cuda_desc_t)); + cudaMallocHost((void **)&cuda_desc_h, sizeof(ddt_cuda_desc_t)); + printf("size cuda_desc %d\n", sizeof(ddt_cuda_desc_t)); + + printf("malloc iov\n"); + for (i = 0; i < IOV_ARRAY_SIZE; i++) { + void* iov_base; + cudaMalloc( (void **)&iov_base, sizeof(char)*IOV_LEN); + cuda_desc_h->iov[i].iov_base = iov_base; + cuda_desc_h->iov[i].iov_len = IOV_LEN; + } + cudaMalloc((void **)(&pBaseBuf_GPU), sizeof(char)*IOV_LEN); + gpu_src_const = pBaseBuf_GPU; + gpu_dest_const = (unsigned char*)cuda_desc_h->iov[0].iov_base; + + cuda_desc_h->description_max_count = 0; + cuda_desc_h->description_count = 0; + + cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); + /* init cuda stream */ + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamCreate(&(cuda_streams->opal_cuda_stream[i])); + } + cuda_streams->current_stream_id = 0; +} + +void opal_datatype_cuda_fini(void) +{ + uint32_t i; + + if (cuda_desc_d != NULL) { + cudaFree(cuda_desc_d); + cuda_desc_d = NULL; + } + if (cuda_desc_h->description != NULL) { + cudaFree(cuda_desc_h->description); + cuda_desc_h->description = NULL; + } + printf("free iov\n"); + if (cuda_desc_h != NULL) { + for (i = 0; i < IOV_ARRAY_SIZE; i++) { + cudaFree(cuda_desc_h->iov[i].iov_base); + cuda_desc_h->iov[i].iov_base = NULL; + } + + cudaFreeHost(cuda_desc_h); + cuda_desc_h = NULL; + } + + /* destory cuda stream */ + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamDestroy(cuda_streams->opal_cuda_stream[i]); + } + free(cuda_streams); +} + +void opal_cuda_sync_device(void) +{ + cudaDeviceSynchronize(); + pBaseBuf_GPU = gpu_src_const; + cuda_desc_h->iov[0].iov_base = (void*)gpu_dest_const; +} \ No newline at end of file diff --git 
a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh new file mode 100644 index 00000000000..82ab78b2ff7 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -0,0 +1,42 @@ +#ifndef OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED +#define OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED + +extern "C" +{ + +void opal_datatype_cuda_init(void); + +void opal_datatype_cuda_fini(void); + +int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +void opal_cuda_sync_device(void); +} + +#endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh new file mode 100644 index 00000000000..84fbbe856a0 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -0,0 +1,397 @@ +#ifndef OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED +#define OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED + +#include +#include + +//#define OPAL_DATATYPE_CUDA_DRY_RUN +//#define OPAL_DATATYPE_CUDA_DEBUG +//#define OPAL_DATATYPE_CUDA_KERNEL_TIME +#define OPAL_ENABLE_DEBUG 1 + +#define DT_STATIC_STACK_SIZE 5 /**< This should be sufficient for most applications */ +#define IOV_ARRAY_SIZE 10 +#define IOV_LEN 1024*1024*200 + +#define 
THREAD_PER_BLOCK 32 +#define TASK_PER_THREAD 1 +#define OPAL_GPU_INDEX 0 +#define NB_STREAMS 4 + +#define OPAL_PTRDIFF_TYPE ptrdiff_t + +/* keep the last 16 bits free for data flags */ +#define CONVERTOR_DATATYPE_MASK 0x0000FFFF +#define CONVERTOR_SEND_CONVERSION 0x00010000 +#define CONVERTOR_RECV 0x00020000 +#define CONVERTOR_SEND 0x00040000 +#define CONVERTOR_HOMOGENEOUS 0x00080000 +#define CONVERTOR_NO_OP 0x00100000 +#define CONVERTOR_WITH_CHECKSUM 0x00200000 +#define CONVERTOR_CUDA 0x00400000 +#define CONVERTOR_CUDA_ASYNC 0x00800000 +#define CONVERTOR_TYPE_MASK 0x00FF0000 +#define CONVERTOR_STATE_START 0x01000000 +#define CONVERTOR_STATE_COMPLETE 0x02000000 +#define CONVERTOR_STATE_ALLOC 0x04000000 +#define CONVERTOR_COMPLETED 0x08000000 + +#define OPAL_DATATYPE_LOOP 0 +#define OPAL_DATATYPE_END_LOOP 1 +#define OPAL_DATATYPE_LB 2 +#define OPAL_DATATYPE_UB 3 +#define OPAL_DATATYPE_FIRST_TYPE 4 /* Number of first real type */ +#define OPAL_DATATYPE_INT1 4 +#define OPAL_DATATYPE_INT2 5 +#define OPAL_DATATYPE_INT4 6 +#define OPAL_DATATYPE_INT8 7 +#define OPAL_DATATYPE_INT16 8 +#define OPAL_DATATYPE_UINT1 9 +#define OPAL_DATATYPE_UINT2 10 +#define OPAL_DATATYPE_UINT4 11 +#define OPAL_DATATYPE_UINT8 12 +#define OPAL_DATATYPE_UINT16 13 +#define OPAL_DATATYPE_FLOAT2 14 +#define OPAL_DATATYPE_FLOAT4 15 +#define OPAL_DATATYPE_FLOAT8 16 +#define OPAL_DATATYPE_FLOAT12 17 +#define OPAL_DATATYPE_FLOAT16 18 +#define OPAL_DATATYPE_FLOAT_COMPLEX 19 +#define OPAL_DATATYPE_DOUBLE_COMPLEX 20 +#define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX 21 +#define OPAL_DATATYPE_BOOL 22 +#define OPAL_DATATYPE_WCHAR 23 +#define OPAL_DATATYPE_UNAVAILABLE 24 + +/* flags for the datatypes. 
*/ +#define OPAL_DATATYPE_FLAG_UNAVAILABLE 0x0001 /**< datatypes unavailable on the build (OS or compiler dependant) */ +#define OPAL_DATATYPE_FLAG_PREDEFINED 0x0002 /**< cannot be removed: initial and predefined datatypes */ +#define OPAL_DATATYPE_FLAG_COMMITED 0x0004 /**< ready to be used for a send/recv operation */ +#define OPAL_DATATYPE_FLAG_OVERLAP 0x0008 /**< datatype is unpropper for a recv operation */ +#define OPAL_DATATYPE_FLAG_CONTIGUOUS 0x0010 /**< contiguous datatype */ +#define OPAL_DATATYPE_FLAG_NO_GAPS 0x0020 /**< no gaps around the datatype, aka OPAL_DATATYPE_FLAG_CONTIGUOUS and extent == size */ +#define OPAL_DATATYPE_FLAG_USER_LB 0x0040 /**< has a user defined LB */ +#define OPAL_DATATYPE_FLAG_USER_UB 0x0080 /**< has a user defined UB */ +#define OPAL_DATATYPE_FLAG_DATA 0x0100 /**< data or control structure */ +/* + * We should make the difference here between the predefined contiguous and non contiguous + * datatypes. The OPAL_DATATYPE_FLAG_BASIC is held by all predefined contiguous datatypes. + */ +#define OPAL_DATATYPE_FLAG_BASIC (OPAL_DATATYPE_FLAG_PREDEFINED | \ + OPAL_DATATYPE_FLAG_CONTIGUOUS | \ + OPAL_DATATYPE_FLAG_NO_GAPS | \ + OPAL_DATATYPE_FLAG_DATA | \ + OPAL_DATATYPE_FLAG_COMMITED) + +/* typedefs ***********************************************************/ + +typedef struct opal_object_t opal_object_t; +typedef struct opal_class_t opal_class_t; +typedef void (*opal_construct_t) (opal_object_t *); +typedef void (*opal_destruct_t) (opal_object_t *); + + +/* types **************************************************************/ + +/** +* Class descriptor. +* +* There should be a single instance of this descriptor for each class +* definition. 
+*/ +struct opal_class_t { + const char *cls_name; /**< symbolic name for class */ + opal_class_t *cls_parent; /**< parent class descriptor */ + opal_construct_t cls_construct; /**< class constructor */ + opal_destruct_t cls_destruct; /**< class destructor */ + int cls_initialized; /**< is class initialized */ + int cls_depth; /**< depth of class hierarchy tree */ + opal_construct_t *cls_construct_array; + /**< array of parent class constructors */ + opal_destruct_t *cls_destruct_array; + /**< array of parent class destructors */ + size_t cls_sizeof; /**< size of an object instance */ +}; + +/** + * Base object. + * + * This is special and does not follow the pattern for other classes. + */ +struct opal_object_t { +#if OPAL_ENABLE_DEBUG + /** Magic ID -- want this to be the very first item in the + struct's memory */ + uint64_t obj_magic_id; +#endif + opal_class_t *obj_class; /**< class descriptor */ + volatile int32_t obj_reference_count; /**< reference count */ +#if OPAL_ENABLE_DEBUG + const char* cls_init_file_name; /**< In debug mode store the file where the object get contructed */ + int cls_init_lineno; /**< In debug mode store the line number where the object get contructed */ +#endif /* OPAL_ENABLE_DEBUG */ +}; + + + +struct ddt_elem_id_description { + uint16_t flags; /**< flags for the record */ + uint16_t type; /**< the basic data type id */ +}; +typedef struct ddt_elem_id_description ddt_elem_id_description; + +/* the basic element. A data description is composed + * by a set of basic elements. 
+ */ +struct ddt_elem_desc { + ddt_elem_id_description common; /**< basic data description and flags */ + uint32_t count; /**< number of blocks */ + uint32_t blocklen; /**< number of elements on each block */ + OPAL_PTRDIFF_TYPE extent; /**< extent of each block (in bytes) */ + OPAL_PTRDIFF_TYPE disp; /**< displacement of the first block */ +}; +typedef struct ddt_elem_desc ddt_elem_desc_t; + +struct ddt_loop_desc { + ddt_elem_id_description common; /**< basic data description and flags */ + uint32_t loops; /**< number of elements */ + uint32_t items; /**< number of items in the loop */ + size_t unused; /**< not used right now */ + OPAL_PTRDIFF_TYPE extent; /**< extent of the whole loop */ +}; +typedef struct ddt_loop_desc ddt_loop_desc_t; + +struct ddt_endloop_desc { + ddt_elem_id_description common; /**< basic data description and flags */ + uint32_t items; /**< number of elements */ + uint32_t unused; /**< not used right now */ + size_t size; /**< real size of the data in the loop */ + OPAL_PTRDIFF_TYPE first_elem_disp; /**< the displacement of the first block in the loop */ +}; +typedef struct ddt_endloop_desc ddt_endloop_desc_t; + +union dt_elem_desc { + ddt_elem_desc_t elem; + ddt_loop_desc_t loop; + ddt_endloop_desc_t end_loop; +}; +typedef union dt_elem_desc dt_elem_desc_t; + +/* dt_type_description */ +typedef uint32_t opal_datatype_count_t; + +struct dt_type_desc_t { + opal_datatype_count_t length; /**< the maximum number of elements in the description array */ + opal_datatype_count_t used; /**< the number of used elements in the description array */ + dt_elem_desc_t* desc; +}; +typedef struct dt_type_desc_t dt_type_desc_t; + +/* + * The datatype description. + */ +#define OPAL_DATATYPE_MAX_PREDEFINED 25 +#define OPAL_DATATYPE_MAX_SUPPORTED 47 +#define OPAL_MAX_OBJECT_NAME 64 + +struct opal_datatype_t { + opal_object_t super; /**< basic superclass */ + uint16_t flags; /**< the flags */ + uint16_t id; /**< data id, normally the index in the data array. 
*/ + uint32_t bdt_used; /**< bitset of which basic datatypes are used in the data description */ + size_t size; /**< total size in bytes of the memory used by the data if + the data is put on a contiguous buffer */ + OPAL_PTRDIFF_TYPE true_lb; /**< the true lb of the data without user defined lb and ub */ + OPAL_PTRDIFF_TYPE true_ub; /**< the true ub of the data without user defined lb and ub */ + OPAL_PTRDIFF_TYPE lb; /**< lower bound in memory */ + OPAL_PTRDIFF_TYPE ub; /**< upper bound in memory */ + /* --- cacheline 1 boundary (64 bytes) --- */ + size_t nbElems; /**< total number of elements inside the datatype */ + uint32_t align; /**< data should be aligned to */ + + /* Attribute fields */ + char name[OPAL_MAX_OBJECT_NAME]; /**< name of the datatype */ + /* --- cacheline 2 boundary (128 bytes) was 8-12 bytes ago --- */ + dt_type_desc_t desc; /**< the data description */ + dt_type_desc_t opt_desc; /**< short description of the data used when conversion is useless + or in the send case (without conversion) */ + + uint32_t btypes[OPAL_DATATYPE_MAX_SUPPORTED]; + /**< basic elements count used to compute the size of the + datatype for remote nodes. The length of the array is dependent on + the maximum number of datatypes of all top layers. + Reason being is that Fortran is not at the OPAL layer. 
*/ + /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */ + + /* size: 352, cachelines: 6, members: 15 */ + /* last cacheline: 28-32 bytes */ +}; + +typedef struct opal_datatype_t opal_datatype_t; + +/* convertor and stack */ +typedef struct opal_convertor_t opal_convertor_t; + +typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); +typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata ); +typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n, opal_convertor_t* pConvertor ); + +/* The master convertor struct (defined in convertor_internal.h) */ +struct opal_convertor_master_t; + +struct dt_stack_t { + int32_t index; /**< index in the element description */ + int16_t type; /**< the type used for the last pack/unpack (original or OPAL_DATATYPE_UINT1) */ + size_t count; /**< number of times we still have to do it */ + OPAL_PTRDIFF_TYPE disp; /**< actual displacement depending on the count field */ +}; +typedef struct dt_stack_t dt_stack_t; + +typedef int32_t (*conversion_fct_t)( opal_convertor_t* pConvertor, uint32_t count, + const void* from, size_t from_len, OPAL_PTRDIFF_TYPE from_extent, + void* to, size_t to_length, OPAL_PTRDIFF_TYPE to_extent, + OPAL_PTRDIFF_TYPE *advance ); + +typedef struct opal_convertor_master_t { + struct opal_convertor_master_t* next; + uint32_t remote_arch; + uint32_t flags; + uint32_t hetero_mask; + const size_t remote_sizes[OPAL_DATATYPE_MAX_PREDEFINED]; + conversion_fct_t* pFunctions; /**< the convertor functions pointer */ +} opal_convertor_master_t; + +struct opal_convertor_t { + opal_object_t super; /**< basic superclass */ + uint32_t remoteArch; /**< the remote architecture */ + uint32_t flags; /**< the properties of this convertor */ + size_t local_size; /**< overall length data on local machine, compared to bConverted */ + size_t remote_size; /**< overall length data on remote machine, compared to bConverted 
*/ + const opal_datatype_t* pDesc; /**< the datatype description associated with the convertor */ + const dt_type_desc_t* use_desc; /**< the version used by the convertor (normal or optimized) */ + opal_datatype_count_t count; /**< the total number of full datatype elements */ + uint32_t stack_size; /**< size of the allocated stack */ + /* --- cacheline 1 boundary (64 bytes) --- */ + unsigned char* pBaseBuf; /**< initial buffer as supplied by the user */ + dt_stack_t* pStack; /**< the local stack for the actual conversion */ + convertor_advance_fct_t fAdvance; /**< pointer to the pack/unpack functions */ + struct opal_convertor_master_t* master; /**< the master convertor */ + + /* All others fields get modified for every call to pack/unpack functions */ + uint32_t stack_pos; /**< the actual position on the stack */ + uint32_t partial_length; /**< amount of data left over from the last unpack */ + size_t bConverted; /**< # of bytes already converted */ + uint32_t checksum; /**< checksum computed by pack/unpack operation */ + uint32_t csum_ui1; /**< partial checksum computed by pack/unpack operation */ + size_t csum_ui2; /**< partial checksum computed by pack/unpack operation */ + /* --- cacheline 2 boundary (128 bytes) --- */ + dt_stack_t static_stack[DT_STATIC_STACK_SIZE]; /**< local stack for small datatypes */ + /* --- cacheline 3 boundary (192 bytes) was 56 bytes ago --- */ + +#if OPAL_CUDA_SUPPORT + memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ + void * stream; /**< CUstream for async copy */ +#endif + /* size: 248, cachelines: 4, members: 20 */ + /* last cacheline: 56 bytes */ +}; + +struct iovec { + void *iov_base; /* Starting address */ + size_t iov_len; /* Length in bytes */ +}; + +typedef struct { + dt_stack_t pStack[DT_STATIC_STACK_SIZE]; + dt_elem_desc_t* description; + struct iovec iov[IOV_ARRAY_SIZE]; + uint32_t stack_pos; + uint32_t stack_size; + unsigned char* pBaseBuf; /* const */ + OPAL_PTRDIFF_TYPE lb; /* const */ + OPAL_PTRDIFF_TYPE ub; /* 
const */ + size_t bConverted; + size_t local_size; /* const */ + uint32_t out_size; + size_t max_data; + uint32_t description_count; + uint32_t description_max_count; +} ddt_cuda_desc_t; + +typedef struct { + cudaStream_t opal_cuda_stream[NB_STREAMS]; + uint32_t current_stream_id; +} ddt_cuda_stream_t; + +extern ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; +extern unsigned char* pBaseBuf_GPU; +extern ddt_cuda_stream_t* cuda_streams; + +#define SAVE_STACK( PSTACK, INDEX, TYPE, COUNT, DISP) \ +do { \ + (PSTACK)->index = (INDEX); \ + (PSTACK)->type = (TYPE); \ + (PSTACK)->count = (COUNT); \ + (PSTACK)->disp = (DISP); \ +} while(0) + +#define PUSH_STACK( PSTACK, STACK_POS, INDEX, TYPE, COUNT, DISP) \ +do { \ + dt_stack_t* pTempStack = (PSTACK) + 1; \ + if (threadIdx.x == 0) { \ + SAVE_STACK( pTempStack, (INDEX), (TYPE), (COUNT), (DISP) ); \ + } \ + __syncthreads(); \ + (STACK_POS)++; \ + (PSTACK) = pTempStack; \ +} while(0) + +#define UPDATE_INTERNAL_COUNTERS( DESCRIPTION, POSITION, ELEMENT, COUNTER ) \ + do { \ + (ELEMENT) = &((DESCRIPTION)[(POSITION)]); \ + (COUNTER) = (ELEMENT)->elem.count; \ + } while (0) + +#if defined (OPAL_DATATYPE_CUDA_DEBUG) +#define DBGPRINT(fmt, ...) printf(fmt, __VA_ARGS__) +#else +#define DBGPRINT(fmt, ...) 
+#endif + +__device__ void pack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +__device__ void unpack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +__global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc); + +__global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_desc); + +__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ); + +__global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ); + +extern "C" +{ +int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor, size_t* position ); +} + +#endif /* OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu new file mode 100644 index 00000000000..d56ebfe6954 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -0,0 +1,502 @@ +#include "opal_datatype_cuda_internal.cuh" +#include +#include + +__device__ void pack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _src_disp = (*SOURCE) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t _i, tid, num_threads; + unsigned char* _destination = *DESTINATION; +// unsigned char* _source = _src_disp; + uint32_t gap, nb_elements; + double *_source_tmp, *_destination_tmp, 
*_src_disp_tmp; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + num_threads = gridDim.x * blockDim.x; + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +// num_task_per_thread = _copy_loops / num_threads; +// residue = _copy_loops % num_threads; +// if ( ((tid < residue) && (residue != 0)) || (residue == 0) ) { +// num_task_per_thread += residue == 0 ? 0 : 1; +// start_index = tid * num_task_per_thread; +// } else { +// start_index = residue * (num_task_per_thread+1) + (tid-residue) * num_task_per_thread; +// } +// +// end_index = start_index + num_task_per_thread; +// DBGPRINT("tid %d, start %d, end %d, num_task_per_thread %d, copy_loops %d\n", tid, start_index, end_index, num_task_per_thread, _copy_loops); +// for( _i = start_index; _i < end_index; _i++ ) { +// // OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _loop->extent, (CONVERTOR)->pBaseBuf, +// // (CONVERTOR)->pDesc, (CONVERTOR)->count ); +// _source = _src_disp + _i * _loop->extent; +// _destination = *DESTINATION + _i * _end_loop->size; +// DBGPRINT("tid %d, pack 3. 
memcpy( %p, %p, %lu ) => space %lu, _i %d\n", +// tid, _destination, _source, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size), _i ); +// // MEMCPY_CSUM( *(DESTINATION), _source, _end_loop->size, (CONVERTOR) ); +// #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) +// // memcpy(_destination, _source, _end_loop->size); +// _source_tmp = (double *)_source; +// _destination_tmp = (double *)_destination; +// for (_j = 0; _j < _end_loop->size/8; _j++) +// { +// *_destination_tmp = *_source_tmp; +// _destination_tmp ++; +// _source_tmp ++; +// } +// #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ +// } + + gap = (_loop->extent - _end_loop->size) / 8; + nb_elements = _end_loop->size / 8; + _src_disp_tmp = (double*)_src_disp; + _destination_tmp = (double*)_destination; + _destination_tmp += tid; + + __syncthreads(); + + for (_i = tid; _i < _copy_loops*nb_elements; _i+=num_threads) { + _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (_i % nb_elements == 0 ) { + DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + } + // if (_i / nb_elements ==1 && tid == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + // } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + *_destination_tmp = *_source_tmp; +#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ + _destination_tmp += num_threads; + + } + *(SOURCE) = _src_disp + _copy_loops*_loop->extent - _end_loop->first_elem_disp; + *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; + + __syncthreads(); +} + +__device__ void pack_predefined_data_cuda_kernel( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + uint32_t _copy_count = *(COUNT); + size_t _copy_blength; + ddt_elem_desc_t* _elem = &((ELEM)->elem); + unsigned char* _src_disp = (*SOURCE) + _elem->disp; + uint32_t _i, tid, num_threads; + unsigned char* _destination = *DESTINATION; + uint32_t gap, nb_elements; + double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; + + _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; + if( (_copy_count * _copy_blength) > *(SPACE) ) { + _copy_count = (uint32_t)(*(SPACE) / _copy_blength); + if( 0 == _copy_count ) return; /* nothing to do */ + } + + tid = threadIdx.x + blockIdx.x * blockDim.x; + num_threads = gridDim.x * blockDim.x; + + gap = (_elem->extent - _copy_blength) / 8; + nb_elements = _copy_blength / 8; + _src_disp_tmp = (double*)_src_disp; + _destination_tmp = (double*)_destination; + _source_tmp = _src_disp_tmp + tid; + _destination_tmp += tid; + + __syncthreads(); + + for (_i = tid; _i < _copy_count*nb_elements; _i+=num_threads) { + _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (_i == 0 ) { + DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + tid, _destination_tmp, _source_tmp, (unsigned long)_copy_blength*_copy_count, (unsigned long)(*(SPACE) - _i/nb_elements * _copy_blength), _i/nb_elements, _i ); + } + // if (_i / nb_elements ==1 && tid == 0 ) { + // DBGPRINT("tid %d, pack 3. 
memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + // } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + *_destination_tmp = *_source_tmp; +#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + _destination_tmp += num_threads; + + } + + _copy_blength *= _copy_count; + *(SOURCE) = _src_disp + _elem->extent*_copy_count - _elem->disp; + *(DESTINATION) += _copy_blength; + *(SPACE) -= _copy_blength; + *(COUNT) -= _copy_count; + + __syncthreads(); +} + +__global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) +{ + dt_stack_t *pStack, *pStack_head; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_packed = 0; /* total amount packed this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; + size_t iov_len_local; + uint32_t iov_count; + uint32_t stack_pos; + struct iovec* iov; + + OPAL_PTRDIFF_TYPE lb; + OPAL_PTRDIFF_TYPE ub; + uint32_t out_size; + uint32_t tid; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + + __shared__ ddt_cuda_desc_t cuda_desc_b; + + if (threadIdx.x == 0) { + memcpy(&cuda_desc_b, cuda_desc, sizeof(ddt_cuda_desc_t)); + } + __syncthreads(); + + // load cuda descriptor from constant memory + iov = cuda_desc_b.iov; + pStack_head = cuda_desc_b.pStack; + pStack = pStack_head; + description = cuda_desc_b.description; + stack_pos = cuda_desc_b.stack_pos; + pBaseBuf = cuda_desc_b.pBaseBuf; + lb = cuda_desc_b.lb; + ub = cuda_desc_b.ub; + out_size = cuda_desc_b.out_size; + + pStack = pStack + stack_pos; + pos_desc = pStack->index; + conv_ptr = pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + 
pStack--; + stack_pos--; + pElem = &(description[pos_desc]); + +// printf("pack start pos_desc %d count_desc %d disp %ld, stack_pos %d pos_desc %d count_desc %d disp %ld\n", +// pos_desc, count_desc, (long)(conv_ptr - pBaseBuf), stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp); + + for( iov_count = 0; iov_count < out_size; iov_count++ ) { + iov_ptr = (unsigned char *) iov[iov_count].iov_base; + iov_len_local = iov[iov_count].iov_len; + DBGPRINT("iov_len_local %lu, flags %d, types %d, count %d\n", iov_len_local, description->elem.common.flags, description->elem.common.type, description->elem.count); + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + // conv_ptr, iov_ptr, iov_len_local ); + pack_predefined_data_cuda_kernel(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + if( 0 == count_desc ) { /* completed */ + conv_ptr = pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + // DO_DEBUG( opal_output( 0, "pack end_loop count %d stack_pos %d" + // " pos_desc %d disp %ld space %lu\n", + // (int)pStack->count, pConvertor->stack_pos, + // pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + if (threadIdx.x == 0) { + (pStack->count)--; + } + __syncthreads(); + + if( (pStack->count) == 0 ) { /* end of loop */ + if( 0 == stack_pos ) { + /* we lie about the size of the next element in order to + * make sure we exit the main loop. 
+ */ + out_size = iov_count; + goto complete_loop; /* completed */ + } + stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if (threadIdx.x == 0) { + if( pStack->index == -1 ) { + pStack->disp += (ub - lb); + } else { + // assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + __syncthreads(); + } + conv_ptr = pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + // DO_DEBUG( opal_output( 0, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + // (int)pStack->count, pConvertor->stack_pos, pos_desc, + // count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + pack_contiguous_loop_cuda_kernel( pElem, &count_desc, + &conv_ptr, &iov_ptr, &iov_len_local ); + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. 
*/ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + + PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + + pos_desc++; + update_loop_description: /* update the current state */ + conv_ptr = pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); + continue; + } + } + complete_loop: + if (threadIdx.x == 0) { + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + } + __syncthreads(); + total_packed += iov[iov_count].iov_len; + } + + if (tid == 0) { + cuda_desc->max_data = total_packed; + cuda_desc->out_size = iov_count; + // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ + // if( cuda_desc->bConverted == cuda_desc->local_size ) { + // cuda_desc->stack_pos = stack_pos; + // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); + // return; + // } + // /* Save the global position for the next round */ + // PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, + // conv_ptr - pBaseBuf ); + // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); + // cuda_desc->stack_pos = stack_pos; + } + __syncthreads(); + + return; +} + +// __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) +// { +// dt_stack_t *pStack, *pStack_head; /* pointer to the position on the stack */ +// uint32_t pos_desc; /* actual position in the description of the derived datatype */ +// uint32_t count_desc; /* the number of items already done in the actual pos_desc */ +// size_t total_packed = 0; /* total amount packed this time */ +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; +// size_t iov_len_local; +// uint32_t iov_count; +// uint32_t stack_pos; +// struct iovec* iov; +// +// 
OPAL_PTRDIFF_TYPE lb; +// OPAL_PTRDIFF_TYPE ub; +// uint32_t out_size; +// uint32_t tid; +// +// tid = threadIdx.x + blockIdx.x * blockDim.x; +// +// __shared__ ddt_cuda_desc_t cuda_desc_b; +// +// if (threadIdx.x == 0) { +// memcpy(&cuda_desc_b, cuda_desc, sizeof(ddt_cuda_desc_t)); +// } +// __syncthreads(); +// +// +// // load cuda descriptor from constant memory +// iov = cuda_desc_b.iov; +// pStack_head = cuda_desc_b.pStack; +// pStack = pStack_head; +// description = cuda_desc_b.description; +// stack_pos = cuda_desc_b.stack_pos; +// pBaseBuf = cuda_desc_b.pBaseBuf; +// lb = cuda_desc_b.lb; +// ub = cuda_desc_b.ub; +// out_size = cuda_desc_b.out_size; +// +// pStack = pStack + stack_pos; +// pos_desc = pStack->index; +// conv_ptr = pBaseBuf + pStack->disp; +// count_desc = (uint32_t)pStack->count; +// pStack--; +// stack_pos--; +// pElem = &(description[pos_desc]); +// +// // printf("pack start pos_desc %d count_desc %d disp %ld, stack_pos %d pos_desc %d count_desc %d disp %ld\n", +// // pos_desc, count_desc, (long)(conv_ptr - pBaseBuf), stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp); +// +// if (threadIdx.x == 0) { +// for( iov_count = 0; iov_count < out_size; iov_count++ ) { +// iov_ptr = (unsigned char *) iov[iov_count].iov_base; +// iov_len_local = iov[iov_count].iov_len; +// DBGPRINT("iov_len_local %lu, flags %d, types %d, count %d\n", iov_len_local, description->elem.common.flags, description->elem.common.type, description->elem.count); +// while( 1 ) { +// while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { +// /* now here we have a basic datatype */ +// // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, +// // conv_ptr, iov_ptr, iov_len_local ); +// if( 0 == count_desc ) { /* completed */ +// conv_ptr = pBaseBuf + pStack->disp; +// pos_desc++; /* advance to the next data */ +// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +// continue; +// } +// goto complete_loop; +// } +// if( 
OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ +// // DO_DEBUG( opal_output( 0, "pack end_loop count %d stack_pos %d" +// // " pos_desc %d disp %ld space %lu\n", +// // (int)pStack->count, pConvertor->stack_pos, +// // pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); +// +// if( --(pStack->count) == 0 ) { /* end of loop */ +// if( 0 == stack_pos ) { +// /* we lie about the size of the next element in order to +// * make sure we exit the main loop. +// */ +// out_size = iov_count; +// goto complete_loop; /* completed */ +// } +// stack_pos--; +// pStack--; +// pos_desc++; +// } else { +// pos_desc = pStack->index + 1; +// if( pStack->index == -1 ) { +// pStack->disp += (ub - lb); +// } else { +// // assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); +// pStack->disp += description[pStack->index].loop.extent; +// } +// +// } +// conv_ptr = pBaseBuf + pStack->disp; +// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +// // DO_DEBUG( opal_output( 0, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", +// // (int)pStack->count, pConvertor->stack_pos, pos_desc, +// // count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); +// } +// if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { +// OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; +// if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { +// // pack_contiguous_loop_cuda_kernel( pElem, &count_desc, +// // &conv_ptr, &iov_ptr, &iov_len_local ); +// count_desc = 0; +// if( 0 == count_desc ) { /* completed */ +// pos_desc += pElem->loop.items + 1; +// goto update_loop_description; +// } +// /* Save the stack with the correct last_count value. 
*/ +// } +// local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; +// +// PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, +// pStack->disp + local_disp); +// +// pos_desc++; +// update_loop_description: /* update the current state */ +// conv_ptr = pBaseBuf + pStack->disp; +// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +// // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); +// continue; +// } +// } +// complete_loop: +// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ +// total_packed += iov[iov_count].iov_len; +// } +// +// } +// __syncthreads(); +// if (tid == 0) { +// cuda_desc->max_data = total_packed; +// cuda_desc->out_size = iov_count; +// // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ +// // if( cuda_desc->bConverted == cuda_desc->local_size ) { +// // cuda_desc->stack_pos = stack_pos; +// // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); +// // return; +// // } +// // /* Save the global position for the next round */ +// // PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, +// // conv_ptr - pBaseBuf ); +// // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); +// // cuda_desc->stack_pos = stack_pos; +// } +// return; +// } + +__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ) +{ + uint32_t _i, tid, num_threads; + uint32_t gap, nb_elements; + double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + num_threads = gridDim.x * blockDim.x; + + gap = (extent - size) / 8; + nb_elements = size / 8; + _src_disp_tmp = (double*)source; + _destination_tmp = (double*)destination; + _source_tmp = _src_disp_tmp + tid; + _destination_tmp += tid; + + for (_i 
= tid; _i < copy_loops*nb_elements; _i+=num_threads) { + _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (_i % nb_elements == 0 ) { + DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => _i %d, actual _i %d, count %d\n", + tid, _destination_tmp, _source_tmp, (unsigned long)size, _i/nb_elements, _i, copy_loops ); + } + // if (_i / nb_elements ==1 && tid == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + // } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + *_destination_tmp = *_source_tmp; +#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + _destination_tmp += num_threads; + } +} \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu new file mode 100644 index 00000000000..3b04bf025e8 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -0,0 +1,196 @@ +#include "opal_datatype_cuda_internal.cuh" +#include "opal_datatype_cuda.cuh" + +#include + +int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i; + dt_elem_desc_t* description; + const opal_datatype_t *pData = pConvertor->pDesc; + uint32_t tasks_per_block, num_blocks; + dt_stack_t* pStack; + + description = pConvertor->use_desc->desc; + + cuda_desc_h->stack_pos = pConvertor->stack_pos; +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + cuda_desc_h->pBaseBuf = pConvertor->pBaseBuf; +#else + cuda_desc_h->pBaseBuf = pBaseBuf_GPU; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + cuda_desc_h->lb = pData->lb; + cuda_desc_h->ub = pData->ub; + cuda_desc_h->out_size = *out_size; + cuda_desc_h->max_data = 
*max_data; + cuda_desc_h->bConverted = pConvertor->bConverted; + cuda_desc_h->local_size = pConvertor->local_size; + cuda_desc_h->stack_size = pConvertor->stack_size; + + for (i = 0; i < pConvertor->stack_size; i++) { + cuda_desc_h->pStack[i] = pConvertor->pStack[i]; + } + if (cuda_desc_h->description_max_count != 0) { + if (cuda_desc_h->description_max_count >= (pConvertor->use_desc->used+1)) { + cuda_desc_h->description_count = pConvertor->use_desc->used+1; + } else { + cudaFree(cuda_desc_h->description); + cuda_desc_h->description = NULL; + cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); + cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; + cuda_desc_h->description_count = pConvertor->use_desc->used+1; + } + + } else { + cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); + cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; + cuda_desc_h->description_count = pConvertor->use_desc->used+1; + } + cudaMemcpy(cuda_desc_h->description, description, sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1), cudaMemcpyHostToDevice); + + // for (i = 0; i < pConvertor->use_desc->used+1; i++) { + // cuda_desc_h->description[i] = description[i]; + // } + + DBGPRINT("stack_size %d\n", pConvertor->stack_size); + + DBGPRINT("flags %d, types %d, count %d\n", description->elem.common.flags, description->elem.common.type, description->elem.count); + + for (i = 0; i < *out_size; i++) { +#if defined (OPAL_DATATYPE_CUDA_DRY_RUN) + cuda_desc_h->iov[i].iov_base = iov[i].iov_base; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + cuda_desc_h->iov[i].iov_len = iov[i].iov_len; + } + + cudaMemcpy(cuda_desc_d, cuda_desc_h, sizeof(ddt_cuda_desc_t), cudaMemcpyHostToDevice); + + pStack = pConvertor->pStack + pConvertor->stack_pos; + tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + num_blocks = ((uint32_t)pStack->count + tasks_per_block - 1) / 
tasks_per_block; + printf("launch kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*2*THREAD_PER_BLOCK); + opal_generic_simple_pack_cuda_kernel<<<192,4*THREAD_PER_BLOCK>>>(cuda_desc_d); +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + size_t position = pConvertor->pDesc->size; + opal_convertor_set_position_nocheck(pConvertor, &position); +#endif + cudaDeviceSynchronize(); + +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + return -99; +#else + // /* copy stack and description data back to CPU */ + // cudaMemcpy(cuda_desc_h, cuda_desc_d, sizeof(ddt_cuda_desc_t), cudaMemcpyDeviceToHost); + // + // for (i = 0; i < pConvertor->stack_size; i++) { + // pConvertor->pStack[i] = cuda_desc_h->pStack[i]; + // } + // + // pConvertor->stack_pos = cuda_desc_h->stack_pos; + // *out_size = cuda_desc_h->out_size; + // *max_data = cuda_desc_h->max_data; + // pConvertor->bConverted = cuda_desc_h->bConverted; + // pConvertor->local_size = cuda_desc_h->local_size; + // + // for (i = 0; i < *out_size; i++) { + // iov[i].iov_len = cuda_desc_h->iov[i].iov_len; + // } + // + if( pConvertor->bConverted == pConvertor->local_size ) { + // pConvertor->flags |= CONVERTOR_COMPLETED; + return 1; + } + + return 0; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + +} + +void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _source = (*SOURCE) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _destination = *(DESTINATION); + + printf("I am in pack_contiguous_loop_cuda\n"); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + _source = pBaseBuf_GPU; + 
_destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; +#endif + + tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaDeviceSynchronize(); +} + + +void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + uint32_t _copy_count = *(COUNT); + size_t _copy_blength; + ddt_elem_desc_t* _elem = &((ELEM)->elem); + unsigned char* _source = (*SOURCE) + _elem->disp; + uint32_t num_blocks, tasks_per_block; + unsigned char* _destination = *(DESTINATION); + + _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; + if( (_copy_count * _copy_blength) > *(SPACE) ) { + _copy_count = (uint32_t)(*(SPACE) / _copy_blength); + if( 0 == _copy_count ) return; /* nothing to do */ + } + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + _source = pBaseBuf_GPU; + _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; +#endif + + tasks_per_block = THREAD_PER_BLOCK*4; + num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + + DBGPRINT("num_blocks %d, thread %d\n", num_blocks, tasks_per_block); + DBGPRINT( "GPU pack 1. 
memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); + + pack_contiguous_loop_cuda_kernel_global<<<1, THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + _copy_blength *= _copy_count; + *(SOURCE) = _source + _elem->extent*_copy_count - _elem->disp; + *(DESTINATION) += _copy_blength; + *(SPACE) -= _copy_blength; + *(COUNT) -= _copy_count; +#endif + + pBaseBuf_GPU += _elem->extent*_copy_count; + cuda_desc_h->iov[0].iov_base = (unsigned char*)cuda_desc_h->iov[0].iov_base + _copy_blength; + // cudaDeviceSynchronize(); +} + diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu new file mode 100644 index 00000000000..f59b2bb0e00 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -0,0 +1,288 @@ +#include "opal_datatype_cuda_internal.cuh" +#include +#include + +__device__ void unpack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _dst_disp = (*DESTINATION) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t _i, tid, num_threads; + unsigned char* _source = *SOURCE; +// unsigned char* _source = _src_disp; + uint32_t gap, nb_elements; + double *_source_tmp, *_destination_tmp, *_dst_disp_tmp;; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + num_threads = gridDim.x * blockDim.x; + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / 
_end_loop->size); + + gap = (_loop->extent - _end_loop->size) / 8; + nb_elements = _end_loop->size / 8; + _dst_disp_tmp = (double*)_dst_disp; + _source_tmp = (double*)_source; + _destination_tmp = _dst_disp_tmp + tid; + _source_tmp += tid; + + __syncthreads(); + for (_i = tid; _i < _copy_loops*nb_elements; _i+=num_threads) { + _destination_tmp = _dst_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (_i % nb_elements == 0 ) { + DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + } + // if (_i / nb_elements ==1 && tid == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + // } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + *_destination_tmp = *_source_tmp; +#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ + _source_tmp += num_threads; +// _source_tmp += num_threads; + + } + *(DESTINATION) = _dst_disp + _copy_loops*_loop->extent - _end_loop->first_elem_disp; + *(SOURCE) = *(SOURCE) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; + + __syncthreads(); +} + +__global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) +{ + dt_stack_t* pStack, *pStack_head; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_unpacked = 0; /* total size unpacked this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; + size_t iov_len_local; + uint32_t iov_count; + uint32_t stack_pos; + struct iovec* iov; + + OPAL_PTRDIFF_TYPE lb; + OPAL_PTRDIFF_TYPE ub; + uint32_t out_size; + uint32_t tid; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + + __shared__ ddt_cuda_desc_t cuda_desc_b; + + if (threadIdx.x == 0) { + memcpy(&cuda_desc_b, cuda_desc, sizeof(ddt_cuda_desc_t)); + } + __syncthreads(); + + // load cuda descriptor from constant memory + iov = cuda_desc_b.iov; + pStack_head = cuda_desc_b.pStack; + pStack = pStack_head; + description = cuda_desc_b.description; + stack_pos = cuda_desc_b.stack_pos; + pBaseBuf = cuda_desc_b.pBaseBuf; + lb = cuda_desc_b.lb; + ub = cuda_desc_b.ub; + out_size = cuda_desc_b.out_size; + + /* For the first step we have to add both displacement to the source. After in the + * main while loop we will set back the source_base to the correct value. 
This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ + pStack = pStack + stack_pos; + pos_desc = pStack->index; + conv_ptr = pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + pStack--; + stack_pos--; + pElem = &(description[pos_desc]); + + + for( iov_count = 0; iov_count < out_size; iov_count++ ) { + iov_ptr = (unsigned char *) iov[iov_count].iov_base; + iov_len_local = iov[iov_count].iov_len; + // if( 0 != pConvertor->partial_length ) { + // size_t element_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; + // size_t missing_length = element_length - pConvertor->partial_length; + // + // assert( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ); + // COMPUTE_CSUM( iov_ptr, missing_length, pConvertor ); + // opal_unpack_partial_datatype( pConvertor, pElem, + // iov_ptr, + // pConvertor->partial_length, element_length - pConvertor->partial_length, + // &conv_ptr ); + // --count_desc; + // if( 0 == count_desc ) { + // conv_ptr = pConvertor->pBaseBuf + pStack->disp; + // pos_desc++; /* advance to the next data */ + // UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + // } + // iov_ptr += missing_length; + // iov_len_local -= missing_length; + // pConvertor->partial_length = 0; /* nothing more inside */ + // } + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + // UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + // iov_ptr, conv_ptr, iov_len_local ); + if( 0 == count_desc ) { /* completed */ + conv_ptr = pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + // assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); + if( 0 != iov_len_local ) { + unsigned char* temp = conv_ptr; + /* We have some partial data here. 
Let's copy it into the convertor + * and keep it hot until the next round. + */ + // assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size ); + // COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor ); + // + // opal_unpack_partial_datatype( pConvertor, pElem, + // iov_ptr, 0, iov_len_local, + // &temp ); + // + // pConvertor->partial_length = (uint32_t)iov_len_local; + iov_len_local = 0; + } + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + // DO_DEBUG( opal_output( 0, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + // (int)pStack->count, pConvertor->stack_pos, pos_desc, + // (long)pStack->disp, (unsigned long)iov_len_local ); ); + if (threadIdx.x == 0) { + (pStack->count)--; + } + __syncthreads(); + + if( pStack->count == 0 ) { /* end of loop */ + if( 0 == stack_pos ) { + /* Do the same thing as when the loop is completed */ + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_unpacked += iov[iov_count].iov_len; + iov_count++; /* go to the next */ + goto complete_conversion; + } + stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if (threadIdx.x == 0) { + if( pStack->index == -1 ) { + pStack->disp += (ub - lb); + } else { + //assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + __syncthreads(); + } + conv_ptr = pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + // DO_DEBUG( opal_output( 0, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + // (int)pStack->count, pConvertor->stack_pos, pos_desc, + // (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & 
OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + unpack_contiguous_loop_cuda_kernel( pElem, &count_desc, + &iov_ptr, &conv_ptr, &iov_len_local ); + count_desc = 0; + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. */ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + conv_ptr = pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); + continue; + } + } + complete_loop: + if (threadIdx.x == 0) { + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + } + __syncthreads(); + total_unpacked += iov[iov_count].iov_len; + } + complete_conversion: + if (tid == 0) { + cuda_desc->max_data = total_unpacked; + // pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ + cuda_desc->out_size = iov_count; + // if( pConvertor->bConverted == pConvertor->remote_size ) { + // pConvertor->flags |= CONVERTOR_COMPLETED; + // return 1; + // } + // /* Save the global position for the next round */ + // PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_UINT1, count_desc, + // conv_ptr - pConvertor->pBaseBuf ); + // DO_DEBUG( opal_output( 0, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + // pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + } +} + +__global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ) +{ + uint32_t _i, tid, num_threads; + uint32_t gap, nb_elements; + double *_source_tmp, *_destination_tmp, 
*_dst_disp_tmp;; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + num_threads = gridDim.x * blockDim.x; + + gap = (extent - size) / 8; + nb_elements = size / 8; + _dst_disp_tmp = (double*)destination; + _source_tmp = (double*)source; + _destination_tmp = _dst_disp_tmp + tid; + _source_tmp += tid; + + for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { + _destination_tmp = _dst_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (_i % nb_elements == 0 ) { + DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => _i %d, actual _i %d\n", + tid, _destination_tmp, _source_tmp, (unsigned long)size, _i/nb_elements, _i ); + } + // if (_i / nb_elements ==1 && tid == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + // } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + *_destination_tmp = *_source_tmp; +#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ + _source_tmp += num_threads; + } +} \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu new file mode 100644 index 00000000000..7181f3cd362 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -0,0 +1,123 @@ +#include "opal_datatype_cuda_internal.cuh" +#include "opal_datatype_cuda.cuh" + +#include + +int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i; + dt_elem_desc_t* description; + const opal_datatype_t *pData = pConvertor->pDesc; + uint32_t tasks_per_block, num_blocks; + dt_stack_t* pStack; + + description = pConvertor->use_desc->desc; + + cuda_desc_h->stack_pos = pConvertor->stack_pos; +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + cuda_desc_h->pBaseBuf = pConvertor->pBaseBuf; +#else + cuda_desc_h->pBaseBuf = pBaseBuf_GPU; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + cuda_desc_h->lb = pData->lb; + cuda_desc_h->ub = pData->ub; + cuda_desc_h->out_size = *out_size; + cuda_desc_h->max_data = *max_data; + cuda_desc_h->bConverted = pConvertor->bConverted; + cuda_desc_h->local_size = pConvertor->local_size; + cuda_desc_h->stack_size = pConvertor->stack_size; + + for (i = 0; i < pConvertor->stack_size; i++) { + cuda_desc_h->pStack[i] = pConvertor->pStack[i]; + } + for (i = 0; i < pConvertor->use_desc->used+1; i++) { + cuda_desc_h->description[i] = description[i]; + } + + DBGPRINT("stack_size %d\n", pConvertor->stack_size); + + DBGPRINT("flags %d, types %d, count %d\n", description->elem.common.flags, description->elem.common.type, description->elem.count); + + for (i = 0; i < *out_size; i++) { +#if defined (OPAL_DATATYPE_CUDA_DRY_RUN) + cuda_desc_h->iov[i].iov_base = iov[i].iov_base; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + cuda_desc_h->iov[i].iov_len = iov[i].iov_len; + } + + cudaMemcpy(cuda_desc_d, 
cuda_desc_h, sizeof(ddt_cuda_desc_t), cudaMemcpyHostToDevice); + + pStack = pConvertor->pStack + pConvertor->stack_pos; + tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + num_blocks = ((uint32_t)pStack->count + tasks_per_block - 1) / tasks_per_block; + printf("launch kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*4*THREAD_PER_BLOCK); + opal_generic_simple_unpack_cuda_kernel<<<2*num_blocks,2*THREAD_PER_BLOCK>>>(cuda_desc_d); +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + size_t position = pConvertor->pDesc->size; + opal_convertor_set_position_nocheck(pConvertor, &position); +#endif + cudaDeviceSynchronize(); + +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + return -99; +#else + // /* copy stack and description data back to CPU */ + // cudaMemcpy(cuda_desc_h, cuda_desc_d, sizeof(ddt_cuda_desc_t), cudaMemcpyDeviceToHost); + // + // for (i = 0; i < pConvertor->stack_size; i++) { + // pConvertor->pStack[i] = cuda_desc_h->pStack[i]; + // } + // + // pConvertor->stack_pos = cuda_desc_h->stack_pos; + // *out_size = cuda_desc_h->out_size; + // *max_data = cuda_desc_h->max_data; + // pConvertor->bConverted = cuda_desc_h->bConverted; + // pConvertor->local_size = cuda_desc_h->local_size; + // + // for (i = 0; i < *out_size; i++) { + // iov[i].iov_len = cuda_desc_h->iov[i].iov_len; + // } + // + if( pConvertor->bConverted == pConvertor->local_size ) { + // pConvertor->flags |= CONVERTOR_COMPLETED; + return 1; + } + + return 0; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ +} + +void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _destination = (*DESTINATION) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* 
_source = *(SOURCE); + + printf("I am in unpack_contiguous_loop_cuda\n"); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + + _destination = pBaseBuf_GPU; + _source = (unsigned char*)cuda_desc_h->iov[0].iov_base; + + tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + + *(DESTINATION) = _destination - _end_loop->first_elem_disp; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; + + cudaDeviceSynchronize(); +} \ No newline at end of file diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c new file mode 100644 index 00000000000..e77a4f77325 --- /dev/null +++ b/opal/datatype/opal_datatype_gpu.c @@ -0,0 +1,167 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2014 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2006 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. + * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "opal_config.h" + +#include +#include + +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + +#if OPAL_ENABLE_DEBUG +#include "opal/util/output.h" + +#define DO_DEBUG(INST) if( opal_pack_debug ) { INST } +#else +#define DO_DEBUG(INST) +#endif /* OPAL_ENABLE_DEBUG */ + +#include "opal/datatype/opal_datatype_gpu.h" + +static void *opal_datatype_cuda_handle = NULL; + +void (*opal_datatype_cuda_init_p)(void) = NULL; + +void (*opal_datatype_cuda_fini_p)(void) = NULL; + +int32_t (*opal_generic_simple_pack_function_cuda_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) = NULL; + +int32_t (*opal_generic_simple_unpack_function_cuda_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) = NULL; + +void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) = NULL; + +void (*unpack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) = NULL; + +void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) = NULL; + +void (*opal_cuda_sync_device_p)(void) = NULL; + +int32_t opal_datatype_gpu_init(void) +{ + char *error; + char *lib = "/home/wwu12/ompi/ompi-cuda/opal/datatype/cuda/opal_datatype_cuda.so"; + + if (opal_datatype_cuda_handle == NULL) { + opal_datatype_cuda_handle = dlopen(lib, RTLD_LAZY); + if (!opal_datatype_cuda_handle) { + fprintf(stderr, "%s\n", dlerror()); + opal_datatype_cuda_handle = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_datatype_cuda_init_p) = dlsym(opal_datatype_cuda_handle, "opal_datatype_cuda_init"); + if ((error = dlerror()) != NULL) { + 
fprintf(stderr, "opal_datatype_cuda_init error: %s\n", error); + opal_datatype_cuda_init_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_datatype_cuda_fini_p) = dlsym(opal_datatype_cuda_handle, "opal_datatype_cuda_fini"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_datatype_cuda_fini error: %s\n", error); + opal_datatype_cuda_fini_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_generic_simple_pack_function_cuda_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_pack_function_cuda"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_generic_simple_pack_function_cuda error: %s\n", error); + opal_generic_simple_pack_function_cuda_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_generic_simple_unpack_function_cuda_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_unpack_function_cuda"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_generic_simple_unpack_function_cuda error: %s\n", error); + opal_generic_simple_unpack_function_cuda_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&pack_contiguous_loop_cuda_p) = dlsym(opal_datatype_cuda_handle, "pack_contiguous_loop_cuda"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "pack_contiguous_loop_cuda error: %s\n", error); + pack_contiguous_loop_cuda_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&unpack_contiguous_loop_cuda_p) = dlsym(opal_datatype_cuda_handle, "unpack_contiguous_loop_cuda"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "unpack_contiguous_loop_cuda error: %s\n", error); + unpack_contiguous_loop_cuda_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&pack_predefined_data_cuda_p) = dlsym(opal_datatype_cuda_handle, "pack_predefined_data_cuda"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "pack_predefined_data_cuda error: %s\n", error); + pack_predefined_data_cuda_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_cuda_sync_device_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_sync_device"); 
+ if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_cuda_sync_device error: %s\n", error); + opal_cuda_sync_device_p = NULL; + return OPAL_ERROR; + } + + (*opal_datatype_cuda_init_p)(); + printf("cuda init done\n"); + } + return OPAL_SUCCESS; +} +int32_t opal_datatype_gpu_fini(void) +{ + if (opal_datatype_cuda_handle != NULL) { + (*opal_datatype_cuda_fini_p)(); + dlclose(opal_datatype_cuda_handle); + opal_datatype_cuda_handle = NULL; + opal_datatype_cuda_init_p = NULL; + opal_datatype_cuda_fini_p = NULL; + opal_generic_simple_pack_function_cuda_p = NULL; + opal_generic_simple_unpack_function_cuda_p = NULL; + pack_contiguous_loop_cuda_p = NULL; + unpack_contiguous_loop_cuda_p = NULL; + pack_predefined_data_cuda_p = NULL; + opal_cuda_sync_device_p = NULL; + printf("cuda fini done\n"); + } + return OPAL_SUCCESS; +} diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h new file mode 100644 index 00000000000..385d7cdb73c --- /dev/null +++ b/opal/datatype/opal_datatype_gpu.h @@ -0,0 +1,40 @@ +#ifndef OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED +#define OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED + +int32_t opal_datatype_gpu_init(void); +int32_t opal_datatype_gpu_fini(void); + +extern void (*opal_datatype_cuda_init_p)(void); + +extern void (*opal_datatype_cuda_fini_p)(void); + +extern int32_t (*opal_generic_simple_pack_function_cuda_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +extern int32_t (*opal_generic_simple_unpack_function_cuda_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +extern void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +extern void (*unpack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +extern void 
(*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +extern void (*opal_cuda_sync_device_p)(void); +#endif /* OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 7de8fae5b08..520105d8de9 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -33,6 +33,7 @@ #include "opal/datatype/opal_datatype.h" #include "opal/datatype/opal_convertor_internal.h" #include "opal/mca/base/mca_base_var.h" +#include "opal/datatype/opal_datatype_gpu.h" /* by default the debuging is turned off */ int opal_datatype_dfd = -1; @@ -225,6 +226,12 @@ int32_t opal_datatype_init( void ) datatype->desc.desc[1].end_loop.first_elem_disp = datatype->desc.desc[0].elem.disp; datatype->desc.desc[1].end_loop.size = datatype->size; } + +#if defined (OPAL_DATATYPE_CUDA) + if (opal_datatype_gpu_init() != OPAL_SUCCESS) { + opal_datatype_gpu_fini(); + } +#endif /* defined OPAL_DATATYPE_CUDA */ return OPAL_SUCCESS; } @@ -248,6 +255,10 @@ int32_t opal_datatype_finalize( void ) /* clear all master convertors */ opal_convertor_destroy_masters(); +#if defined (OPAL_DATATYPE_CUDA) + opal_datatype_gpu_fini(); +#endif /* defined OPAL_DATATYPE_CUDA */ + return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 45f1213b811..9dc0666eb4e 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -37,6 +37,7 @@ #include "opal/datatype/opal_datatype_checksum.h" #include "opal/datatype/opal_datatype_pack.h" #include "opal/datatype/opal_datatype_prototypes.h" +#include "opal/datatype/opal_datatype_gpu.h" #if defined(CHECKSUM) #define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig_checksum @@ -287,6 +288,13 @@ opal_generic_simple_pack_function( opal_convertor_t* 
pConvertor, (void*)pConvertor, (void*)pConvertor->pBaseBuf, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); + if (opal_generic_simple_pack_function_cuda_p != NULL) { + int32_t rvalue = (*opal_generic_simple_pack_function_cuda_p)( pConvertor, iov, out_size, max_data); + if (rvalue != -99) { /* -99 is DRY RUN, to verify the result with CPU packing*/ + return rvalue; + } + } + description = pConvertor->use_desc->desc; /* For the first step we have to add both displacement to the source. After in the @@ -312,8 +320,9 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ - PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - conv_ptr, iov_ptr, iov_len_local ); + (*pack_predefined_data_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + // conv_ptr, iov_ptr, iov_len_local ); if( 0 == count_desc ) { /* completed */ conv_ptr = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ @@ -356,8 +365,9 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - PACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, - conv_ptr, iov_ptr, iov_len_local ); + (*pack_contiguous_loop_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + //PACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, + // conv_ptr, iov_ptr, iov_len_local ); if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -379,6 +389,7 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_packed += iov[iov_count].iov_len; } + 
(*opal_cuda_sync_device_p)(); *max_data = total_packed; pConvertor->bConverted += total_packed; /* update the already converted bytes */ *out_size = iov_count; diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h index c02ecf86ec5..b011f434472 100644 --- a/opal/datatype/opal_datatype_pack.h +++ b/opal/datatype/opal_datatype_pack.h @@ -51,6 +51,8 @@ static inline void pack_predefined_data( opal_convertor_t* CONVERTOR, (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu\n", *(DESTINATION), _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); ); + printf("pack 1. memcpy( %p, %p, %lu ) => space %lu\n", + *(DESTINATION), _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); MEMCPY_CSUM( *(DESTINATION), _source, _copy_blength, (CONVERTOR) ); _source += _copy_blength; *(DESTINATION) += _copy_blength; diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index e5c05e14e2d..f2c57593bcc 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -27,6 +27,7 @@ #include "opal/datatype/opal_convertor_internal.h" #include "opal/datatype/opal_datatype_internal.h" +#include "opal/datatype/opal_datatype_gpu.h" #if OPAL_ENABLE_DEBUG #include "opal/util/output.h" @@ -275,6 +276,13 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, DO_DEBUG( opal_output( 0, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u )\n", (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); +// if (opal_generic_simple_unpack_function_cuda_p != NULL) { +// int32_t rvalue = (*opal_generic_simple_unpack_function_cuda_p)( pConvertor, iov, out_size, max_data); +// if (rvalue != -99) { /* -99 is DRY RUN, to verify the result with CPU packing*/ +// return rvalue; +// } +// } + description = pConvertor->use_desc->desc; /* For the first step we have to add both displacement 
to the source. After in the @@ -379,8 +387,9 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - UNPACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, - iov_ptr, conv_ptr, iov_len_local ); + // UNPACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, + // iov_ptr, conv_ptr, iov_len_local ); + (*unpack_contiguous_loop_cuda_p)(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; diff --git a/opal/include/opal_config_top.h b/opal/include/opal_config_top.h index 1ce5267c389..2f5ad1adec2 100644 --- a/opal/include/opal_config_top.h +++ b/opal/include/opal_config_top.h @@ -19,6 +19,8 @@ #error "opal_config_top.h should only be included from opal_config.h" #endif +#define OPAL_DATATYPE_CUDA + /* The only purpose of this file is to undef the PACKAGE_ macros that are put in by autoconf/automake projects. Specifically, if you include a .h file from another project that defines these diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index 0afac9b49ec..12b4b31fc15 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -341,7 +341,7 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk int main( int argc, char* argv[] ) { ompi_datatype_t *pdt, *pdt1, *pdt2, *pdt3; - int rc, length = 500; + int rc, length = 500, i; opal_init_util(&argc, &argv); ompi_datatype_init(); @@ -350,7 +350,7 @@ int main( int argc, char* argv[] ) * By default simulate homogeneous architectures. 
*/ remote_arch = opal_local_arch; - printf( "\n\n#\n * TEST INVERSED VECTOR\n #\n\n" ); +/* printf( "\n\n#\n * TEST INVERSED VECTOR\n #\n\n" ); pdt = create_inversed_vector( &ompi_mpi_int.dt, 10 ); if( outputFlags & CHECK_PACK_UNPACK ) { local_copy_ddt_count(pdt, 100); @@ -364,15 +364,17 @@ int main( int argc, char* argv[] ) local_copy_with_convertor(pdt, 1, 956); } OBJ_RELEASE( pdt ); assert( pdt == NULL ); - +*/ printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); - pdt = upper_matrix(100); + pdt = upper_matrix(4000); if( outputFlags & CHECK_PACK_UNPACK ) { - local_copy_ddt_count(pdt, 1); - local_copy_with_convertor(pdt, 1, 48); + for (i = 1; i <= 4; i++) { +// local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor(pdt, 1, 1024*1024*200); + } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); - + /* mpich_typeub(); mpich_typeub2(); mpich_typeub3(); @@ -476,26 +478,104 @@ int main( int argc, char* argv[] ) local_copy_with_convertor( pdt, 4500, 12 ); local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 12 ); OBJ_RELEASE( pdt ); assert( pdt == NULL ); + }*/ + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (4000 times 512 double stride 640)\n" ); +#if 0 + pdt = create_vector_type( MPI_DOUBLE, 4000, 512, 640 ); + opal_datatype_resize(&pdt->super, 0, pdt->super.size+sizeof(MPI_DOUBLE)); + ompi_datatype_create_contiguous( 1, pdt, &pdt1 ); +#else + pdt = create_vector_type( MPI_DOUBLE, 4000, 512, 640 ); + // opal_datatype_resize(&pdt->super, 0, pdt->super.size+sizeof(MPI_DOUBLE)); + // ompi_datatype_create_contiguous( 4000, pdt, &pdt1 ); +#endif +// ompi_datatype_dump( pdt ); + // ompi_datatype_commit(&pdt1); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*30 ); + } } printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (4000 times 384 double stride 512)\n" ); + pdt = 
create_vector_type( MPI_DOUBLE, 4000, 384, 512 ); + opal_datatype_resize(&pdt->super, 0, pdt->super.size+sizeof(MPI_DOUBLE)); + ompi_datatype_create_contiguous( 1, pdt, &pdt1 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 ); + } + } printf( ">>--------------------------------------------<<\n" ); - printf( "Vector data-type (450 times 10 double stride 11)\n" ); - pdt = create_vector_type( MPI_DOUBLE, 450, 10, 11 ); - ompi_datatype_dump( pdt ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( "Vector data-type (4000 times 256 double stride 384)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 4000, 256, 384 ); +// ompi_datatype_dump( pdt ); if( outputFlags & CHECK_PACK_UNPACK ) { - local_copy_ddt_count(pdt, 1); - local_copy_with_convertor( pdt, 1, 12 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); - local_copy_with_convertor( pdt, 1, 82 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); - local_copy_with_convertor( pdt, 1, 6000 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); - local_copy_with_convertor( pdt, 1, 36000 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 36000 ); + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // 
local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*10 ); + } } printf( ">>--------------------------------------------<<\n" ); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - + + printf( "Vector data-type (4000 times 128 double stride 256)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 4000, 128, 256 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*5 ); + } + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( "Vector data-type (2000 times 3 double stride 4)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 2000, 3, 4 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*4 ); + } + } + printf( ">>--------------------------------------------<<\n" ); + 
OBJ_RELEASE( pdt ); assert( pdt == NULL ); + /* printf( ">>--------------------------------------------<<\n" ); pdt = test_struct_char_double(); if( outputFlags & CHECK_PACK_UNPACK ) { @@ -541,7 +621,7 @@ int main( int argc, char* argv[] ) printf( ">>--------------------------------------------<<\n" ); OBJ_RELEASE( pdt1 ); assert( pdt1 == NULL ); OBJ_RELEASE( pdt2 ); assert( pdt2 == NULL ); - +*/ /* clean-ups all data allocations */ ompi_datatype_finalize(); From 3f3ee943174bc8579e68b01529d701218702c81d Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 14 Nov 2014 14:03:34 -0500 Subject: [PATCH 096/190] indexed datatype new, bonus stask support. Add support for iovec and for pipeline iovec. a new way to compute nb_block and thread_per_block Conflicts: test/datatype/Makefile.am --- opal/datatype/cuda/Makefile | 2 +- opal/datatype/cuda/opal_config.h | 2792 +++++++++++++++++ opal/datatype/cuda/opal_datatype_cuda.cu | 117 +- opal/datatype/cuda/opal_datatype_cuda.cuh | 10 + .../cuda/opal_datatype_cuda_internal.cuh | 383 +-- .../cuda/opal_datatype_orig_internal.h | 646 ++++ .../cuda/opal_datatype_pack_cuda_kernel.cu | 518 +-- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 437 ++- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 67 +- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 252 +- opal/datatype/opal_convertor.c | 23 +- opal/datatype/opal_datatype_cuda.c | 1 + opal/datatype/opal_datatype_gpu.c | 26 + opal/datatype/opal_datatype_gpu.h | 20 +- opal/datatype/opal_datatype_module.c | 6 - opal/datatype/opal_datatype_pack.c | 44 +- opal/datatype/opal_datatype_pack.h | 2 - opal/datatype/opal_datatype_prototypes.h | 16 + opal/datatype/opal_datatype_unpack.c | 39 +- test/datatype/Makefile.am | 13 +- test/datatype/ddt_lib.c | 33 +- test/datatype/ddt_lib.h | 7 +- test/datatype/ddt_test.c | 477 ++- 23 files changed, 5310 insertions(+), 621 deletions(-) create mode 100644 opal/datatype/cuda/opal_config.h create mode 100644 opal/datatype/cuda/opal_datatype_orig_internal.h diff 
--git a/opal/datatype/cuda/Makefile b/opal/datatype/cuda/Makefile index d42ab556fae..6be10afd0fd 100644 --- a/opal/datatype/cuda/Makefile +++ b/opal/datatype/cuda/Makefile @@ -5,7 +5,7 @@ ARCHFLAGS = cr RANLIB = ranlib STLIB ?= opal_datatype_cuda.a DYLIB ?= opal_datatype_cuda.so -CFLAGS = -g -G -O0 +CFLAGS = -g -G -O0 EXTLIB = -L/home/wwu12/ompi/ompi-cuda/opal/datatype/.libs -ldatatype INC = diff --git a/opal/datatype/cuda/opal_config.h b/opal/datatype/cuda/opal_config.h new file mode 100644 index 00000000000..19fa55f52ed --- /dev/null +++ b/opal/datatype/cuda/opal_config.h @@ -0,0 +1,2792 @@ +/* opal/include/opal_config.h. Generated from opal_config.h.in by configure. */ +/* opal/include/opal_config.h.in. Generated from configure.ac by autoheader. */ + +/* -*- c -*- + * + * Copyright (c) 2004-2005 The Trustees of Indiana University. + * All rights reserved. + * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. + * All rights reserved. + * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2005 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2014 Intel, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + * + * Function: - OS, CPU and compiler dependent configuration + */ + +#ifndef OPAL_CONFIG_H +#define OPAL_CONFIG_H + +/* Define if building universal (internal helper macro) */ +/* #undef AC_APPLE_UNIVERSAL_BUILD */ + +/* enable openib BTL failover */ +#define BTL_OPENIB_FAILOVER_ENABLED 0 + +/* Whether the openib BTL malloc hooks are enabled */ +#define BTL_OPENIB_MALLOC_HOOKS_ENABLED 1 + +/* rdmacm without IB_AF addressing support */ +/* #undef BTL_OPENIB_RDMACM_IB_ADDR */ + +/* BLCR cr_request_file check */ +/* #undef CRS_BLCR_HAVE_CR_REQUEST */ + +/* BLCR cr_request_checkpoint check */ +/* #undef CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT */ + +/* BLCRs cr_checkpoint_info.requester member availability */ +/* #undef CRS_BLCR_HAVE_INFO_REQUESTER */ + +/* Version of event */ +/* #undef EVENT_EXTERNAL_EVENT_VERSION */ + +/* Define to 1 if you have the header file. */ +#define HAVE_AIO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ALLOCA_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_ALPS_APINFO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_ARPA_INET_H 1 + +/* Define to 1 if you have the `asprintf' function. */ +#define HAVE_ASPRINTF 1 + +/* Define to 1 if the system has the type `CACHE_DESCRIPTOR'. */ +/* #undef HAVE_CACHE_DESCRIPTOR */ + +/* Define to 1 if the system has the type `CACHE_RELATIONSHIP'. */ +/* #undef HAVE_CACHE_RELATIONSHIP */ + +/* Define to 1 if you have the `clz' function. */ +/* #undef HAVE_CLZ */ + +/* Define to 1 if you have the `clzl' function. */ +/* #undef HAVE_CLZL */ + +/* Define to 1 if you have the header file. */ +#define HAVE_CL_CL_EXT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_COMPLEX_H 1 + +/* Define to 1 if you have the `cpuset_setaffinity' function. */ +/* #undef HAVE_CPUSET_SETAFFINITY */ + +/* Define to 1 if you have the `cpuset_setid' function. 
*/ +/* #undef HAVE_CPUSET_SETID */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CRIU_CRIU_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CRT_EXTERNS_H */ + +/* Define to 1 if we have -lcuda */ +/* #undef HAVE_CUDA */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CUDA_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CUDA_RUNTIME_API_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_CURL_CURL_H */ + +/* Define to 1 if you have the `dbm_open' function. */ +/* #undef HAVE_DBM_OPEN */ + +/* Define to 1 if you have the `dbopen' function. */ +/* #undef HAVE_DBOPEN */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_DB_H */ + +/* Define to 1 if you have the declaration of `AF_INET6', and to 0 if you + don't. */ +#define HAVE_DECL_AF_INET6 1 + +/* Define to 1 if you have the declaration of `AF_UNSPEC', and to 0 if you + don't. */ +#define HAVE_DECL_AF_UNSPEC 1 + +/* Define to 1 if you have the declaration of `CL_DEVICE_TOPOLOGY_AMD', and to + 0 if you don't. */ +#define HAVE_DECL_CL_DEVICE_TOPOLOGY_AMD 0 + +/* Define to 1 if you have the declaration of `CTL_HW', and to 0 if you don't. + */ +#define HAVE_DECL_CTL_HW 0 + +/* Define to 1 if you have the declaration of `fabsf', and to 0 if you don't. + */ +#define HAVE_DECL_FABSF 1 + +/* Define to 1 if you have the declaration of `HW_NCPU', and to 0 if you + don't. */ +#define HAVE_DECL_HW_NCPU 0 + +/* Define to 1 if you have the declaration of `HZ', and to 0 if you don't. */ +#define HAVE_DECL_HZ 1 + +/* Define to 1 if you have the declaration of `IBV_ACCESS_ALLOCATE_MR', and to + 0 if you don't. */ +/* #undef HAVE_DECL_IBV_ACCESS_ALLOCATE_MR */ + +/* Define to 1 if you have the declaration of + `IBV_ACCESS_SHARED_MR_USER_READ', and to 0 if you don't. */ +/* #undef HAVE_DECL_IBV_ACCESS_SHARED_MR_USER_READ */ + +/* Define to 1 if you have the declaration of `IBV_ACCESS_SO', and to 0 if you + don't. 
*/ +/* #undef HAVE_DECL_IBV_ACCESS_SO */ + +/* Define to 1 if you have the declaration of `IBV_EVENT_CLIENT_REREGISTER', + and to 0 if you don't. */ +/* #undef HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER */ + +/* Define to 1 if you have the declaration of `IBV_EVENT_GID_CHANGE', and to 0 + if you don't. */ +/* #undef HAVE_DECL_IBV_EVENT_GID_CHANGE */ + +/* Define to 1 if you have the declaration of `ibv_event_type_str', and to 0 + if you don't. */ +/* #undef HAVE_DECL_IBV_EVENT_TYPE_STR */ + +/* Define to 1 if you have the declaration of `IBV_EXP_ACCESS_ALLOCATE_MR', + and to 0 if you don't. */ +/* #undef HAVE_DECL_IBV_EXP_ACCESS_ALLOCATE_MR */ + +/* Define to 1 if you have the declaration of + `IBV_EXP_ACCESS_SHARED_MR_USER_READ', and to 0 if you don't. */ +/* #undef HAVE_DECL_IBV_EXP_ACCESS_SHARED_MR_USER_READ */ + +/* Define to 1 if you have the declaration of `IBV_LINK_LAYER_ETHERNET', and + to 0 if you don't. */ +/* #undef HAVE_DECL_IBV_LINK_LAYER_ETHERNET */ + +/* Define to 1 if you have the declaration of `IBV_NODE_USNIC', and to 0 if + you don't. */ +/* #undef HAVE_DECL_IBV_NODE_USNIC */ + +/* Define to 1 if you have the declaration of `IBV_TRANSPORT_USNIC', and to 0 + if you don't. */ +/* #undef HAVE_DECL_IBV_TRANSPORT_USNIC */ + +/* Define to 1 if you have the declaration of `IBV_TRANSPORT_USNIC_UDP', and + to 0 if you don't. */ +/* #undef HAVE_DECL_IBV_TRANSPORT_USNIC_UDP */ + +/* Define to 1 if you have the declaration of + `nvmlDeviceGetMaxPcieLinkGeneration', and to 0 if you don't. */ +/* #undef HAVE_DECL_NVMLDEVICEGETMAXPCIELINKGENERATION */ + +/* Define to 1 if you have the declaration of `PCI_LOOKUP_NO_NUMBERS', and to + 0 if you don't. */ +/* #undef HAVE_DECL_PCI_LOOKUP_NO_NUMBERS */ + +/* Define to 1 if you have the declaration of `PF_INET6', and to 0 if you + don't. */ +#define HAVE_DECL_PF_INET6 1 + +/* Define to 1 if you have the declaration of `PF_UNSPEC', and to 0 if you + don't. 
*/ +#define HAVE_DECL_PF_UNSPEC 1 + +/* Define to 1 if you have the declaration of `pthread_getaffinity_np', and to + 0 if you don't. */ +#define HAVE_DECL_PTHREAD_GETAFFINITY_NP 1 + +/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to + 0 if you don't. */ +#define HAVE_DECL_PTHREAD_SETAFFINITY_NP 1 + +/* Define to 1 if you have the declaration of `RLIMIT_AS', and to 0 if you + don't. */ +#define HAVE_DECL_RLIMIT_AS 1 + +/* Define to 1 if you have the declaration of `RLIMIT_CORE', and to 0 if you + don't. */ +#define HAVE_DECL_RLIMIT_CORE 1 + +/* Define to 1 if you have the declaration of `RLIMIT_FSIZE', and to 0 if you + don't. */ +#define HAVE_DECL_RLIMIT_FSIZE 1 + +/* Define to 1 if you have the declaration of `RLIMIT_MEMLOCK', and to 0 if + you don't. */ +#define HAVE_DECL_RLIMIT_MEMLOCK 1 + +/* Define to 1 if you have the declaration of `RLIMIT_NOFILE', and to 0 if you + don't. */ +#define HAVE_DECL_RLIMIT_NOFILE 1 + +/* Define to 1 if you have the declaration of `RLIMIT_NPROC', and to 0 if you + don't. */ +#define HAVE_DECL_RLIMIT_NPROC 1 + +/* Define to 1 if you have the declaration of `RLIMIT_STACK', and to 0 if you + don't. */ +#define HAVE_DECL_RLIMIT_STACK 1 + +/* Define to 1 if you have the declaration of `sbrk', and to 0 if you don't. + */ +#define HAVE_DECL_SBRK 1 + +/* Define to 1 if you have the declaration of `strtoull', and to 0 if you + don't. */ +#define HAVE_DECL_STRTOULL 1 + +/* Define to 1 if you have the declaration of `_SC_LARGE_PAGESIZE', and to 0 + if you don't. */ +#define HAVE_DECL__SC_LARGE_PAGESIZE 0 + +/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_CONF', and to 0 + if you don't. */ +#define HAVE_DECL__SC_NPROCESSORS_CONF 1 + +/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_ONLN', and to 0 + if you don't. */ +#define HAVE_DECL__SC_NPROCESSORS_ONLN 1 + +/* Define to 1 if you have the declaration of `_SC_NPROC_CONF', and to 0 if + you don't. 
*/ +#define HAVE_DECL__SC_NPROC_CONF 0 + +/* Define to 1 if you have the declaration of `_SC_NPROC_ONLN', and to 0 if + you don't. */ +#define HAVE_DECL__SC_NPROC_ONLN 0 + +/* Define to 1 if you have the declaration of `_SC_PAGESIZE', and to 0 if you + don't. */ +#define HAVE_DECL__SC_PAGESIZE 1 + +/* Define to 1 if you have the declaration of `_SC_PAGE_SIZE', and to 0 if you + don't. */ +#define HAVE_DECL__SC_PAGE_SIZE 1 + +/* Define to 1 if you have the declaration of `__func__', and to 0 if you + don't. */ +#define HAVE_DECL___FUNC__ 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DIRENT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_DLFCN_H 1 + +/* Define to 1 if you have the `dlsym' function. */ +#define HAVE_DLSYM 1 + +/* Define to 1 if the system has the type `double _Complex'. */ +#define HAVE_DOUBLE__COMPLEX 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ERR_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_EVENT_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_EXECINFO_H 1 + +/* Define to 1 if you have the `execve' function. */ +#define HAVE_EXECVE 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_FCA_API_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_FCNTL_H 1 + +/* Define to 1 if you have the `ffs' function. */ +#define HAVE_FFS 1 + +/* Define to 1 if you have the `ffsl' function. */ +#define HAVE_FFSL 1 + +/* Define to 1 if the system has the type `float _Complex'. */ +#define HAVE_FLOAT__COMPLEX 1 + +/* Define to 1 if you have the `fls' function. */ +/* #undef HAVE_FLS */ + +/* Define to 1 if you have the `flsl' function. */ +/* #undef HAVE_FLSL */ + +/* Define to 1 if you have the `fork' function. */ +#define HAVE_FORK 1 + +/* Define to 1 if you have the `getpagesize' function. */ +#define HAVE_GETPAGESIZE 1 + +/* Define to 1 if you have the `getpwuid' function. 
*/ +#define HAVE_GETPWUID 1 + +/* Define to 1 if you have the `GNI_GetJobResInfo' function. */ +/* #undef HAVE_GNI_GETJOBRESINFO */ + +/* Define to 1 if the system has the type `GROUP_AFFINITY'. */ +/* #undef HAVE_GROUP_AFFINITY */ + +/* Define to 1 if the system has the type `GROUP_RELATIONSHIP'. */ +/* #undef HAVE_GROUP_RELATIONSHIP */ + +/* Define to 1 if you have the header file. */ +#define HAVE_GRP_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_HCOLL_API_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_HOSTLIB_H */ + +/* Define to 1 if you have the `host_info' function. */ +/* #undef HAVE_HOST_INFO */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_HWLOC_H */ + +/* Define to 1 if you have the `ibv_create_xrc_rcv_qp' function. */ +/* #undef HAVE_IBV_CREATE_XRC_RCV_QP */ + +/* Define to 1 if you have the `ibv_fork_init' function. */ +/* #undef HAVE_IBV_FORK_INIT */ + +/* Define to 1 if you have the `ibv_get_device_list' function. */ +/* #undef HAVE_IBV_GET_DEVICE_LIST */ + +/* Define to 1 if you have the `ibv_resize_cq' function. */ +/* #undef HAVE_IBV_RESIZE_CQ */ + +/* Define to 1 if you have the header file. */ +#define HAVE_IFADDRS_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_INFINIBAND_DRIVER_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_INFINIBAND_VERBS_H */ + +/* Define to 1 if the system has the type `int128_t'. */ +/* #undef HAVE_INT128_T */ + +/* Define to 1 if the system has the type `int16_t'. */ +#define HAVE_INT16_T 1 + +/* Define to 1 if the system has the type `int32_t'. */ +#define HAVE_INT32_T 1 + +/* Define to 1 if the system has the type `int64_t'. */ +#define HAVE_INT64_T 1 + +/* Define to 1 if the system has the type `int8_t'. */ +#define HAVE_INT8_T 1 + +/* Define to 1 if the system has the type `intptr_t'. */ +#define HAVE_INTPTR_T 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_INTTYPES_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_IOLIB_H */ + +/* Define to 1 if you have the `isatty' function. */ +#define HAVE_ISATTY 1 + +/* Define to 1 if the system has the type `KAFFINITY'. */ +/* #undef HAVE_KAFFINITY */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_KNEM_IO_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_KSTAT_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LIBCR_H */ + +/* Define to 1 if you have the `event' library (-levent). */ +/* #undef HAVE_LIBEVENT */ + +/* Define to 1 if you have the `event_pthreads' library (-levent_pthreads). */ +/* #undef HAVE_LIBEVENT_PTHREADS */ + +/* Define to 1 if we have -lgdi32 */ +/* #undef HAVE_LIBGDI32 */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LIBGEN_H 1 + +/* Define to 1 if we have -lkstat */ +/* #undef HAVE_LIBKSTAT */ + +/* Define to 1 if we have -llgrp */ +/* #undef HAVE_LIBLGRP */ + +/* Define to 1 if you have the `pci' library (-lpci). */ +/* #undef HAVE_LIBPCI */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LIBUTIL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_LIMITS_H 1 + +/* Define to 1 if the system has the type `LOGICAL_PROCESSOR_RELATIONSHIP'. */ +/* #undef HAVE_LOGICAL_PROCESSOR_RELATIONSHIP */ + +/* Define to 1 if the system has the type `long double'. */ +#define HAVE_LONG_DOUBLE 1 + +/* Define to 1 if the system has the type `long double _Complex'. */ +#define HAVE_LONG_DOUBLE__COMPLEX 1 + +/* Define to 1 if the system has the type `long long'. */ +#define HAVE_LONG_LONG 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LSF_LSBATCH_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LSF_LSF_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_LTDL_H */ + +/* Define to 1 if you have the header file. 
*/ +/* #undef HAVE_LUSTRE_LIBLUSTREAPI_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACH_MACH_HOST_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACH_MACH_INIT_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MACH_MACH_TIME_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_MALLOC_H 1 + +/* Define to 1 if you have the `memalign' function. */ +#define HAVE_MEMALIGN 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_MEMORY_H 1 + +/* Define to 1 if you have the `mkfifo' function. */ +#define HAVE_MKFIFO 1 + +/* Define to 1 if you have the `mmap' function. */ +#define HAVE_MMAP 1 + +/* Define to 1 if the system has the type `mode_t'. */ +#define HAVE_MODE_T 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MTCP_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MXM_API_MXM_API_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_NDBM_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_NETDB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_IN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_NETINET_TCP_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_NETLINK_NETLINK_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_NET_IF_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_NET_UIO_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_NUMAIF_H */ + +/* Define to 1 if the system has the type `NUMA_NODE_RELATIONSHIP'. */ +/* #undef HAVE_NUMA_NODE_RELATIONSHIP */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_NVCTRL_NVCTRL_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_NVML_H */ + +/* Define to 1 if you have the `on_exit' function. */ +#define HAVE_ON_EXIT 1 + +/* Define to 1 if you have the `openat' function. 
*/ +#define HAVE_OPENAT 1 + +/* Define to 1 if you have the `openpty' function. */ +#define HAVE_OPENPTY 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PCI_PCI_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PICL_H */ + +/* Define to 1 if you have the `pipe' function. */ +#define HAVE_PIPE 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PLFS_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PMAPI_H */ + +/* Define to 1 if you have the `pm_cycles' function. */ +/* #undef HAVE_PM_CYCLES */ + +/* Define to 1 if you have the header file. */ +#define HAVE_POLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PORTALS4_H */ + +/* Define to 1 if you have the `posix_memalign' function. */ +#define HAVE_POSIX_MEMALIGN 1 + +/* Define to 1 if you have the `printstack' function. */ +/* #undef HAVE_PRINTSTACK */ + +/* Define to 1 if the system has the type `PROCESSOR_CACHE_TYPE'. */ +/* #undef HAVE_PROCESSOR_CACHE_TYPE */ + +/* Define to 1 if the system has the type `PROCESSOR_GROUP_INFO'. */ +/* #undef HAVE_PROCESSOR_GROUP_INFO */ + +/* Define to 1 if the system has the type `PROCESSOR_RELATIONSHIP'. */ +/* #undef HAVE_PROCESSOR_RELATIONSHIP */ + +/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_BLOCK'. */ +/* #undef HAVE_PSAPI_WORKING_SET_EX_BLOCK */ + +/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_INFORMATION'. + */ +/* #undef HAVE_PSAPI_WORKING_SET_EX_INFORMATION */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PSM_H */ + +/* Define to 1 if you have the `pthread_condattr_setpshared' function. */ +#define HAVE_PTHREAD_CONDATTR_SETPSHARED 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PTHREAD_H 1 + +/* Define to 1 if you have the `pthread_mutexattr_setpshared' function. */ +#define HAVE_PTHREAD_MUTEXATTR_SETPSHARED 1 + +/* Define to 1 if you have the header file. 
*/ +/* #undef HAVE_PTHREAD_NP_H */ + +/* Define to 1 if the system has the type `pthread_t'. */ +#define HAVE_PTHREAD_T 1 + +/* Define to 1 if the system has the type `ptrdiff_t'. */ +#define HAVE_PTRDIFF_T 1 + +/* Define to 1 if you have the `ptsname' function. */ +#define HAVE_PTSNAME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_PTY_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_PVFS2_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_PWD_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_RDMA_RDMA_CMA_H */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_RDMA_RSOCKET_H */ + +/* Define to 1 if you have the `regcmp' function. */ +/* #undef HAVE_REGCMP */ + +/* Define to 1 if you have the `regexec' function. */ +#define HAVE_REGEXEC 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_REGEX_H 1 + +/* Define to 1 if you have the `regfree' function. */ +#define HAVE_REGFREE 1 + +/* Define to 1 if the system has the type `RelationProcessorPackage'. */ +/* #undef HAVE_RELATIONPROCESSORPACKAGE */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SCHED_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SCIF_H 1 + +/* Define to 1 if you have the `setenv' function. */ +#define HAVE_SETENV 1 + +/* Define to 1 if you have the `setlocale' function. */ +#define HAVE_SETLOCALE 1 + +/* Define to 1 if you have the `setpgid' function. */ +#define HAVE_SETPGID 1 + +/* Define to 1 if you have the `setsid' function. */ +#define HAVE_SETSID 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SHLWAPI_H */ + +/* Define to 1 if `si_band' is a member of `siginfo_t'. */ +#define HAVE_SIGINFO_T_SI_BAND 1 + +/* Define to 1 if `si_fd' is a member of `siginfo_t'. */ +#define HAVE_SIGINFO_T_SI_FD 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SIGNAL_H 1 + +/* Define to 1 if you have the `snprintf' function. 
*/ +#define HAVE_SNPRINTF 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SN_XPMEM_H */ + +/* Define to 1 if you have the `socketpair' function. */ +#define HAVE_SOCKETPAIR 1 + +/* Define to 1 if the system has the type `socklen_t'. */ +#define HAVE_SOCKLEN_T 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SOCKLIB_H */ + +/* Define to 1 if the system has the type `ssize_t'. */ +#define HAVE_SSIZE_T 1 + +/* Define to 1 if you have the `statfs' function. */ +#define HAVE_STATFS 1 + +/* Define to 1 if you have the `statvfs' function. */ +#define HAVE_STATVFS 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDARG_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDBOOL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDDEF_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STDLIB_H 1 + +/* Define to 1 if you have the `strftime' function. */ +#define HAVE_STRFTIME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRINGS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_STRING_H 1 + +/* Define to 1 if you have the `strncasecmp' function. */ +#define HAVE_STRNCASECMP 1 + +/* Define to 1 if you have the `strncpy_s' function. */ +/* #undef HAVE_STRNCPY_S */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_STROPTS_H */ + +/* Define to 1 if you have the `strsignal' function. */ +#define HAVE_STRSIGNAL 1 + +/* Define to 1 if `d_type' is a member of `struct dirent'. */ +#define HAVE_STRUCT_DIRENT_D_TYPE 1 + +/* Define to 1 if `transport_type' is a member of `struct ibv_device'. */ +/* #undef HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE */ + +/* Define to 1 if `ifr_hwaddr' is a member of `struct ifreq'. */ +#define HAVE_STRUCT_IFREQ_IFR_HWADDR 1 + +/* Define to 1 if `ifr_mtu' is a member of `struct ifreq'. 
*/ +#define HAVE_STRUCT_IFREQ_IFR_MTU 1 + +/* Define to 1 if the system has the type `struct sockaddr_in'. */ +#define HAVE_STRUCT_SOCKADDR_IN 1 + +/* Define to 1 if the system has the type `struct sockaddr_in6'. */ +#define HAVE_STRUCT_SOCKADDR_IN6 1 + +/* Define to 1 if `sa_len' is a member of `struct sockaddr'. */ +/* #undef HAVE_STRUCT_SOCKADDR_SA_LEN */ + +/* Define to 1 if the system has the type `struct sockaddr_storage'. */ +#define HAVE_STRUCT_SOCKADDR_STORAGE 1 + +/* Define to 1 if the system has the type `struct sockaddr_un'. */ +#define HAVE_STRUCT_SOCKADDR_UN 1 + +/* Define to 1 if `f_fstypename' is a member of `struct statfs'. */ +/* #undef HAVE_STRUCT_STATFS_F_FSTYPENAME */ + +/* Define to 1 if `f_type' is a member of `struct statfs'. */ +#define HAVE_STRUCT_STATFS_F_TYPE 1 + +/* Define to 1 if `f_basetype' is a member of `struct statvfs'. */ +/* #undef HAVE_STRUCT_STATVFS_F_BASETYPE */ + +/* Define to 1 if `f_fstypename' is a member of `struct statvfs'. */ +/* #undef HAVE_STRUCT_STATVFS_F_FSTYPENAME */ + +/* Define to 1 if you have the `syscall' function. */ +#define HAVE_SYSCALL 1 + +/* Define to 1 if you have the `sysconf' function. */ +#define HAVE_SYSCONF 1 + +/* Define to '1' if sysctl is present and usable */ +#define HAVE_SYSCTL 1 + +/* Define to '1' if sysctlbyname is present and usable */ +/* #undef HAVE_SYSCTLBYNAME */ + +/* Define to 1 if you have the `syslog' function. */ +#define HAVE_SYSLOG 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYSLOG_H 1 + +/* Define to 1 if the system has the type + `SYSTEM_LOGICAL_PROCESSOR_INFORMATION'. */ +/* #undef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION */ + +/* Define to 1 if the system has the type + `SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX'. */ +/* #undef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_CPUSET_H */ + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_FCNTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_IOCTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_IPC_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_LGRP_USER_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_MMAN_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_MOUNT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_PARAM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_POLL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_PRCTL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_QUEUE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_RESOURCE_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SELECT_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SHM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SOCKET_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SOCKIO_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STATFS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STATVFS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_STAT_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_SYNCH_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_SYSCTL_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TIME_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_SYS_TREE_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UIO_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_UN_H 1 + +/* Define to 1 if you have the header file. 
*/ +#define HAVE_SYS_UTSNAME_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_VFS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_SYS_WAIT_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_TARGETCONDITIONALS_H */ + +/* Define to 1 if you have the `tcgetpgrp' function. */ +#define HAVE_TCGETPGRP 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_TERMIOS_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_TIME_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_TM_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_UCONTEXT_H 1 + +/* Define to 1 if the system has the type `uint128_t'. */ +/* #undef HAVE_UINT128_T */ + +/* Define to 1 if the system has the type `uint16_t'. */ +#define HAVE_UINT16_T 1 + +/* Define to 1 if the system has the type `uint32_t'. */ +#define HAVE_UINT32_T 1 + +/* Define to 1 if the system has the type `uint64_t'. */ +#define HAVE_UINT64_T 1 + +/* Define to 1 if the system has the type `uint8_t'. */ +#define HAVE_UINT8_T 1 + +/* Define to 1 if the system has the type `uintptr_t'. */ +#define HAVE_UINTPTR_T 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_ULIMIT_H 1 + +/* Define to 1 if you have the `uname' function. */ +#define HAVE_UNAME 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_UNISTD_H 1 + +/* whether unix byteswap routines -- htonl, htons, nothl, ntohs -- are + available */ +#define HAVE_UNIX_BYTESWAP 1 + +/* Define to 1 if you have the `usleep' function. */ +#define HAVE_USLEEP 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_UTIL_H */ + +/* Define to 1 if you have the header file. */ +#define HAVE_UTMP_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_VALGRIND_VALGRIND_H */ + +/* Define to 1 if you have the `vasprintf' function. */ +#define HAVE_VASPRINTF 1 + +/* Define to 1 if you have the `vsnprintf' function. 
*/ +#define HAVE_VSNPRINTF 1 + +/* Define to 1 if you have the `vsyslog' function. */ +#define HAVE_VSYSLOG 1 + +/* Define to 1 if you have the `waitpid' function. */ +#define HAVE_WAITPID 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_X11_KEYSYM_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_X11_XLIB_H 1 + +/* Define to 1 if you have the header file. */ +#define HAVE_X11_XUTIL_H 1 + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_XPMEM_H */ + +/* Define to 1 if you have the `_NSGetEnviron' function. */ +/* #undef HAVE__NSGETENVIRON */ + +/* Define to 1 if the system has the type `__float128'. */ +#define HAVE___FLOAT128 1 + +/* Define to 1 if you have the `__mmap' function. */ +/* #undef HAVE___MMAP */ + +/* Define to 1 if you have the `__munmap' function. */ +/* #undef HAVE___MUNMAP */ + +/* Define to 1 on AIX */ +/* #undef HWLOC_AIX_SYS */ + +/* Define to 1 on BlueGene/Q */ +/* #undef HWLOC_BGQ_SYS */ + +/* Whether C compiler supports symbol visibility or not */ +#define HWLOC_C_HAVE_VISIBILITY 1 + +/* Define to 1 on Darwin */ +/* #undef HWLOC_DARWIN_SYS */ + +/* Whether we are in debugging mode or not */ +/* #undef HWLOC_DEBUG */ + +/* Version of hwloc */ +/* #undef HWLOC_EXTERNAL_HWLOC_VERSION */ + +/* Define to 1 on *FREEBSD */ +/* #undef HWLOC_FREEBSD_SYS */ + +/* Whether your compiler has __attribute__ or not */ +#define HWLOC_HAVE_ATTRIBUTE 1 + +/* Whether your compiler has __attribute__ aligned or not */ +#define HWLOC_HAVE_ATTRIBUTE_ALIGNED 1 + +/* Whether your compiler has __attribute__ always_inline or not */ +#define HWLOC_HAVE_ATTRIBUTE_ALWAYS_INLINE 1 + +/* Whether your compiler has __attribute__ cold or not */ +#define HWLOC_HAVE_ATTRIBUTE_COLD 1 + +/* Whether your compiler has __attribute__ const or not */ +#define HWLOC_HAVE_ATTRIBUTE_CONST 1 + +/* Whether your compiler has __attribute__ deprecated or not */ +#define HWLOC_HAVE_ATTRIBUTE_DEPRECATED 1 + +/* Whether your compiler has 
__attribute__ format or not */ +#define HWLOC_HAVE_ATTRIBUTE_FORMAT 1 + +/* Whether your compiler has __attribute__ hot or not */ +#define HWLOC_HAVE_ATTRIBUTE_HOT 1 + +/* Whether your compiler has __attribute__ malloc or not */ +#define HWLOC_HAVE_ATTRIBUTE_MALLOC 1 + +/* Whether your compiler has __attribute__ may_alias or not */ +#define HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 1 + +/* Whether your compiler has __attribute__ nonnull or not */ +#define HWLOC_HAVE_ATTRIBUTE_NONNULL 1 + +/* Whether your compiler has __attribute__ noreturn or not */ +#define HWLOC_HAVE_ATTRIBUTE_NORETURN 1 + +/* Whether your compiler has __attribute__ no_instrument_function or not */ +#define HWLOC_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION 1 + +/* Whether your compiler has __attribute__ packed or not */ +#define HWLOC_HAVE_ATTRIBUTE_PACKED 1 + +/* Whether your compiler has __attribute__ pure or not */ +#define HWLOC_HAVE_ATTRIBUTE_PURE 1 + +/* Whether your compiler has __attribute__ sentinel or not */ +#define HWLOC_HAVE_ATTRIBUTE_SENTINEL 1 + +/* Whether your compiler has __attribute__ unused or not */ +#define HWLOC_HAVE_ATTRIBUTE_UNUSED 1 + +/* Whether your compiler has __attribute__ warn unused result or not */ +#define HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 1 + +/* Whether your compiler has __attribute__ weak alias or not */ +#define HWLOC_HAVE_ATTRIBUTE_WEAK_ALIAS 1 + +/* Define to 1 if your `ffs' function is known to be broken. */ +/* #undef HWLOC_HAVE_BROKEN_FFS */ + +/* Define to 1 if you have the `clz' function. */ +/* #undef HWLOC_HAVE_CLZ */ + +/* Define to 1 if you have the `clzl' function. */ +/* #undef HWLOC_HAVE_CLZL */ + +/* Define to 1 if the CPU_SET macro works */ +#define HWLOC_HAVE_CPU_SET 1 + +/* Define to 1 if the CPU_SET_S macro works */ +#define HWLOC_HAVE_CPU_SET_S 1 + +/* Define to 1 if you have the `cudart' SDK. 
*/ +/* #undef HWLOC_HAVE_CUDART */ + +/* Define to 1 if function `clz' is declared by system headers */ +/* #undef HWLOC_HAVE_DECL_CLZ */ + +/* Define to 1 if function `clzl' is declared by system headers */ +/* #undef HWLOC_HAVE_DECL_CLZL */ + +/* Define to 1 if function `ffs' is declared by system headers */ +#define HWLOC_HAVE_DECL_FFS 1 + +/* Define to 1 if function `ffsl' is declared by system headers */ +#define HWLOC_HAVE_DECL_FFSL 1 + +/* Define to 1 if function `fls' is declared by system headers */ +/* #undef HWLOC_HAVE_DECL_FLS */ + +/* Define to 1 if function `flsl' is declared by system headers */ +/* #undef HWLOC_HAVE_DECL_FLSL */ + +/* Define to 1 if function `strncasecmp' is declared by system headers */ +#define HWLOC_HAVE_DECL_STRNCASECMP 1 + +/* Define to 1 if you have the `ffs' function. */ +#define HWLOC_HAVE_FFS 1 + +/* Define to 1 if you have the `ffsl' function. */ +#define HWLOC_HAVE_FFSL 1 + +/* Define to 1 if you have the `fls' function. */ +/* #undef HWLOC_HAVE_FLS */ + +/* Define to 1 if you have the `flsl' function. */ +/* #undef HWLOC_HAVE_FLSL */ + +/* Define to 1 if you have the GL module components. */ +/* #undef HWLOC_HAVE_GL */ + +/* Define to 1 if you have the `libpciaccess' library. */ +/* #undef HWLOC_HAVE_LIBPCIACCESS */ + +/* Define to 1 if you have the `libxml2' library. */ +/* #undef HWLOC_HAVE_LIBXML2 */ + +/* Define to 1 if building the Linux PCI component */ +#define HWLOC_HAVE_LINUXPCI 1 + +/* Define to 1 if mbind is available. */ +/* #undef HWLOC_HAVE_MBIND */ + +/* Define to 1 if migrate_pages is available. */ +/* #undef HWLOC_HAVE_MIGRATE_PAGES */ + +/* Define to 1 if you have the `NVML' library. */ +/* #undef HWLOC_HAVE_NVML */ + +/* Define to 1 if glibc provides the old prototype (without length) of + sched_setaffinity() */ +/* #undef HWLOC_HAVE_OLD_SCHED_SETAFFINITY */ + +/* Define to 1 if you have the `OpenCL' library. 
*/ +/* #undef HWLOC_HAVE_OPENCL */ + +/* Define to 1 if `libpci' struct pci_dev has a `device_class' field. */ +/* #undef HWLOC_HAVE_PCIDEV_DEVICE_CLASS */ + +/* Define to 1 if `libpci' struct pci_dev has a `domain' field. */ +/* #undef HWLOC_HAVE_PCIDEV_DOMAIN */ + +/* Define to 1 if you have the pciutils `libpci' library. */ +/* #undef HWLOC_HAVE_PCIUTILS */ + +/* Define to 1 if `libpci' has the `pci_find_cap' function. */ +/* #undef HWLOC_HAVE_PCI_FIND_CAP */ + +/* Define to 1 if the hwloc library should support dynamically-loaded plugins + */ +/* #undef HWLOC_HAVE_PLUGINS */ + +/* `Define to 1 if you have pthread_getthrds_np' */ +/* #undef HWLOC_HAVE_PTHREAD_GETTHRDS_NP */ + +/* Define to 1 if pthread mutexes are available */ +#define HWLOC_HAVE_PTHREAD_MUTEX 1 + +/* Define to 1 if glibc provides a prototype of sched_setaffinity() */ +#define HWLOC_HAVE_SCHED_SETAFFINITY 1 + +/* Define to 1 if set_mempolicy is available. */ +/* #undef HWLOC_HAVE_SET_MEMPOLICY */ + +/* Define to 1 if you have the header file. */ +#define HWLOC_HAVE_STDINT_H 1 + +/* Define to 1 if you have the `windows.h' header. */ +/* #undef HWLOC_HAVE_WINDOWS_H */ + +/* Define to 1 if X11 headers including Xutil.h and keysym.h are available. 
*/ +#define HWLOC_HAVE_X11_KEYSYM 1 + +/* Define to 1 if you have x86 cpuid */ +#define HWLOC_HAVE_X86_CPUID 1 + +/* Define to 1 if the _syscall3 macro works */ +/* #undef HWLOC_HAVE__SYSCALL3 */ + +/* Define to 1 on HP-UX */ +/* #undef HWLOC_HPUX_SYS */ + +/* Version of hwloc */ +#define HWLOC_HWLOC191_HWLOC_VERSION "internal v1.9.1" + +/* Define to 1 on Irix */ +/* #undef HWLOC_IRIX_SYS */ + +/* Define to 1 on Linux */ +#define HWLOC_LINUX_SYS 1 + +/* Define to 1 on *NETBSD */ +/* #undef HWLOC_NETBSD_SYS */ + +/* Define to 1 on OSF */ +/* #undef HWLOC_OSF_SYS */ + +/* The size of `unsigned int', as computed by sizeof */ +#define HWLOC_SIZEOF_UNSIGNED_INT 4 + +/* The size of `unsigned long', as computed by sizeof */ +#define HWLOC_SIZEOF_UNSIGNED_LONG 8 + +/* Define to 1 on Solaris */ +/* #undef HWLOC_SOLARIS_SYS */ + +/* The hwloc symbol prefix */ +#define HWLOC_SYM_PREFIX opal_hwloc191_ + +/* The hwloc symbol prefix in all caps */ +#define HWLOC_SYM_PREFIX_CAPS OPAL_HWLOC191_ + +/* Whether we need to re-define all the hwloc public symbols or not */ +#define HWLOC_SYM_TRANSFORM 1 + +/* Define to 1 on unsupported systems */ +/* #undef HWLOC_UNSUPPORTED_SYS */ + +/* Define to 1 on WINDOWS */ +/* #undef HWLOC_WIN_SYS */ + +/* Define to 1 on x86_32 */ +/* #undef HWLOC_X86_32_ARCH */ + +/* Define to 1 on x86_64 */ +#define HWLOC_X86_64_ARCH 1 + +/* Define to the sub-directory in which libtool stores uninstalled libraries. 
+ */ +#define LT_OBJDIR ".libs/" + +/* Header to include for event implementation */ +#define MCA_event_IMPLEMENTATION_HEADER "opal/mca/event/libevent2021/libevent2021.h" + +/* Header to include for hwloc implementation */ +#define MCA_hwloc_IMPLEMENTATION_HEADER "opal/mca/hwloc/hwloc191/hwloc191.h" + +/* Location of external hwloc header */ +/* #undef MCA_hwloc_external_header */ + +/* Location of external hwloc header */ +/* #undef MCA_hwloc_external_openfabrics_header */ + +/* Complete set of command line arguments given to ROMIOs configure script */ +#define MCA_io_romio_COMPLETE_CONFIGURE_FLAGS " FROM_OMPI=yes CC='gcc -std=gnu99' CFLAGS='-g -Wall -Wundef -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wstrict-prototypes -Wcomment -pedantic -Werror-implicit-function-declaration -finline-functions -fno-strict-aliasing -pthread' CPPFLAGS=' -I/home/wwu12/ompi/ompi-cuda/opal/mca/hwloc/hwloc191/hwloc/include -I/home/wwu12/ompi/ompi-cuda/opal/mca/event/libevent2021/libevent -I/home/wwu12/ompi/ompi-cuda/opal/mca/event/libevent2021/libevent/include' FFLAGS='' LDFLAGS=' ' --enable-shared --disable-static --prefix=/home/wwu12/ompi/build-cuda --disable-aio" + +/* Set of user-defined configure flags given to ROMIOs configure script via + --with-io-romio-flags */ +#define MCA_io_romio_USER_CONFIGURE_FLAGS "" + +/* Header to include for memcpy implementation */ +#define MCA_memcpy_IMPLEMENTATION_HEADER "opal/mca/memcpy/base/memcpy_base_default.h" + +/* Header to include for parts of the memory implementation */ +#define MCA_memory_IMPLEMENTATION_HEADER "opal/mca/memory/base/empty.h" + +/* Defined to 1 if ompi:mtl should use direct calls instead of components */ +#define MCA_ompi_mtl_DIRECT_CALL 0 + +/* name of component to use for direct calls, if MCA_ompi_mtl_DIRECT_CALL is 1 + */ +#define MCA_ompi_mtl_DIRECT_CALL_COMPONENT + +/* Header ompi:mtl includes to be direct called */ +#define MCA_ompi_mtl_DIRECT_CALL_HEADER "" + +/* Defined to 1 if ompi:pml should use direct 
calls instead of components */ +#define MCA_ompi_pml_DIRECT_CALL 0 + +/* name of component to use for direct calls, if MCA_ompi_pml_DIRECT_CALL is 1 + */ +#define MCA_ompi_pml_DIRECT_CALL_COMPONENT + +/* Header ompi:pml includes to be direct called */ +#define MCA_ompi_pml_DIRECT_CALL_HEADER "" + +/* Defined to 1 if oshmem:memheap should use direct calls instead of + components */ +#define MCA_oshmem_memheap_DIRECT_CALL 0 + +/* name of component to use for direct calls, if + MCA_oshmem_memheap_DIRECT_CALL is 1 */ +#define MCA_oshmem_memheap_DIRECT_CALL_COMPONENT + +/* Header oshmem:memheap includes to be direct called */ +#define MCA_oshmem_memheap_DIRECT_CALL_HEADER "" + +/* Defined to 1 if oshmem:spml should use direct calls instead of components + */ +#define MCA_oshmem_spml_DIRECT_CALL 0 + +/* name of component to use for direct calls, if MCA_oshmem_spml_DIRECT_CALL + is 1 */ +#define MCA_oshmem_spml_DIRECT_CALL_COMPONENT + +/* Header oshmem:spml includes to be direct called */ +#define MCA_oshmem_spml_DIRECT_CALL_HEADER "" + +/* Header to include for rte implementation */ +#define MCA_rte_IMPLEMENTATION_HEADER "ompi/mca/rte/orte/rte_orte.h" + +/* Header to include for timer implementation */ +#define MCA_timer_IMPLEMENTATION_HEADER "opal/mca/timer/linux/timer_linux.h" + +/* Whether ptmalloc2 is supported on this system or not */ +#define MEMORY_LINUX_PTMALLOC2 1 + +/* Whether ummunotify is supported on this system or not */ +#define MEMORY_LINUX_UMMUNOTIFY 0 + +/* Whether we can use M-PAGE supported since MOFED 1.8 */ +#define MPAGE_ENABLE 0 + +/* create_flags field is part of ibv_exp_reg_mr_in */ +#define MPAGE_HAVE_IBV_EXP_REG_MR_CREATE_FLAGS 0 + +/* exp_access field is part of ibv_exp_reg_shared_mr_in */ +#define MPAGE_HAVE_SMR_EXP_ACCESS 0 + +/* Maximum value for an MPI_Count */ +#define MPI_COUNT_MAX 0x7fffffffffffffffll + +/* Whether we want to check MPI parameters always, never, or decide at + run-time */ +#define MPI_PARAM_CHECK ompi_mpi_param_check + 
+/* Alignment of Fortran CHARACTER */ +#define OMPI_ALIGNMENT_FORTRAN_CHARACTER 1 + +/* Alignment of Fortran COMPLEX */ +#define OMPI_ALIGNMENT_FORTRAN_COMPLEX 4 + +/* Alignment of Fortran COMPLEX*16 */ +#define OMPI_ALIGNMENT_FORTRAN_COMPLEX16 8 + +/* Alignment of Fortran COMPLEX*32 */ +#define OMPI_ALIGNMENT_FORTRAN_COMPLEX32 4 + +/* Alignment of Fortran COMPLEX*4 */ +#define OMPI_ALIGNMENT_FORTRAN_COMPLEX4 4 + +/* Alignment of Fortran COMPLEX*8 */ +#define OMPI_ALIGNMENT_FORTRAN_COMPLEX8 4 + +/* Alignment of Fortran DOUBLE COMPLEX */ +#define OMPI_ALIGNMENT_FORTRAN_DOUBLE_COMPLEX 8 + +/* Alignment of Fortran DOUBLE PRECISION */ +#define OMPI_ALIGNMENT_FORTRAN_DOUBLE_PRECISION 8 + +/* Alignment of Fortran INTEGER */ +#define OMPI_ALIGNMENT_FORTRAN_INTEGER 4 + +/* Alignment of Fortran INTEGER*1 */ +#define OMPI_ALIGNMENT_FORTRAN_INTEGER1 1 + +/* Alignment of Fortran INTEGER*16 */ +#define OMPI_ALIGNMENT_FORTRAN_INTEGER16 4 + +/* Alignment of Fortran INTEGER*2 */ +#define OMPI_ALIGNMENT_FORTRAN_INTEGER2 2 + +/* Alignment of Fortran INTEGER*4 */ +#define OMPI_ALIGNMENT_FORTRAN_INTEGER4 4 + +/* Alignment of Fortran INTEGER*8 */ +#define OMPI_ALIGNMENT_FORTRAN_INTEGER8 8 + +/* Alignment of Fortran LOGICAL */ +#define OMPI_ALIGNMENT_FORTRAN_LOGICAL 4 + +/* Alignment of Fortran LOGICAL*1 */ +#define OMPI_ALIGNMENT_FORTRAN_LOGICAL1 1 + +/* Alignment of Fortran LOGICAL*2 */ +#define OMPI_ALIGNMENT_FORTRAN_LOGICAL2 2 + +/* Alignment of Fortran LOGICAL*4 */ +#define OMPI_ALIGNMENT_FORTRAN_LOGICAL4 4 + +/* Alignment of Fortran LOGICAL*8 */ +#define OMPI_ALIGNMENT_FORTRAN_LOGICAL8 8 + +/* Alignment of Fortran REAL */ +#define OMPI_ALIGNMENT_FORTRAN_REAL 4 + +/* Alignment of Fortran REAL*16 */ +#define OMPI_ALIGNMENT_FORTRAN_REAL16 4 + +/* Alignment of Fortran REAL*2 */ +#define OMPI_ALIGNMENT_FORTRAN_REAL2 4 + +/* Alignment of Fortran REAL*4 */ +#define OMPI_ALIGNMENT_FORTRAN_REAL4 4 + +/* Alignment of Fortran REAL*8 */ +#define OMPI_ALIGNMENT_FORTRAN_REAL8 8 + +/* Whether we 
want MPI C++ support or not */ +#define OMPI_BUILD_CXX_BINDINGS 0 + +/* Whether we built the 'use mpi_f08' prototype subarray-based implementation + or not (i.e., whether to build the use-mpi-f08-desc prototype or the + regular use-mpi-f08 implementation) */ +#define OMPI_BUILD_FORTRAN_F08_SUBARRAYS 0 + +/* Whether we will build the MPI Fortran mpif.h bindings or not */ +#define OMPI_BUILD_FORTRAN_MPIFH_BINDINGS 1 + +/* For ompi_info: Whether we will build the MPI Fortran "use mpi_f08" bindings + or not */ +#define OMPI_BUILD_FORTRAN_USEMPIF08_BINDINGS 0 + +/* Whether we will build the MPI Fortran "use mpi" bindings or not */ +#define OMPI_BUILD_FORTRAN_USEMPI_BINDINGS 1 + +/* OMPI underlying C++ compiler */ +#define OMPI_CXX "g++" + +/* Whether C++ compiler supports __builtin_expect */ +#define OMPI_CXX_HAVE_BUILTIN_EXPECT 0 + +/* Whether C++ compiler supports __builtin_prefetch */ +#define OMPI_CXX_HAVE_BUILTIN_PREFETCH 0 + +/* Whether a const_cast on a 2-d array will work with the C++ compiler */ +#define OMPI_CXX_SUPPORTS_2D_CONST_CAST 0 + +/* Enable contributed software package libompitrace */ +#define OMPI_ENABLE_CONTRIB_libompitrace 1 + +/* Enable contributed software package vt */ +#define OMPI_ENABLE_CONTRIB_vt 1 + +/* Whether we want MPI profiling or not */ +#define OMPI_ENABLE_MPI_PROFILING 1 + +/* Enable MPI_THREAD_MULTIPLE */ +#define OMPI_ENABLE_THREAD_MULTIPLE 0 + +/* Underlying Fortran compiler */ +#define OMPI_FC "gfortran" + +/* Absolutey path to the underlying Fortran compiler found by configure */ +#define OMPI_FC_ABSOLUTE "/usr/bin/gfortran" + +/* Whether the mpif.h interface supports the MPI_SIZEOF interface or not */ +#define OMPI_FORTRAN_BUILD_SIZEOF 0 + +/* Whether fortran symbols are all caps or not */ +#define OMPI_FORTRAN_CAPS 0 + +/* Whether fortran symbols have a trailing double underscore or not */ +#define OMPI_FORTRAN_DOUBLE_UNDERSCORE 0 + +/* How many bytes the mpi_f08 TYPE(MPI_) handles will be */ +#define 
OMPI_FORTRAN_F08_HANDLE_SIZE 4 + +/* Max handle value for fortran MPI handles, effectively min(INT_MAX, max + fortran INTEGER value) */ +#define OMPI_FORTRAN_HANDLE_MAX 2147483647 + +/* For mpi-f08-interfaces-callbacks.f90 and ompi_info: whether the compiler + supports the "abstract" keyword or not */ +#define OMPI_FORTRAN_HAVE_ABSTRACT 0 + +/* For ompi/mpi/fortran/use-mpi-f08/blah.F90 and blah.h and ompi_info: whether + the compiler supports the "asynchronous" keyword or not */ +#define OMPI_FORTRAN_HAVE_ASYNCHRONOUS 0 + +/* For ompi_info: Whether the compiler supports all forms of BIND(C) that we + need */ +#define OMPI_FORTRAN_HAVE_BIND_C 0 + +/* For ompi_info: Whether the compiler supports SUBROUTINE ... BIND(C) or not + */ +#define OMPI_FORTRAN_HAVE_BIND_C_SUB 0 + +/* For ompi_info: Whether the compiler supports TYPE, BIND(C) or not */ +#define OMPI_FORTRAN_HAVE_BIND_C_TYPE 0 + +/* For ompi_info: Whether the compiler supports TYPE, BIND(C, NAME="name") or + not */ +#define OMPI_FORTRAN_HAVE_BIND_C_TYPE_NAME 0 + +/* For ompi_info: Whether the Fortran compiler supports the Fortran 2008 + "assumed rank" syntax or not */ +#define OMPI_FORTRAN_HAVE_F08_ASSUMED_RANK 0 + +/* Whether the Fortran compiler supports ignore TKR functionality or not */ +#define OMPI_FORTRAN_HAVE_IGNORE_TKR 0 + +/* Whether the compiler supports INTERFACE or not */ +#define OMPI_FORTRAN_HAVE_INTERFACE 1 + +/* For ompi_info: Whether the compiler supports ISO_C_BINDING or not */ +#define OMPI_FORTRAN_HAVE_ISO_C_BINDING 1 + +/* Whether the compiler supports ISO_FORTRAN_ENV or not */ +#define OMPI_FORTRAN_HAVE_ISO_FORTRAN_ENV 0 + +/* For ompi_info: whether the Fortran compiler supports optional arguments or + not */ +#define OMPI_FORTRAN_HAVE_OPTIONAL_ARGS 0 + +/* For mpi-f08-types.f90 and ompi_info: whether the compiler supports the + "private" keyword or not (used in MPI_Status) */ +#define OMPI_FORTRAN_HAVE_PRIVATE 0 + +/* For ompi/mpi/fortran/use-mpi-f08/blah.F90 and blah.h and ompi_info: 
whether + the compiler supports the "procedure" keyword or not */ +#define OMPI_FORTRAN_HAVE_PROCEDURE 0 + +/* For mpi-f08-types.f90 and .F90 and ompi_info: whether the compiler supports + the "protected" keyword or not */ +#define OMPI_FORTRAN_HAVE_PROTECTED 0 + +/* Whether the compiler supports STORAGE_SIZE on relevant types */ +#define OMPI_FORTRAN_HAVE_STORAGE_SIZE 0 + +/* Pre declaration for FORTRAN ignore parameter TKR behavior */ +#define OMPI_FORTRAN_IGNORE_TKR_PREDECL "" + +/* Type declaration for FORTRAN ignore parameter TKR behavior */ +#define OMPI_FORTRAN_IGNORE_TKR_TYPE + +/* Max dimension rank of Fortran arrays */ +#define OMPI_FORTRAN_MAX_ARRAY_RANK 7 + +/* Whether the mpi_f08 implementation is using wrapper routines ("bad" Fortran + compiler) or weak symbols ("good" Fortran compiler) for the F08 interface + definition implementations */ +#define OMPI_FORTRAN_NEED_WRAPPER_ROUTINES 0 + +/* Whether fortran symbols have no trailing underscore or not */ +#define OMPI_FORTRAN_PLAIN 0 + +/* Whether fortran symbols have a trailing underscore or not */ +#define OMPI_FORTRAN_SINGLE_UNDERSCORE 1 + +/* Value to load to the MPI_SUBARRAYS_SUPPORTED compile-time constant */ +#define OMPI_FORTRAN_SUBARRAYS_SUPPORTED .FALSE. + +/* Fortran value for LOGICAL .TRUE. 
value */ +#define OMPI_FORTRAN_VALUE_TRUE 1 + +/* Greek - alpha, beta, etc - release number of Open MPI */ +#define OMPI_GREEK_VERSION "a1" + +/* Wether we want sparse process groups */ +#define OMPI_GROUP_SPARSE 0 + +/* Whether or not we have compiled with C++ exceptions support */ +#define OMPI_HAVE_CXX_EXCEPTION_SUPPORT 0 + +/* Whether we have Fortran CHARACTER or not */ +#define OMPI_HAVE_FORTRAN_CHARACTER 1 + +/* Whether we have Fortran COMPLEX or not */ +#define OMPI_HAVE_FORTRAN_COMPLEX 1 + +/* Whether we have Fortran COMPLEX*16 or not */ +#define OMPI_HAVE_FORTRAN_COMPLEX16 1 + +/* Whether we have Fortran COMPLEX*32 or not */ +#define OMPI_HAVE_FORTRAN_COMPLEX32 0 + +/* Whether we have Fortran COMPLEX*4 or not */ +#define OMPI_HAVE_FORTRAN_COMPLEX4 0 + +/* Whether we have Fortran COMPLEX*8 or not */ +#define OMPI_HAVE_FORTRAN_COMPLEX8 1 + +/* Whether we have Fortran DOUBLE COMPLEX or not */ +#define OMPI_HAVE_FORTRAN_DOUBLE_COMPLEX 1 + +/* Whether we have Fortran DOUBLE PRECISION or not */ +#define OMPI_HAVE_FORTRAN_DOUBLE_PRECISION 1 + +/* Whether we have Fortran INTEGER or not */ +#define OMPI_HAVE_FORTRAN_INTEGER 1 + +/* Whether we have Fortran INTEGER*1 or not */ +#define OMPI_HAVE_FORTRAN_INTEGER1 1 + +/* Whether we have Fortran INTEGER*16 or not */ +#define OMPI_HAVE_FORTRAN_INTEGER16 0 + +/* Whether we have Fortran INTEGER*2 or not */ +#define OMPI_HAVE_FORTRAN_INTEGER2 1 + +/* Whether we have Fortran INTEGER*4 or not */ +#define OMPI_HAVE_FORTRAN_INTEGER4 1 + +/* Whether we have Fortran INTEGER*8 or not */ +#define OMPI_HAVE_FORTRAN_INTEGER8 1 + +/* Whether we have Fortran LOGICAL or not */ +#define OMPI_HAVE_FORTRAN_LOGICAL 1 + +/* Whether we have Fortran LOGICAL*1 or not */ +#define OMPI_HAVE_FORTRAN_LOGICAL1 1 + +/* Whether we have Fortran LOGICAL*2 or not */ +#define OMPI_HAVE_FORTRAN_LOGICAL2 1 + +/* Whether we have Fortran LOGICAL*4 or not */ +#define OMPI_HAVE_FORTRAN_LOGICAL4 1 + +/* Whether we have Fortran LOGICAL*8 or not */ +#define 
OMPI_HAVE_FORTRAN_LOGICAL8 1 + +/* Whether we have Fortran REAL or not */ +#define OMPI_HAVE_FORTRAN_REAL 1 + +/* Whether we have Fortran REAL*16 or not */ +#define OMPI_HAVE_FORTRAN_REAL16 0 + +/* Whether we have Fortran REAL*2 or not */ +#define OMPI_HAVE_FORTRAN_REAL2 0 + +/* Whether we have Fortran REAL*4 or not */ +#define OMPI_HAVE_FORTRAN_REAL4 1 + +/* Whether we have Fortran REAL*8 or not */ +#define OMPI_HAVE_FORTRAN_REAL8 1 + +/* Fortrn KIND number for CHARACTER */ +#define OMPI_KIND_FORTRAN_CHARACTER C_SIGNED_CHAR + +/* Fortrn KIND number for COMPLEX */ +#define OMPI_KIND_FORTRAN_COMPLEX C_FLOAT_COMPLEX + +/* Fortrn KIND number for COMPLEX*16 */ +#define OMPI_KIND_FORTRAN_COMPLEX16 C_DOUBLE_COMPLEX + +/* Fortrn KIND number for COMPLEX*32 */ +#define OMPI_KIND_FORTRAN_COMPLEX32 0 + +/* Fortrn KIND number for COMPLEX*4 */ +#define OMPI_KIND_FORTRAN_COMPLEX4 0 + +/* Fortrn KIND number for COMPLEX*8 */ +#define OMPI_KIND_FORTRAN_COMPLEX8 C_FLOAT_COMPLEX + +/* Fortrn KIND number for DOUBLE COMPLEX */ +#define OMPI_KIND_FORTRAN_DOUBLE_COMPLEX C_DOUBLE_COMPLEX + +/* Fortrn KIND number for DOUBLE PRECISION */ +#define OMPI_KIND_FORTRAN_DOUBLE_PRECISION C_DOUBLE + +/* Fortrn KIND number for INTEGER */ +#define OMPI_KIND_FORTRAN_INTEGER C_INT + +/* Fortrn KIND number for INTEGER*1 */ +#define OMPI_KIND_FORTRAN_INTEGER1 C_SIGNED_CHAR + +/* Fortrn KIND number for INTEGER*16 */ +#define OMPI_KIND_FORTRAN_INTEGER16 0 + +/* Fortrn KIND number for INTEGER*2 */ +#define OMPI_KIND_FORTRAN_INTEGER2 C_SHORT + +/* Fortrn KIND number for INTEGER*4 */ +#define OMPI_KIND_FORTRAN_INTEGER4 C_INT + +/* Fortrn KIND number for INTEGER*8 */ +#define OMPI_KIND_FORTRAN_INTEGER8 C_LONG_LONG + +/* Fortrn KIND number for LOGICAL */ +#define OMPI_KIND_FORTRAN_LOGICAL C_INT + +/* Fortrn KIND number for LOGICAL*1 */ +#define OMPI_KIND_FORTRAN_LOGICAL1 C_SIGNED_CHAR + +/* Fortrn KIND number for LOGICAL*2 */ +#define OMPI_KIND_FORTRAN_LOGICAL2 C_SHORT + +/* Fortrn KIND number for LOGICAL*4 */ 
+#define OMPI_KIND_FORTRAN_LOGICAL4 C_INT + +/* Fortrn KIND number for LOGICAL*8 */ +#define OMPI_KIND_FORTRAN_LOGICAL8 C_LONG_LONG + +/* Fortrn KIND number for REAL */ +#define OMPI_KIND_FORTRAN_REAL C_FLOAT + +/* Fortrn KIND number for REAL*16 */ +#define OMPI_KIND_FORTRAN_REAL16 0 + +/* Fortrn KIND number for REAL*2 */ +#define OMPI_KIND_FORTRAN_REAL2 0 + +/* Fortrn KIND number for REAL*4 */ +#define OMPI_KIND_FORTRAN_REAL4 C_FLOAT + +/* Fortrn KIND number for REAL*8 */ +#define OMPI_KIND_FORTRAN_REAL8 C_DOUBLE + +/* Major release number of Open MPI */ +#define OMPI_MAJOR_VERSION 1 + +/* Minor release number of Open MPI */ +#define OMPI_MINOR_VERSION 9 + +/* MPI Extensions included in libmpi */ +#define OMPI_MPIEXT_COMPONENTS "" + +/* Type of MPI_Aint */ +#define OMPI_MPI_AINT_TYPE ptrdiff_t + +/* Contributed software packages built with Open MPI */ +#define OMPI_MPI_CONTRIBS "vt, libompitrace" + +/* Size of the MPI_Count datatype */ +#define OMPI_MPI_COUNT_SIZE 8 + +/* Type of the MPI_Count datatype */ +#define OMPI_MPI_COUNT_TYPE long long + +/* Size of the MPI_Offset */ +#define OMPI_MPI_OFFSET_SIZE 8 + +/* Type of MPI_Offset */ +#define OMPI_MPI_OFFSET_TYPE long long + +/* Enable flow control for Portals4 MTL */ +#define OMPI_MTL_PORTALS4_FLOW_CONTROL 1 + +/* MPI datatype corresponding to MPI_Offset */ +#define OMPI_OFFSET_DATATYPE MPI_LONG_LONG + +/* Whether we want to check MPI parameters never or possible (an integer + constant) */ +#define OMPI_PARAM_CHECK 1 + +/* Index into endpoint array for BML */ +#define OMPI_PROC_ENDPOINT_TAG_BML 0 + +/* Maximum number of endpoint entries to be attached to an ompi_proc_t */ +#define OMPI_PROC_ENDPOINT_TAG_MAX 1 + +/* Index into endpoint array for MTL */ +/* #undef OMPI_PROC_ENDPOINT_TAG_MTL */ + +/* Index into endpoint array for PML */ +/* #undef OMPI_PROC_ENDPOINT_TAG_PML */ + +/* Index into endpoint array for PORTALS4 */ +/* #undef OMPI_PROC_ENDPOINT_TAG_PORTALS4 */ + +/* Whether OMPI should provide MPI File 
interface */ +#define OMPI_PROVIDE_MPI_FILE_INTERFACE 1 + +/* Whether Fortran REAL*16 matches the bit format of the equivalent C type */ +#define OMPI_REAL16_MATCHES_C 0 + +/* Release date of Open MPI */ +#define OMPI_RELEASE_DATE "Unreleased developer copy" + +/* Release release number of Open MPI */ +#define OMPI_RELEASE_VERSION 0 + +/* The repository version Open MPI */ +#define OMPI_REPO_REV "dev-267-g51b4521" + +/* Defined to 1 if the OMPI runtime component is ORTE */ +#define OMPI_RTE_ORTE 1 + +/* Size of Fortran CHARACTER */ +#define OMPI_SIZEOF_FORTRAN_CHARACTER 1 + +/* Size of Fortran COMPLEX */ +#define OMPI_SIZEOF_FORTRAN_COMPLEX 8 + +/* Size of Fortran COMPLEX*16 */ +#define OMPI_SIZEOF_FORTRAN_COMPLEX16 16 + +/* Size of Fortran COMPLEX*32 */ +#define OMPI_SIZEOF_FORTRAN_COMPLEX32 4 + +/* Size of Fortran COMPLEX*4 */ +#define OMPI_SIZEOF_FORTRAN_COMPLEX4 4 + +/* Size of Fortran COMPLEX*8 */ +#define OMPI_SIZEOF_FORTRAN_COMPLEX8 8 + +/* Size of Fortran DOUBLE COMPLEX */ +#define OMPI_SIZEOF_FORTRAN_DOUBLE_COMPLEX 16 + +/* Size of Fortran DOUBLE PRECISION */ +#define OMPI_SIZEOF_FORTRAN_DOUBLE_PRECISION 8 + +/* Size of Fortran INTEGER */ +#define OMPI_SIZEOF_FORTRAN_INTEGER 4 + +/* Size of Fortran INTEGER*1 */ +#define OMPI_SIZEOF_FORTRAN_INTEGER1 1 + +/* Size of Fortran INTEGER*16 */ +#define OMPI_SIZEOF_FORTRAN_INTEGER16 16 + +/* Size of Fortran INTEGER*2 */ +#define OMPI_SIZEOF_FORTRAN_INTEGER2 2 + +/* Size of Fortran INTEGER*4 */ +#define OMPI_SIZEOF_FORTRAN_INTEGER4 4 + +/* Size of Fortran INTEGER*8 */ +#define OMPI_SIZEOF_FORTRAN_INTEGER8 8 + +/* Size of Fortran LOGICAL */ +#define OMPI_SIZEOF_FORTRAN_LOGICAL 4 + +/* Size of Fortran LOGICAL*1 */ +#define OMPI_SIZEOF_FORTRAN_LOGICAL1 1 + +/* Size of Fortran LOGICAL*2 */ +#define OMPI_SIZEOF_FORTRAN_LOGICAL2 2 + +/* Size of Fortran LOGICAL*4 */ +#define OMPI_SIZEOF_FORTRAN_LOGICAL4 4 + +/* Size of Fortran LOGICAL*8 */ +#define OMPI_SIZEOF_FORTRAN_LOGICAL8 8 + +/* Size of Fortran REAL */ +#define 
OMPI_SIZEOF_FORTRAN_REAL 4 + +/* Size of Fortran REAL*16 */ +#define OMPI_SIZEOF_FORTRAN_REAL16 4 + +/* Size of Fortran REAL*2 */ +#define OMPI_SIZEOF_FORTRAN_REAL2 4 + +/* Size of Fortran REAL*4 */ +#define OMPI_SIZEOF_FORTRAN_REAL4 4 + +/* Size of Fortran REAL*8 */ +#define OMPI_SIZEOF_FORTRAN_REAL8 8 + +/* Tarball filename version string of Open MPI */ +#define OMPI_TARBALL_VERSION "gitclone" + +/* Complete release number of Open MPI */ +#define OMPI_VERSION "0" + +/* do we want java mpi bindings */ +#define OMPI_WANT_JAVA_BINDINGS 0 + +/* do we want to try to work around C++ bindings SEEK_* issue? */ +#define OMPI_WANT_MPI_CXX_SEEK 1 + +/* Enable warnings when using deprecated MPI functions */ +#define OMPI_WANT_MPI_INTERFACE_WARNING 1 + +/* if the peruse interface should be enabled */ +#define OMPI_WANT_PERUSE 0 + +/* Alignment of type _Bool */ +#define OPAL_ALIGNMENT_BOOL 1 + +/* Alignment of type char */ +#define OPAL_ALIGNMENT_CHAR 1 + +/* Alignment of type bool */ +#define OPAL_ALIGNMENT_CXX_BOOL 1 + +/* Alignment of type double */ +#define OPAL_ALIGNMENT_DOUBLE 8 + +/* Alignment of type double _Complex */ +#define OPAL_ALIGNMENT_DOUBLE_COMPLEX 8 + +/* Alignment of type float */ +#define OPAL_ALIGNMENT_FLOAT 4 + +/* Alignment of type float _Complex */ +#define OPAL_ALIGNMENT_FLOAT_COMPLEX 4 + +/* Alignment of type int */ +#define OPAL_ALIGNMENT_INT 4 + +/* Alignment of type int128_t */ +/* #undef OPAL_ALIGNMENT_INT128 */ + +/* Alignment of type int16_t */ +#define OPAL_ALIGNMENT_INT16 2 + +/* Alignment of type int32_t */ +#define OPAL_ALIGNMENT_INT32 4 + +/* Alignment of type int64_t */ +#define OPAL_ALIGNMENT_INT64 8 + +/* Alignment of type int8_t */ +#define OPAL_ALIGNMENT_INT8 1 + +/* Alignment of type long */ +#define OPAL_ALIGNMENT_LONG 8 + +/* Alignment of type long double */ +#define OPAL_ALIGNMENT_LONG_DOUBLE 16 + +/* Alignment of type long double _Complex */ +#define OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX 16 + +/* Alignment of type long long */ 
+#define OPAL_ALIGNMENT_LONG_LONG 8 + +/* Alignment of type short */ +#define OPAL_ALIGNMENT_SHORT 2 + +/* Alignment of type size_t */ +#define OPAL_ALIGNMENT_SIZE_T 8 + +/* Alignment of type void * */ +#define OPAL_ALIGNMENT_VOID_P 8 + +/* Alignment of type wchar_t */ +#define OPAL_ALIGNMENT_WCHAR 4 + +/* Alignment of type __float128 */ +#define OPAL_ALIGNMENT___FLOAT128 16 + +/* set to 1 if word-size integers must be aligned to word-size padding to + prevent bus errors */ +#define OPAL_ALIGN_WORD_SIZE_INTEGERS 0 + +/* OMPI architecture string */ +#define OPAL_ARCH "x86_64-unknown-linux-gnu" + +/* Assembly align directive expects logarithmic value */ +#define OPAL_ASM_ALIGN_LOG + +/* What ARM assembly version to use */ +/* #undef OPAL_ASM_ARM_VERSION */ + +/* Assembly directive for exporting symbols */ +#define OPAL_ASM_GLOBAL ".globl" + +/* Assembly prefix for gsym labels */ +#define OPAL_ASM_GSYM "" + +/* Assembly suffix for labels */ +#define OPAL_ASM_LABEL_SUFFIX ":" + +/* Assembly prefix for lsym labels */ +#define OPAL_ASM_LSYM ".L" + +/* Do we need to give a .size directive */ +#define OPAL_ASM_SIZE "1" + +/* Whether we can do 64bit assembly operations or not. 
Should not be used + outside of the assembly header files */ +#define OPAL_ASM_SUPPORT_64BIT 1 + +/* Assembly directive for setting text section */ +#define OPAL_ASM_TEXT ".text" + +/* How to set function type in .type directive */ +#define OPAL_ASM_TYPE "@" + +/* Architecture type of assembly to use for atomic operations and CMA */ +#define OPAL_ASSEMBLY_ARCH OPAL_AMD64 + +/* Whether to use builtin atomics */ +#define OPAL_ASSEMBLY_BUILTIN OPAL_BUILTIN_NO + +/* Format of assembly file */ +#define OPAL_ASSEMBLY_FORMAT "default-.text-.globl-:--.L-@-1-0-1-1-1" + +/* Enable flow control for Portals4 BTL */ +#define OPAL_BTL_PORTALS4_FLOW_CONTROL 0 + +/* If CMA support can be enabled */ +#define OPAL_BTL_SM_HAVE_CMA 0 + +/* If knem support can be enabled */ +#define OPAL_BTL_SM_HAVE_KNEM 0 + +/* define to 1 if usnic BTL unit tests are enabled, 0 otherwise */ +#define OPAL_BTL_USNIC_UNIT_TESTS 0 + +/* If CMA support can be enabled within vader */ +#define OPAL_BTL_VADER_HAVE_CMA 0 + +/* If KNEM support can be enabled within vader */ +#define OPAL_BTL_VADER_HAVE_KNEM 0 + +/* If XPMEM support can be enabled within vader */ +#define OPAL_BTL_VADER_HAVE_XPMEM 0 + +/* The compiler $lower which OMPI was built with */ +#define OPAL_BUILD_PLATFORM_COMPILER_FAMILYID 1 + +/* The compiler $lower which OMPI was built with */ +#define OPAL_BUILD_PLATFORM_COMPILER_FAMILYNAME GNU + +/* The compiler $lower which OMPI was built with */ +#define OPAL_BUILD_PLATFORM_COMPILER_VERSION 263175 + +/* The compiler $lower which OMPI was built with */ +#define OPAL_BUILD_PLATFORM_COMPILER_VERSION_STR 4.4.7 + +/* OMPI underlying C compiler */ +#define OPAL_CC "gcc" + +/* Use static const char[] strings for C files */ +#define OPAL_CC_USE_CONST_CHAR_IDENT 0 + +/* Use #ident strings for C files */ +#define OPAL_CC_USE_IDENT 1 + +/* Use #pragma comment for C files */ +#define OPAL_CC_USE_PRAGMA_COMMENT + +/* Use #pragma ident strings for C files */ +#define OPAL_CC_USE_PRAGMA_IDENT 0 + +/* Need CMA 
syscalls defined */ +/* #undef OPAL_CMA_NEED_SYSCALL_DEFS */ + +/* Whether we have CUDA GDR support available */ +#define OPAL_CUDA_GDR_SUPPORT 1 + +/* Whether we have CUDA cuPointerGetAttributes function available */ +#define OPAL_CUDA_GET_ATTRIBUTES 0 + +/* Whether we want cuda device pointer support */ +#define OPAL_CUDA_SUPPORT 1 + +/* Whether we have CUDA 4.1 support available */ +#define OPAL_CUDA_SUPPORT_41 1 + +/* Whether we have CUDA CU_POINTER_ATTRIBUTE_SYNC_MEMOPS support available */ +#define OPAL_CUDA_SYNC_MEMOPS 1 + +/* OPAL underlying C++ compiler */ +#define OPAL_CXX "g++" + +/* Use static const char[] strings for C++ files */ +/* #undef OPAL_CXX_USE_CONST_CHAR_IDENT */ + +/* Use #ident strings for C++ files */ +/* #undef OPAL_CXX_USE_IDENT */ + +/* Use #pragma comment for C++ files */ +/* #undef OPAL_CXX_USE_PRAGMA_COMMENT */ + +/* Use #pragma ident strings for C++ files */ +/* #undef OPAL_CXX_USE_PRAGMA_IDENT */ + +/* Whether C compiler supports DEC style inline assembly */ +#define OPAL_C_DEC_INLINE_ASSEMBLY 0 + +/* Whether C compiler supports GCC style inline assembly */ +#define OPAL_C_GCC_INLINE_ASSEMBLY 1 + +/* Whether C compiler supports __builtin_clz */ +#define OPAL_C_HAVE_BUILTIN_CLZ 1 + +/* Whether C compiler supports __builtin_expect */ +#define OPAL_C_HAVE_BUILTIN_EXPECT 1 + +/* Whether C compiler supports __builtin_prefetch */ +#define OPAL_C_HAVE_BUILTIN_PREFETCH 1 + +/* Whether C compiler supports symbol visibility or not */ +#define OPAL_C_HAVE_VISIBILITY 1 + +/* Whether C compiler supports XLC style inline assembly */ +#define OPAL_C_XLC_INLINE_ASSEMBLY 0 + +/* Whether we want checkpoint/restart enabled debugging functionality or not + */ +#define OPAL_ENABLE_CRDEBUG 0 + +/* Whether we want developer-level debugging code or not */ +#define OPAL_ENABLE_DEBUG 1 + +/* Enable features required for dynamic SL support */ +#define OPAL_ENABLE_DYNAMIC_SL 0 + +/* Enable fault tolerance general components and logic */ +#define 
OPAL_ENABLE_FT 0 + +/* Enable fault tolerance checkpoint/restart components and logic */ +#define OPAL_ENABLE_FT_CR 0 + +/* Enable fault tolerance thread in Open PAL */ +#define OPAL_ENABLE_FT_THREAD 0 + +/* Disable getpwuid support (default: enabled) */ +#define OPAL_ENABLE_GETPWUID 1 + +/* Enable features required for heterogeneous support */ +#define OPAL_ENABLE_HETEROGENEOUS_SUPPORT 0 + +/* Enable IPv6 support, but only if the underlying system supports it */ +#define OPAL_ENABLE_IPV6 0 + +/* Whether we want the memory profiling or not */ +#define OPAL_ENABLE_MEM_DEBUG 1 + +/* Whether we want the memory profiling or not */ +#define OPAL_ENABLE_MEM_PROFILE 1 + +/* Whether we should enable thread support within the OPAL code base */ +#define OPAL_ENABLE_MULTI_THREADS 1 + +/* Whether we want BTL progress threads enabled */ +#define OPAL_ENABLE_PROGRESS_THREADS 0 + +/* Whether user wants PTY support or not */ +#define OPAL_ENABLE_PTY_SUPPORT 1 + +/* Whether we want developer-level timing framework or not */ +#define OPAL_ENABLE_TIMING 0 + +/* Greek - alpha, beta, etc - release number of Open Portable Access Layer */ +#define OPAL_GREEK_VERSION "a1" + +/* Whether there is an atomic assembly file available */ +#define OPAL_HAVE_ASM_FILE 1 + +/* Whether your compiler has __attribute__ or not */ +#define OPAL_HAVE_ATTRIBUTE 1 + +/* Whether your compiler has __attribute__ aligned or not */ +#define OPAL_HAVE_ATTRIBUTE_ALIGNED 1 + +/* Whether your compiler has __attribute__ always_inline or not */ +#define OPAL_HAVE_ATTRIBUTE_ALWAYS_INLINE 1 + +/* Whether your compiler has __attribute__ cold or not */ +#define OPAL_HAVE_ATTRIBUTE_COLD 1 + +/* Whether your compiler has __attribute__ const or not */ +#define OPAL_HAVE_ATTRIBUTE_CONST 1 + +/* Whether your compiler has __attribute__ deprecated or not */ +#define OPAL_HAVE_ATTRIBUTE_DEPRECATED 1 + +/* Whether your compiler has __attribute__ deprecated with optional argument + */ +#define 
OPAL_HAVE_ATTRIBUTE_DEPRECATED_ARGUMENT 0 + +/* Whether your compiler has __attribute__ destructor or not */ +#define OPAL_HAVE_ATTRIBUTE_DESTRUCTOR 1 + +/* Whether your compiler has __attribute__ format or not */ +#define OPAL_HAVE_ATTRIBUTE_FORMAT 1 + +/* Whether your compiler has __attribute__ format and it works on function + pointers */ +#define OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR 1 + +/* Whether your compiler has __attribute__ hot or not */ +#define OPAL_HAVE_ATTRIBUTE_HOT 1 + +/* Whether your compiler has __attribute__ malloc or not */ +#define OPAL_HAVE_ATTRIBUTE_MALLOC 1 + +/* Whether your compiler has __attribute__ may_alias or not */ +#define OPAL_HAVE_ATTRIBUTE_MAY_ALIAS 1 + +/* Whether your compiler has __attribute__ noinline or not */ +#define OPAL_HAVE_ATTRIBUTE_NOINLINE 1 + +/* Whether your compiler has __attribute__ nonnull or not */ +#define OPAL_HAVE_ATTRIBUTE_NONNULL 1 + +/* Whether your compiler has __attribute__ noreturn or not */ +#define OPAL_HAVE_ATTRIBUTE_NORETURN 1 + +/* Whether your compiler has __attribute__ noreturn and it works on function + pointers */ +#define OPAL_HAVE_ATTRIBUTE_NORETURN_FUNCPTR 1 + +/* Whether your compiler has __attribute__ no_instrument_function or not */ +#define OPAL_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION 1 + +/* Whether your compiler has __attribute__ packed or not */ +#define OPAL_HAVE_ATTRIBUTE_PACKED 1 + +/* Whether your compiler has __attribute__ pure or not */ +#define OPAL_HAVE_ATTRIBUTE_PURE 1 + +/* Whether your compiler has __attribute__ sentinel or not */ +#define OPAL_HAVE_ATTRIBUTE_SENTINEL 1 + +/* Whether your compiler has __attribute__ unused or not */ +#define OPAL_HAVE_ATTRIBUTE_UNUSED 1 + +/* Whether your compiler has __attribute__ visibility or not */ +#define OPAL_HAVE_ATTRIBUTE_VISIBILITY 1 + +/* Whether your compiler has __attribute__ warn unused result or not */ +#define OPAL_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 1 + +/* Whether your compiler has __attribute__ weak alias or not */ +#define 
OPAL_HAVE_ATTRIBUTE_WEAK_ALIAS 1 + +/* whether backtrace_execinfo is found and available */ +#define OPAL_HAVE_BACKTRACE_EXECINFO 1 + +/* whether qsort is broken or not */ +#define OPAL_HAVE_BROKEN_QSORT 0 + +/* whether ceil is found and available */ +#define OPAL_HAVE_CEIL 1 + +/* Enable features required for ConnectX XRC support */ +#define OPAL_HAVE_CONNECTX_XRC 0 + +/* whether crs_blcr is found and available */ +/* #undef OPAL_HAVE_CRS_BLCR */ + +/* whether dirname is found and available */ +#define OPAL_HAVE_DIRNAME 1 + +/* whether fbtl_posix is found and available */ +#define OPAL_HAVE_FBTL_POSIX 1 + +/* whether gethostbyname is found and available */ +#define OPAL_HAVE_GETHOSTBYNAME 1 + +/* Whether we have hwloc support or not */ +#define OPAL_HAVE_HWLOC 1 + +/* do we have Java support */ +#define OPAL_HAVE_JAVA_SUPPORT 1 + +/* Do not use outside of mpi.h. Define to 1 if the system has the type `long + long'. */ +#define OPAL_HAVE_LONG_LONG 1 + +/* Whether libltdl appears to have the lt_dladvise interface */ +#define OPAL_HAVE_LTDL_ADVISE 0 + +/* whether openpty is found and available */ +#define OPAL_HAVE_OPENPTY 1 + +/* Do we have POSIX threads */ +#define OPAL_HAVE_POSIX_THREADS 1 + +/* If PTHREADS implementation supports PTHREAD_MUTEX_ERRORCHECK */ +#define OPAL_HAVE_PTHREAD_MUTEX_ERRORCHECK 1 + +/* If PTHREADS implementation supports PTHREAD_MUTEX_ERRORCHECK_NP */ +#define OPAL_HAVE_PTHREAD_MUTEX_ERRORCHECK_NP 1 + +/* Whether RDMA CM is available or not */ +/* #undef OPAL_HAVE_RDMACM */ + +/* Enable RDMAoE support */ +/* #undef OPAL_HAVE_RDMAOE */ + +/* Whether we have SA_RESTART in or not */ +#define OPAL_HAVE_SA_RESTART 1 + +/* whether sched_yield is found and available */ +#define OPAL_HAVE_SCHED_YIELD 1 + +/* whether shmem_posix is found and available */ +#define OPAL_HAVE_SHMEM_POSIX 1 + +/* whether socket is found and available */ +#define OPAL_HAVE_SOCKET 1 + +/* Whether or not we have solaris */ +#define OPAL_HAVE_SOLARIS 0 + +/* Do not use 
outside of mpi.h. Define to 1 if you have the + header file. */ +/* #undef OPAL_HAVE_SYS_SYNCH_H */ + +/* Do not use outside of mpi.h. Define to 1 if you have the + header file. */ +#define OPAL_HAVE_SYS_TIME_H 1 + +/* Whether UD CM is available or not */ +/* #undef OPAL_HAVE_UDCM */ + +/* Whether we have __va_copy or not */ +#define OPAL_HAVE_UNDERSCORE_VA_COPY 1 + +/* Whether we have va_copy or not */ +#define OPAL_HAVE_VA_COPY 1 + +/* Whether we have weak symbols or not */ +#define OPAL_HAVE_WEAK_SYMBOLS 1 + +/* Whether our event component has working event operations or not (if not, + then assumedly it only has working timers and signals) */ +#define OPAL_HAVE_WORKING_EVENTOPS 1 + +/* whether yp_all_nsl is found and available */ +#define OPAL_HAVE_YP_ALL_NSL 1 + +/* Define to 1 ifyou have the declaration of _SC_NPROCESSORS_ONLN, and to 0 + otherwise */ +#define OPAL_HAVE__SC_NPROCESSORS_ONLN 1 + +/* Number of arguments to ibv_create_cq */ +/* #undef OPAL_IBV_CREATE_CQ_ARGS */ + +/* ident string for Open MPI */ +#define OPAL_IDENT_STRING "1.9.0a1" + +/* Whether we are using the internal libltdl or not */ +#define OPAL_LIBLTDL_INTERNAL 1 + +/* Major release number of Open Portable Access Layer */ +#define OPAL_MAJOR_VERSION 1 + +/* Maximum length of datarep strings (default is 128) */ +#define OPAL_MAX_DATAREP_STRING 128 + +/* Maximum length of error strings (default is 256) */ +#define OPAL_MAX_ERROR_STRING 256 + +/* Maximum length of info keys (default is 36) */ +#define OPAL_MAX_INFO_KEY 36 + +/* Maximum length of info vals (default is 256) */ +#define OPAL_MAX_INFO_VAL 256 + +/* Maximum length of object names (default is 64) */ +#define OPAL_MAX_OBJECT_NAME 64 + +/* Maximum length of port names (default is 1024) */ +#define OPAL_MAX_PORT_NAME 1024 + +/* Maximum length of processor names (default is 256) */ +#define OPAL_MAX_PROCESSOR_NAME 256 + +/* MCA cmd line identifier */ +#define OPAL_MCA_CMD_LINE_ID "mca" + +/* MCA prefix string for envars */ +#define 
OPAL_MCA_PREFIX "OMPI_MCA_" + +/* Whether any opal memory mca components were found */ +#define OPAL_MEMORY_HAVE_COMPONENT 1 + +/* Minor release number of Open Portable Access Layer */ +#define OPAL_MINOR_VERSION 9 + +/* Whether the C compiler supports "bool" without any other help (such as + ) */ +#define OPAL_NEED_C_BOOL 1 + +/* Add padding bytes to the openib BTL control header */ +#define OPAL_OPENIB_PAD_HDR 0 + +/* package/branding string for Open MPI */ +#define OPAL_PACKAGE_STRING "Open MPI wwu12@bunsen.icl.utk.edu Distribution" + +/* Log base 2 of the maximum size in bytes of a memory descriptor. Set to 0 if + MD can bind all of memory. */ +#define OPAL_PORTALS4_MAX_MD_SIZE 0 + +/* Log base 2 of the maximum size in bytes of the user virtual address space. + Set to 0 if MD can bind all of memory. */ +#define OPAL_PORTALS4_MAX_VA_SIZE 0 + +/* Whether r notation is used for ppc registers */ +/* #undef OPAL_POWERPC_R_REGISTERS */ + +/* type to use for ptrdiff_t */ +#define OPAL_PTRDIFF_TYPE ptrdiff_t + +/* Release date of Open Portable Access Layer */ +#define OPAL_RELEASE_DATE "Unreleased developer copy" + +/* Release release number of Open Portable Access Layer */ +#define OPAL_RELEASE_VERSION 0 + +/* The repository version Open Portable Access Layer */ +#define OPAL_REPO_REV "dev-267-g51b4521" + +/* Whether we have shared memory support for mmap or not */ +#define OPAL_SHMEM_MMAP 1 + +/* Whether we have shared memory support for POSIX or not */ +#define OPAL_SHMEM_POSIX 1 + +/* Whether we have shared memory support for SYSV or not */ +#define OPAL_SHMEM_SYSV 1 + +/* Do not use outside of mpi.h. Define to 1 if you have the ANSI C header + files. 
*/ +#define OPAL_STDC_HEADERS 1 + +/* Tarball filename version string of Open Portable Access Layer */ +#define OPAL_TARBALL_VERSION "gitclone" + +/* Whether to use or not */ +#define OPAL_USE_STDBOOL_H 1 + +/* Complete release number of Open Portable Access Layer */ +#define OPAL_VERSION "0" + +/* Enable per-user config files */ +#define OPAL_WANT_HOME_CONFIG_FILES 1 + +/* Whether to include support for libltdl or not */ +#define OPAL_WANT_LIBLTDL 1 + +/* if the memory and buffer checking should be enabled */ +#define OPAL_WANT_MEMCHECKER 0 + +/* if want pretty-print stack trace feature */ +#define OPAL_WANT_PRETTY_PRINT_STACKTRACE 1 + +/* whether we want to have smp locks in atomic ops or not */ +#define OPAL_WANT_SMP_LOCKS 1 + +/* Specific ps command to use in orte-clean */ +#define ORTE_CLEAN_PS_CMD "ps -A -o fname,pid,user" + +/* Whether we want static ports enabled */ +#define ORTE_ENABLE_STATIC_PORTS 1 + +/* Greek - alpha, beta, etc - release number of Open MPI Run-Time Environment + */ +#define ORTE_GREEK_VERSION "a1" + +/* Major release number of Open MPI Run-Time Environment */ +#define ORTE_MAJOR_VERSION 1 + +/* Minor release number of Open MPI Run-Time Environment */ +#define ORTE_MINOR_VERSION 9 + +/* Release date of Open MPI Run-Time Environment */ +#define ORTE_RELEASE_DATE "Unreleased developer copy" + +/* Release release number of Open MPI Run-Time Environment */ +#define ORTE_RELEASE_VERSION 0 + +/* The repository version Open MPI Run-Time Environment */ +#define ORTE_REPO_REV "dev-267-g51b4521" + +/* Tarball filename version string of Open MPI Run-Time Environment */ +#define ORTE_TARBALL_VERSION "gitclone" + +/* Complete release number of Open MPI Run-Time Environment */ +#define ORTE_VERSION "0" + +/* Whether we want orterun to effect "--prefix $prefix" by default */ +#define ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT 0 + +/* Greek - alpha, beta, etc - release number of Open SHMEM */ +#define OSHMEM_GREEK_VERSION "a1" + +/* mxm support is available */ 
+/* #undef OSHMEM_HAS_ATOMIC_MXM */ + +/* Major release number of Open SHMEM */ +#define OSHMEM_MAJOR_VERSION 1 + +/* Minor release number of Open SHMEM */ +#define OSHMEM_MINOR_VERSION 9 + +/* Whether we want to check OSHMEM parameters always or never */ +#define OSHMEM_PARAM_CHECK 1 + +/* Release date of Open SHMEM */ +#define OSHMEM_RELEASE_DATE "Unreleased developer copy" + +/* Release release number of Open SHMEM */ +#define OSHMEM_RELEASE_VERSION 0 + +/* The repository version Open SHMEM */ +#define OSHMEM_REPO_REV "dev-267-g51b4521" + +/* Whether user wants OSHMEM in compatibility mode or not */ +#define OSHMEM_SPEC_COMPAT 1 + +/* Whether we have shared memory support for mmap or not */ +#define OSHMEM_SSHMEM_MMAP 1 + +/* Whether we have shared memory support for SYSV or not */ +#define OSHMEM_SSHMEM_SYSV 1 + +/* Whether we have shared memory support for verbs or not */ +#define OSHMEM_SSHMEM_VERBS 0 + +/* Tarball filename version string of Open SHMEM */ +#define OSHMEM_TARBALL_VERSION "gitclone" + +/* Complete release number of Open SHMEM */ +#define OSHMEM_VERSION "0" + +/* do we want java oshmem bindings */ +#define OSHMEM_WANT_JAVA_BINDINGS 0 + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "http://www.open-mpi.org/community/help/" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "Open MPI" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "Open MPI gitclone" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "openmpi" + +/* Define to the home page for this package. */ +#define PACKAGE_URL "" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "gitclone" + +/* The size of `bool', as computed by sizeof. */ +#define SIZEOF_BOOL 1 + +/* The size of `char', as computed by sizeof. */ +#define SIZEOF_CHAR 1 + +/* The size of `double', as computed by sizeof. 
*/ +#define SIZEOF_DOUBLE 8 + +/* The size of `double _Complex', as computed by sizeof. */ +#define SIZEOF_DOUBLE__COMPLEX 16 + +/* The size of `float', as computed by sizeof. */ +#define SIZEOF_FLOAT 4 + +/* The size of `float _Complex', as computed by sizeof. */ +#define SIZEOF_FLOAT__COMPLEX 8 + +/* The size of `int', as computed by sizeof. */ +#define SIZEOF_INT 4 + +/* The size of `long', as computed by sizeof. */ +#define SIZEOF_LONG 8 + +/* The size of `long double', as computed by sizeof. */ +#define SIZEOF_LONG_DOUBLE 16 + +/* The size of `long double _Complex', as computed by sizeof. */ +#define SIZEOF_LONG_DOUBLE__COMPLEX 32 + +/* The size of `long long', as computed by sizeof. */ +#define SIZEOF_LONG_LONG 8 + +/* The size of `pid_t', as computed by sizeof. */ +#define SIZEOF_PID_T 4 + +/* The size of `ptrdiff_t', as computed by sizeof. */ +#define SIZEOF_PTRDIFF_T 8 + +/* The size of `short', as computed by sizeof. */ +#define SIZEOF_SHORT 2 + +/* The size of `size_t', as computed by sizeof. */ +#define SIZEOF_SIZE_T 8 + +/* The size of `ssize_t', as computed by sizeof. */ +#define SIZEOF_SSIZE_T 8 + +/* The size of `unsigned int', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_INT 4 + +/* The size of `unsigned long', as computed by sizeof. */ +#define SIZEOF_UNSIGNED_LONG 8 + +/* The size of `void *', as computed by sizeof. */ +#define SIZEOF_VOID_P 8 + +/* The size of `wchar_t', as computed by sizeof. */ +#define SIZEOF_WCHAR_T 4 + +/* The size of `_Bool', as computed by sizeof. */ +#define SIZEOF__BOOL 1 + +/* The size of `__float128', as computed by sizeof. */ +#define SIZEOF___FLOAT128 16 + +/* Define to 1 if you have the ANSI C header files. */ +#define STDC_HEADERS 1 + +/* Enable extensions on HP-UX. */ +#ifndef _HPUX_SOURCE +# define _HPUX_SOURCE 1 +#endif + + +/* Whether to use the legacy Solaris munmap prototype or not */ +/* #undef USE_SOLARIS_LEGACY_MUNMAP_PROTOTYPE */ + +/* Enable extensions on AIX 3, Interix. 
*/ +#ifndef _ALL_SOURCE +# define _ALL_SOURCE 1 +#endif +/* Enable GNU extensions on systems that have them. */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif +/* Enable threading extensions on Solaris. */ +#ifndef _POSIX_PTHREAD_SEMANTICS +# define _POSIX_PTHREAD_SEMANTICS 1 +#endif +/* Enable extensions on HP NonStop. */ +#ifndef _TANDEM_SOURCE +# define _TANDEM_SOURCE 1 +#endif +/* Enable general extensions on Solaris. */ +#ifndef __EXTENSIONS__ +# define __EXTENSIONS__ 1 +#endif + + +/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most + significant byte first (like Motorola and SPARC, unlike Intel). */ +#if defined AC_APPLE_UNIVERSAL_BUILD +# if defined __BIG_ENDIAN__ +# define WORDS_BIGENDIAN 1 +# endif +#else +# ifndef WORDS_BIGENDIAN +/* # undef WORDS_BIGENDIAN */ +# endif +#endif + +/* Additional CFLAGS to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_CFLAGS "-pthread " + +/* Additional CFLAGS_PREFIX to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_CFLAGS_PREFIX "" + +/* Additional CXXFLAGS to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_CXXFLAGS "-pthread " + +/* Additional CXXFLAGS_PREFIX to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_CXXFLAGS_PREFIX "" + +/* Additional FCFLAGS to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_FCFLAGS "-pthread -I${libdir}" + +/* Additional FCFLAGS to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_FCFLAGS_PREFIX "" + +/* Additional LDFLAGS to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_LDFLAGS " -Wl,-rpath -Wl,@{libdir} -Wl,--enable-new-dtags" + +/* Additional LIBS to pass through the wrapper compilers */ +#define WRAPPER_EXTRA_LIBS "-lm -ldl -lutil " + +/* Whether the wrapper compilers add rpath flags by default */ +#define WRAPPER_RPATH_SUPPORT "runpath" + +/* Define to 1 if the X Window System is missing or not being used. 
*/ +/* #undef X_DISPLAY_MISSING */ + +/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a + `char[]'. */ +#define YYTEXT_POINTER 1 + +/* Enable GNU extensions on systems that have them. */ +#ifndef _GNU_SOURCE +# define _GNU_SOURCE 1 +#endif + +/* Are we building for HP-UX? */ +#define _HPUX_SOURCE 1 + +/* Define to 1 if on MINIX. */ +/* #undef _MINIX */ + +/* Define to 2 if the system does not provide POSIX.1 features except with + this defined. */ +/* #undef _POSIX_1_SOURCE */ + +/* Define to 1 if you need to in order for `stat' and other things to work. */ +/* #undef _POSIX_SOURCE */ + +/* Define this to the process ID type */ +#define hwloc_pid_t pid_t + +/* Define this to the thread ID type */ +#define hwloc_thread_t pthread_t + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +#define inline __inline__ +#endif + +/* A bogus type that allows us to have sentinel type values that are still + valid */ +#define ompi_fortran_bogus_type_t int + +/* C type corresponding to Fortran CHARACTER */ +#define ompi_fortran_character_t char + +/* C type corresponding to Fortran COMPLEX*16 */ +/* #undef ompi_fortran_complex16_t */ + +/* C type corresponding to Fortran COMPLEX*32 */ +/* #undef ompi_fortran_complex32_t */ + +/* C type corresponding to Fortran COMPLEX*4 */ +/* #undef ompi_fortran_complex4_t */ + +/* C type corresponding to Fortran COMPLEX*8 */ +/* #undef ompi_fortran_complex8_t */ + +/* C type corresponding to Fortran COMPLEX */ +/* #undef ompi_fortran_complex_t */ + +/* C type corresponding to Fortran DOUBLE COMPLEX */ +/* #undef ompi_fortran_double_complex_t */ + +/* C type corresponding to Fortran DOUBLE PRECISION */ +#define ompi_fortran_double_precision_t double + +/* C type corresponding to Fortran INTEGER*16 */ +#define ompi_fortran_integer16_t + +/* C type corresponding to Fortran INTEGER*1 */ +#define 
ompi_fortran_integer1_t char + +/* C type corresponding to Fortran INTEGER*2 */ +#define ompi_fortran_integer2_t short + +/* C type corresponding to Fortran INTEGER*4 */ +#define ompi_fortran_integer4_t int + +/* C type corresponding to Fortran INTEGER*8 */ +#define ompi_fortran_integer8_t long long + +/* C type corresponding to Fortran INTEGER */ +#define ompi_fortran_integer_t int + +/* C type corresponding to Fortran LOGICAL*1 */ +#define ompi_fortran_logical1_t char + +/* C type corresponding to Fortran LOGICAL*2 */ +#define ompi_fortran_logical2_t short + +/* C type corresponding to Fortran LOGICAL*4 */ +#define ompi_fortran_logical4_t int + +/* C type corresponding to Fortran LOGICAL*8 */ +#define ompi_fortran_logical8_t long long + +/* C type corresponding to Fortran LOGICAL */ +#define ompi_fortran_logical_t int + +/* C type corresponding to Fortran REAL*16 */ +#define ompi_fortran_real16_t ompi_fortran_bogus_type_t + +/* C type corresponding to Fortran REAL*2 */ +#define ompi_fortran_real2_t ompi_fortran_bogus_type_t + +/* C type corresponding to Fortran REAL*4 */ +#define ompi_fortran_real4_t float + +/* C type corresponding to Fortran REAL*8 */ +#define ompi_fortran_real8_t double + +/* C type corresponding to Fortran REAL */ +#define ompi_fortran_real_t float + +/* Define to the equivalent of the C99 'restrict' keyword, or to + nothing if this is not supported. Do not define if restrict is + supported directly. */ +#define restrict __restrict +/* Work around a bug in Sun C++: it does not support _Restrict or + __restrict__, even though the corresponding Sun C compiler ends up with + "#define restrict _Restrict" or "#define restrict __restrict__" in the + previous line. Perhaps some future version of Sun C++ will work with + restrict; if so, hopefully it defines __RESTRICT like Sun C does. 
*/ +#if defined __SUNPRO_CC && !defined __RESTRICT +# define _Restrict +# define __restrict__ +#endif + +#endif /* OPAL_CONFIG_H */ + diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index ea1f3633480..105ba2bfeba 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -2,10 +2,56 @@ #include "opal_datatype_cuda.cuh" #include #include +#include + +/* + * NOTE: The order of this array *MUST* match what is listed in datatype.h + * (use of designated initializers should relax this restrictions some) + */ +OPAL_DECLSPEC const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PREDEFINED] = { + OPAL_DATATYPE_LOOP_SIZE, + OPAL_DATATYPE_END_LOOP_SIZE, + OPAL_DATATYPE_LB_SIZE, + OPAL_DATATYPE_UB_SIZE, + OPAL_DATATYPE_INT1_SIZE, + OPAL_DATATYPE_INT2_SIZE, + OPAL_DATATYPE_INT4_SIZE, + OPAL_DATATYPE_INT8_SIZE, + OPAL_DATATYPE_INT16_SIZE, /* Yes, double-machine word integers are available */ + OPAL_DATATYPE_UINT1_SIZE, + OPAL_DATATYPE_UINT2_SIZE, + OPAL_DATATYPE_UINT4_SIZE, + OPAL_DATATYPE_UINT8_SIZE, + OPAL_DATATYPE_UINT16_SIZE, /* Yes, double-machine word integers are available */ + OPAL_DATATYPE_FLOAT2_SIZE, + OPAL_DATATYPE_FLOAT4_SIZE, + OPAL_DATATYPE_FLOAT8_SIZE, + OPAL_DATATYPE_FLOAT12_SIZE, + OPAL_DATATYPE_FLOAT16_SIZE, + OPAL_DATATYPE_FLOAT_COMPLEX_SIZE, + OPAL_DATATYPE_DOUBLE_COMPLEX_SIZE, + OPAL_DATATYPE_LONG_DOUBLE_COMPLEX_SIZE, + OPAL_DATATYPE_BOOL_SIZE, + OPAL_DATATYPE_WCHAR_SIZE, + OPAL_DATATYPE_UNAVAILABLE_SIZE, +}; + +/***** my variables ********/ ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; unsigned char *pBaseBuf_GPU, *gpu_src_const, *gpu_dest_const; +unsigned char *ddt_cuda_pack_buffer, *ddt_cuda_unpack_buffer; ddt_cuda_stream_t* cuda_streams; +struct iovec cuda_iov[CUDA_NB_IOV]; +uint32_t cuda_iov_count; +ddt_cuda_description_dist_t description_dist_h[CUDA_MAX_NB_BLOCKS]; +ddt_cuda_description_dist_t* description_dist_d; +ddt_cuda_iov_dist_t 
cuda_iov_dist_h[NB_STREAMS][CUDA_MAX_NB_BLOCKS]; +ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; +dt_elem_desc_t* description_d; +uint8_t opal_datatype_cuda_debug; + +//uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; void opal_datatype_cuda_init(void) { @@ -18,26 +64,57 @@ void opal_datatype_cuda_init(void) cudaMallocHost((void **)&cuda_desc_h, sizeof(ddt_cuda_desc_t)); printf("size cuda_desc %d\n", sizeof(ddt_cuda_desc_t)); - printf("malloc iov\n"); - for (i = 0; i < IOV_ARRAY_SIZE; i++) { - void* iov_base; - cudaMalloc( (void **)&iov_base, sizeof(char)*IOV_LEN); - cuda_desc_h->iov[i].iov_base = iov_base; - cuda_desc_h->iov[i].iov_len = IOV_LEN; - } - cudaMalloc((void **)(&pBaseBuf_GPU), sizeof(char)*IOV_LEN); + // printf("malloc iov\n"); + // for (i = 0; i < IOV_ARRAY_SIZE; i++) { + // void* iov_base; + // cudaMalloc( (void **)&iov_base, sizeof(char)*IOV_LEN); + // cuda_desc_h->iov[i].iov_base = iov_base; + // cuda_desc_h->iov[i].iov_len = IOV_LEN; + // } + printf("malloc cuda packing buffer\n"); + cudaMalloc((void **)(&ddt_cuda_pack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); + cudaMemset(ddt_cuda_pack_buffer, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); + printf("malloc cuda unpacking buffer\n"); + cudaMalloc((void **)(&ddt_cuda_unpack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); + cudaMemset(ddt_cuda_unpack_buffer, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); + + cuda_desc_h->iov[0].iov_base = ddt_cuda_pack_buffer; + cuda_desc_h->iov[0].iov_len = DT_CUDA_BUFFER_SIZE; + + cudaMalloc((void **)(&pBaseBuf_GPU), sizeof(char)*DT_CUDA_BUFFER_SIZE); gpu_src_const = pBaseBuf_GPU; gpu_dest_const = (unsigned char*)cuda_desc_h->iov[0].iov_base; cuda_desc_h->description_max_count = 0; cuda_desc_h->description_count = 0; - cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); /* init cuda stream */ + cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); for (i = 0; i < NB_STREAMS; i++) { cudaStreamCreate(&(cuda_streams->opal_cuda_stream[i])); 
} cuda_streams->current_stream_id = 0; + + /* init cuda_iov */ + cuda_iov_count = CUDA_NB_IOV; + + /* init description dist array */ + cudaMalloc((void **)(&description_dist_d), sizeof(ddt_cuda_description_dist_t)*CUDA_MAX_NB_BLOCKS); + cuda_desc_h->description_dist = description_dist_d; + + /* only for iov version */ + for (i = 0; i < NB_STREAMS; i++) { + cudaMalloc((void **)(&cuda_iov_dist_d[i]), sizeof(ddt_cuda_iov_dist_t)*CUDA_MAX_NB_BLOCKS); + } + + opal_datatype_cuda_debug = 1; + + // /* init size for double, float, char */ + // ALIGNMENT_DOUBLE = sizeof(double); + // ALIGNMENT_FLOAT = sizeof(float); + // ALIGNMENT_CHAR = sizeof(char); + + } void opal_datatype_cuda_fini(void) @@ -52,6 +129,10 @@ void opal_datatype_cuda_fini(void) cudaFree(cuda_desc_h->description); cuda_desc_h->description = NULL; } + if (cuda_desc_h->description_dist != NULL) { + cudaFree(cuda_desc_h->description_dist); + cuda_desc_h->description_dist = NULL; + } printf("free iov\n"); if (cuda_desc_h != NULL) { for (i = 0; i < IOV_ARRAY_SIZE; i++) { @@ -68,6 +149,11 @@ void opal_datatype_cuda_fini(void) cudaStreamDestroy(cuda_streams->opal_cuda_stream[i]); } free(cuda_streams); + + /* only for iov version */ + for (i = 0; i < NB_STREAMS; i++) { + cudaFree(cuda_iov_dist_d[i]); + } } void opal_cuda_sync_device(void) @@ -75,4 +161,15 @@ void opal_cuda_sync_device(void) cudaDeviceSynchronize(); pBaseBuf_GPU = gpu_src_const; cuda_desc_h->iov[0].iov_base = (void*)gpu_dest_const; -} \ No newline at end of file +} + +void opal_cuda_output(int output_id, const char *format, ...) 
+{ + if (output_id >= 0 && output_id <= OPAL_DATATYPE_CUDA_DEBUG_LEVEL) { + va_list arglist; + fprintf( stderr, "[Debug %d]: ", output_id ); + va_start(arglist, format); + vfprintf(stderr, format, arglist); + va_end(arglist); + } +} diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 82ab78b2ff7..ebaad5a06fc 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -12,11 +12,21 @@ int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + +int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + +int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 84fbbe856a0..b510a2f5808 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -3,306 +3,48 @@ #include #include +#include -//#define OPAL_DATATYPE_CUDA_DRY_RUN -//#define OPAL_DATATYPE_CUDA_DEBUG +#include "opal_datatype_orig_internal.h" + + +/* OPAL_CUDA */ +// #define OPAL_DATATYPE_CUDA_DRY_RUN +#define OPAL_DATATYPE_CUDA_DEBUG //#define OPAL_DATATYPE_CUDA_KERNEL_TIME -#define OPAL_ENABLE_DEBUG 1 +#define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 0 +#define OPAL_DATATYPE_CUDA_IOV +#define OPAL_DATATYPE_CUDA_TIMING + -#define DT_STATIC_STACK_SIZE 5 /**< This should be sufficient for most applications */ -#define IOV_ARRAY_SIZE 10 -#define IOV_LEN 1024*1024*200 +#define 
IOV_ARRAY_SIZE 1 +#define DT_CUDA_BUFFER_SIZE 1024*1024*200 #define THREAD_PER_BLOCK 32 -#define TASK_PER_THREAD 1 +#define CUDA_WARP_SIZE 32 +#define TASK_PER_THREAD 2 #define OPAL_GPU_INDEX 0 #define NB_STREAMS 4 +#define CUDA_NB_IOV 4096 +#define CUDA_IOV_LEN 1024*1204 +#define CUDA_MAX_NB_BLOCKS 1024 +#define CUDA_IOV_MAX_TASK_PER_BLOCK 200 +#define ALIGNMENT_DOUBLE 8 +#define ALIGNMENT_FLOAT 4 +#define ALIGNMENT_CHAR 1 -#define OPAL_PTRDIFF_TYPE ptrdiff_t - -/* keep the last 16 bits free for data flags */ -#define CONVERTOR_DATATYPE_MASK 0x0000FFFF -#define CONVERTOR_SEND_CONVERSION 0x00010000 -#define CONVERTOR_RECV 0x00020000 -#define CONVERTOR_SEND 0x00040000 -#define CONVERTOR_HOMOGENEOUS 0x00080000 -#define CONVERTOR_NO_OP 0x00100000 -#define CONVERTOR_WITH_CHECKSUM 0x00200000 -#define CONVERTOR_CUDA 0x00400000 -#define CONVERTOR_CUDA_ASYNC 0x00800000 -#define CONVERTOR_TYPE_MASK 0x00FF0000 -#define CONVERTOR_STATE_START 0x01000000 -#define CONVERTOR_STATE_COMPLETE 0x02000000 -#define CONVERTOR_STATE_ALLOC 0x04000000 -#define CONVERTOR_COMPLETED 0x08000000 - -#define OPAL_DATATYPE_LOOP 0 -#define OPAL_DATATYPE_END_LOOP 1 -#define OPAL_DATATYPE_LB 2 -#define OPAL_DATATYPE_UB 3 -#define OPAL_DATATYPE_FIRST_TYPE 4 /* Number of first real type */ -#define OPAL_DATATYPE_INT1 4 -#define OPAL_DATATYPE_INT2 5 -#define OPAL_DATATYPE_INT4 6 -#define OPAL_DATATYPE_INT8 7 -#define OPAL_DATATYPE_INT16 8 -#define OPAL_DATATYPE_UINT1 9 -#define OPAL_DATATYPE_UINT2 10 -#define OPAL_DATATYPE_UINT4 11 -#define OPAL_DATATYPE_UINT8 12 -#define OPAL_DATATYPE_UINT16 13 -#define OPAL_DATATYPE_FLOAT2 14 -#define OPAL_DATATYPE_FLOAT4 15 -#define OPAL_DATATYPE_FLOAT8 16 -#define OPAL_DATATYPE_FLOAT12 17 -#define OPAL_DATATYPE_FLOAT16 18 -#define OPAL_DATATYPE_FLOAT_COMPLEX 19 -#define OPAL_DATATYPE_DOUBLE_COMPLEX 20 -#define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX 21 -#define OPAL_DATATYPE_BOOL 22 -#define OPAL_DATATYPE_WCHAR 23 -#define OPAL_DATATYPE_UNAVAILABLE 24 - -/* flags for the 
datatypes. */ -#define OPAL_DATATYPE_FLAG_UNAVAILABLE 0x0001 /**< datatypes unavailable on the build (OS or compiler dependant) */ -#define OPAL_DATATYPE_FLAG_PREDEFINED 0x0002 /**< cannot be removed: initial and predefined datatypes */ -#define OPAL_DATATYPE_FLAG_COMMITED 0x0004 /**< ready to be used for a send/recv operation */ -#define OPAL_DATATYPE_FLAG_OVERLAP 0x0008 /**< datatype is unpropper for a recv operation */ -#define OPAL_DATATYPE_FLAG_CONTIGUOUS 0x0010 /**< contiguous datatype */ -#define OPAL_DATATYPE_FLAG_NO_GAPS 0x0020 /**< no gaps around the datatype, aka OPAL_DATATYPE_FLAG_CONTIGUOUS and extent == size */ -#define OPAL_DATATYPE_FLAG_USER_LB 0x0040 /**< has a user defined LB */ -#define OPAL_DATATYPE_FLAG_USER_UB 0x0080 /**< has a user defined UB */ -#define OPAL_DATATYPE_FLAG_DATA 0x0100 /**< data or control structure */ -/* - * We should make the difference here between the predefined contiguous and non contiguous - * datatypes. The OPAL_DATATYPE_FLAG_BASIC is held by all predefined contiguous datatypes. - */ -#define OPAL_DATATYPE_FLAG_BASIC (OPAL_DATATYPE_FLAG_PREDEFINED | \ - OPAL_DATATYPE_FLAG_CONTIGUOUS | \ - OPAL_DATATYPE_FLAG_NO_GAPS | \ - OPAL_DATATYPE_FLAG_DATA | \ - OPAL_DATATYPE_FLAG_COMMITED) - -/* typedefs ***********************************************************/ - -typedef struct opal_object_t opal_object_t; -typedef struct opal_class_t opal_class_t; -typedef void (*opal_construct_t) (opal_object_t *); -typedef void (*opal_destruct_t) (opal_object_t *); - - -/* types **************************************************************/ - -/** -* Class descriptor. -* -* There should be a single instance of this descriptor for each class -* definition. 
-*/ -struct opal_class_t { - const char *cls_name; /**< symbolic name for class */ - opal_class_t *cls_parent; /**< parent class descriptor */ - opal_construct_t cls_construct; /**< class constructor */ - opal_destruct_t cls_destruct; /**< class destructor */ - int cls_initialized; /**< is class initialized */ - int cls_depth; /**< depth of class hierarchy tree */ - opal_construct_t *cls_construct_array; - /**< array of parent class constructors */ - opal_destruct_t *cls_destruct_array; - /**< array of parent class destructors */ - size_t cls_sizeof; /**< size of an object instance */ -}; - -/** - * Base object. - * - * This is special and does not follow the pattern for other classes. - */ -struct opal_object_t { -#if OPAL_ENABLE_DEBUG - /** Magic ID -- want this to be the very first item in the - struct's memory */ - uint64_t obj_magic_id; -#endif - opal_class_t *obj_class; /**< class descriptor */ - volatile int32_t obj_reference_count; /**< reference count */ -#if OPAL_ENABLE_DEBUG - const char* cls_init_file_name; /**< In debug mode store the file where the object get contructed */ - int cls_init_lineno; /**< In debug mode store the line number where the object get contructed */ -#endif /* OPAL_ENABLE_DEBUG */ -}; - - - -struct ddt_elem_id_description { - uint16_t flags; /**< flags for the record */ - uint16_t type; /**< the basic data type id */ -}; -typedef struct ddt_elem_id_description ddt_elem_id_description; - -/* the basic element. A data description is composed - * by a set of basic elements. 
- */ -struct ddt_elem_desc { - ddt_elem_id_description common; /**< basic data description and flags */ - uint32_t count; /**< number of blocks */ - uint32_t blocklen; /**< number of elements on each block */ - OPAL_PTRDIFF_TYPE extent; /**< extent of each block (in bytes) */ - OPAL_PTRDIFF_TYPE disp; /**< displacement of the first block */ -}; -typedef struct ddt_elem_desc ddt_elem_desc_t; - -struct ddt_loop_desc { - ddt_elem_id_description common; /**< basic data description and flags */ - uint32_t loops; /**< number of elements */ - uint32_t items; /**< number of items in the loop */ - size_t unused; /**< not used right now */ - OPAL_PTRDIFF_TYPE extent; /**< extent of the whole loop */ -}; -typedef struct ddt_loop_desc ddt_loop_desc_t; - -struct ddt_endloop_desc { - ddt_elem_id_description common; /**< basic data description and flags */ - uint32_t items; /**< number of elements */ - uint32_t unused; /**< not used right now */ - size_t size; /**< real size of the data in the loop */ - OPAL_PTRDIFF_TYPE first_elem_disp; /**< the displacement of the first block in the loop */ -}; -typedef struct ddt_endloop_desc ddt_endloop_desc_t; - -union dt_elem_desc { - ddt_elem_desc_t elem; - ddt_loop_desc_t loop; - ddt_endloop_desc_t end_loop; -}; -typedef union dt_elem_desc dt_elem_desc_t; - -/* dt_type_description */ -typedef uint32_t opal_datatype_count_t; - -struct dt_type_desc_t { - opal_datatype_count_t length; /**< the maximum number of elements in the description array */ - opal_datatype_count_t used; /**< the number of used elements in the description array */ - dt_elem_desc_t* desc; -}; -typedef struct dt_type_desc_t dt_type_desc_t; - -/* - * The datatype description. - */ -#define OPAL_DATATYPE_MAX_PREDEFINED 25 -#define OPAL_DATATYPE_MAX_SUPPORTED 47 -#define OPAL_MAX_OBJECT_NAME 64 - -struct opal_datatype_t { - opal_object_t super; /**< basic superclass */ - uint16_t flags; /**< the flags */ - uint16_t id; /**< data id, normally the index in the data array. 
*/ - uint32_t bdt_used; /**< bitset of which basic datatypes are used in the data description */ - size_t size; /**< total size in bytes of the memory used by the data if - the data is put on a contiguous buffer */ - OPAL_PTRDIFF_TYPE true_lb; /**< the true lb of the data without user defined lb and ub */ - OPAL_PTRDIFF_TYPE true_ub; /**< the true ub of the data without user defined lb and ub */ - OPAL_PTRDIFF_TYPE lb; /**< lower bound in memory */ - OPAL_PTRDIFF_TYPE ub; /**< upper bound in memory */ - /* --- cacheline 1 boundary (64 bytes) --- */ - size_t nbElems; /**< total number of elements inside the datatype */ - uint32_t align; /**< data should be aligned to */ - - /* Attribute fields */ - char name[OPAL_MAX_OBJECT_NAME]; /**< name of the datatype */ - /* --- cacheline 2 boundary (128 bytes) was 8-12 bytes ago --- */ - dt_type_desc_t desc; /**< the data description */ - dt_type_desc_t opt_desc; /**< short description of the data used when conversion is useless - or in the send case (without conversion) */ +#define TIMER_DATA_TYPE struct timeval +#define GET_TIME(TV) gettimeofday( &(TV), NULL ) +#define ELAPSED_TIME(TSTART, TEND) (((TEND).tv_sec - (TSTART).tv_sec) * 1000000 + ((TEND).tv_usec - (TSTART).tv_usec)) - uint32_t btypes[OPAL_DATATYPE_MAX_SUPPORTED]; - /**< basic elements count used to compute the size of the - datatype for remote nodes. The length of the array is dependent on - the maximum number of datatypes of all top layers. - Reason being is that Fortran is not at the OPAL layer. 
*/ - /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */ - /* size: 352, cachelines: 6, members: 15 */ - /* last cacheline: 28-32 bytes */ -}; -typedef struct opal_datatype_t opal_datatype_t; - -/* convertor and stack */ -typedef struct opal_convertor_t opal_convertor_t; - -typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); -typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata ); -typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n, opal_convertor_t* pConvertor ); - -/* The master convertor struct (defined in convertor_internal.h) */ -struct opal_convertor_master_t; - -struct dt_stack_t { - int32_t index; /**< index in the element description */ - int16_t type; /**< the type used for the last pack/unpack (original or OPAL_DATATYPE_UINT1) */ - size_t count; /**< number of times we still have to do it */ - OPAL_PTRDIFF_TYPE disp; /**< actual displacement depending on the count field */ -}; -typedef struct dt_stack_t dt_stack_t; - -typedef int32_t (*conversion_fct_t)( opal_convertor_t* pConvertor, uint32_t count, - const void* from, size_t from_len, OPAL_PTRDIFF_TYPE from_extent, - void* to, size_t to_length, OPAL_PTRDIFF_TYPE to_extent, - OPAL_PTRDIFF_TYPE *advance ); - -typedef struct opal_convertor_master_t { - struct opal_convertor_master_t* next; - uint32_t remote_arch; - uint32_t flags; - uint32_t hetero_mask; - const size_t remote_sizes[OPAL_DATATYPE_MAX_PREDEFINED]; - conversion_fct_t* pFunctions; /**< the convertor functions pointer */ -} opal_convertor_master_t; - -struct opal_convertor_t { - opal_object_t super; /**< basic superclass */ - uint32_t remoteArch; /**< the remote architecture */ - uint32_t flags; /**< the properties of this convertor */ - size_t local_size; /**< overall length data on local machine, compared to bConverted */ - size_t remote_size; /**< overall length data on remote machine, compared to bConverted */ - 
const opal_datatype_t* pDesc; /**< the datatype description associated with the convertor */ - const dt_type_desc_t* use_desc; /**< the version used by the convertor (normal or optimized) */ - opal_datatype_count_t count; /**< the total number of full datatype elements */ - uint32_t stack_size; /**< size of the allocated stack */ - /* --- cacheline 1 boundary (64 bytes) --- */ - unsigned char* pBaseBuf; /**< initial buffer as supplied by the user */ - dt_stack_t* pStack; /**< the local stack for the actual conversion */ - convertor_advance_fct_t fAdvance; /**< pointer to the pack/unpack functions */ - struct opal_convertor_master_t* master; /**< the master convertor */ - - /* All others fields get modified for every call to pack/unpack functions */ - uint32_t stack_pos; /**< the actual position on the stack */ - uint32_t partial_length; /**< amount of data left over from the last unpack */ - size_t bConverted; /**< # of bytes already converted */ - uint32_t checksum; /**< checksum computed by pack/unpack operation */ - uint32_t csum_ui1; /**< partial checksum computed by pack/unpack operation */ - size_t csum_ui2; /**< partial checksum computed by pack/unpack operation */ - /* --- cacheline 2 boundary (128 bytes) --- */ - dt_stack_t static_stack[DT_STATIC_STACK_SIZE]; /**< local stack for small datatypes */ - /* --- cacheline 3 boundary (192 bytes) was 56 bytes ago --- */ - -#if OPAL_CUDA_SUPPORT - memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ - void * stream; /**< CUstream for async copy */ -#endif - /* size: 248, cachelines: 4, members: 20 */ - /* last cacheline: 56 bytes */ -}; - -struct iovec { - void *iov_base; /* Starting address */ - size_t iov_len; /* Length in bytes */ -}; +typedef struct { + uint32_t description_index[200]; /* index of y direction */ + uint32_t description_local_index[200]; /* index of x direction */ + uint32_t dst_offset[200]; + uint32_t description_used; +} ddt_cuda_description_dist_t; typedef struct { dt_stack_t 
pStack[DT_STATIC_STACK_SIZE]; @@ -319,6 +61,7 @@ typedef struct { size_t max_data; uint32_t description_count; uint32_t description_max_count; + ddt_cuda_description_dist_t *description_dist; } ddt_cuda_desc_t; typedef struct { @@ -326,34 +69,30 @@ typedef struct { uint32_t current_stream_id; } ddt_cuda_stream_t; +typedef struct { + unsigned char* src[CUDA_IOV_MAX_TASK_PER_BLOCK]; + unsigned char* dst[CUDA_IOV_MAX_TASK_PER_BLOCK]; + uint32_t nb_elements[CUDA_IOV_MAX_TASK_PER_BLOCK]; + uint8_t element_alignment[CUDA_IOV_MAX_TASK_PER_BLOCK]; + uint32_t nb_tasks; +} ddt_cuda_iov_dist_t; + extern ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; extern unsigned char* pBaseBuf_GPU; +extern unsigned char *ddt_cuda_pack_buffer, *ddt_cuda_unpack_buffer; +extern size_t ddt_cuda_buffer_space; extern ddt_cuda_stream_t* cuda_streams; +extern struct iovec cuda_iov[CUDA_NB_IOV]; +extern uint32_t cuda_iov_count; +extern ddt_cuda_description_dist_t description_dist_h[CUDA_MAX_NB_BLOCKS]; +extern ddt_cuda_description_dist_t* description_dist_d; +extern ddt_cuda_iov_dist_t cuda_iov_dist_h[NB_STREAMS][CUDA_MAX_NB_BLOCKS]; +extern ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; +extern dt_elem_desc_t* description_d; +extern uint8_t opal_datatype_cuda_debug; -#define SAVE_STACK( PSTACK, INDEX, TYPE, COUNT, DISP) \ -do { \ - (PSTACK)->index = (INDEX); \ - (PSTACK)->type = (TYPE); \ - (PSTACK)->count = (COUNT); \ - (PSTACK)->disp = (DISP); \ -} while(0) +//extern uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; -#define PUSH_STACK( PSTACK, STACK_POS, INDEX, TYPE, COUNT, DISP) \ -do { \ - dt_stack_t* pTempStack = (PSTACK) + 1; \ - if (threadIdx.x == 0) { \ - SAVE_STACK( pTempStack, (INDEX), (TYPE), (COUNT), (DISP) ); \ - } \ - __syncthreads(); \ - (STACK_POS)++; \ - (PSTACK) = pTempStack; \ -} while(0) - -#define UPDATE_INTERNAL_COUNTERS( DESCRIPTION, POSITION, ELEMENT, COUNTER ) \ - do { \ - (ELEMENT) = &((DESCRIPTION)[(POSITION)]); \ - (COUNTER) = (ELEMENT)->elem.count; \ - } 
while (0) #if defined (OPAL_DATATYPE_CUDA_DEBUG) #define DBGPRINT(fmt, ...) printf(fmt, __VA_ARGS__) @@ -375,6 +114,8 @@ __device__ void unpack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc); +__global__ void opal_generic_simple_pack_cuda_kernel_v2(ddt_cuda_desc_t* cuda_desc); + __global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_desc); __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, @@ -388,10 +129,28 @@ __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, OPAL_PTRDIFF_TYPE extent, unsigned char* source, unsigned char* destination ); + +// __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_description_dist_t* desc_dist_d, dt_elem_desc_t* desc_d, uint32_t required_blocks, struct iovec* iov, unsigned char* pBaseBuf); + +__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist); + +__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist); + +void opal_cuda_output(int output_id, const char *format, ...); + +#if defined (OPAL_DATATYPE_CUDA_DEBUG) +#define DT_CUDA_DEBUG( INST ) if (opal_datatype_cuda_debug) { INST } +#else +#define DT_CUDA_DEBUG( INST ) +#endif extern "C" { int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor, size_t* position ); + +int32_t opal_convertor_raw( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* iov_count, + size_t* length ); } #endif /* OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_orig_internal.h b/opal/datatype/cuda/opal_datatype_orig_internal.h new file mode 100644 index 00000000000..fc30fc87741 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_orig_internal.h @@ -0,0 +1,646 @@ +#ifndef OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED +#define OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED + 
+#include + +#include "opal_config.h" + +/* original OMPI */ +#define OPAL_DECLSPEC + +#define OPAL_PTRDIFF_TYPE ptrdiff_t +#define DT_STATIC_STACK_SIZE 5 /**< This should be sufficient for most applications */ + +#if OPAL_ENABLE_DEBUG +/* Any kind of unique ID should do the job */ +#define OPAL_OBJ_MAGIC_ID ((0xdeafbeedULL << 32) + 0xdeafbeedULL) +#endif + +/* keep the last 16 bits free for data flags */ +#define CONVERTOR_DATATYPE_MASK 0x0000FFFF +#define CONVERTOR_SEND_CONVERSION 0x00010000 +#define CONVERTOR_RECV 0x00020000 +#define CONVERTOR_SEND 0x00040000 +#define CONVERTOR_HOMOGENEOUS 0x00080000 +#define CONVERTOR_NO_OP 0x00100000 +#define CONVERTOR_WITH_CHECKSUM 0x00200000 +#define CONVERTOR_CUDA 0x00400000 +#define CONVERTOR_CUDA_ASYNC 0x00800000 +#define CONVERTOR_TYPE_MASK 0x00FF0000 +#define CONVERTOR_STATE_START 0x01000000 +#define CONVERTOR_STATE_COMPLETE 0x02000000 +#define CONVERTOR_STATE_ALLOC 0x04000000 +#define CONVERTOR_COMPLETED 0x08000000 + +#define OPAL_DATATYPE_LOOP 0 +#define OPAL_DATATYPE_END_LOOP 1 +#define OPAL_DATATYPE_LB 2 +#define OPAL_DATATYPE_UB 3 +#define OPAL_DATATYPE_FIRST_TYPE 4 /* Number of first real type */ +#define OPAL_DATATYPE_INT1 4 +#define OPAL_DATATYPE_INT2 5 +#define OPAL_DATATYPE_INT4 6 +#define OPAL_DATATYPE_INT8 7 +#define OPAL_DATATYPE_INT16 8 +#define OPAL_DATATYPE_UINT1 9 +#define OPAL_DATATYPE_UINT2 10 +#define OPAL_DATATYPE_UINT4 11 +#define OPAL_DATATYPE_UINT8 12 +#define OPAL_DATATYPE_UINT16 13 +#define OPAL_DATATYPE_FLOAT2 14 +#define OPAL_DATATYPE_FLOAT4 15 +#define OPAL_DATATYPE_FLOAT8 16 +#define OPAL_DATATYPE_FLOAT12 17 +#define OPAL_DATATYPE_FLOAT16 18 +#define OPAL_DATATYPE_FLOAT_COMPLEX 19 +#define OPAL_DATATYPE_DOUBLE_COMPLEX 20 +#define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX 21 +#define OPAL_DATATYPE_BOOL 22 +#define OPAL_DATATYPE_WCHAR 23 +#define OPAL_DATATYPE_UNAVAILABLE 24 + +/* flags for the datatypes. 
*/ +#define OPAL_DATATYPE_FLAG_UNAVAILABLE 0x0001 /**< datatypes unavailable on the build (OS or compiler dependant) */ +#define OPAL_DATATYPE_FLAG_PREDEFINED 0x0002 /**< cannot be removed: initial and predefined datatypes */ +#define OPAL_DATATYPE_FLAG_COMMITED 0x0004 /**< ready to be used for a send/recv operation */ +#define OPAL_DATATYPE_FLAG_OVERLAP 0x0008 /**< datatype is unpropper for a recv operation */ +#define OPAL_DATATYPE_FLAG_CONTIGUOUS 0x0010 /**< contiguous datatype */ +#define OPAL_DATATYPE_FLAG_NO_GAPS 0x0020 /**< no gaps around the datatype, aka OPAL_DATATYPE_FLAG_CONTIGUOUS and extent == size */ +#define OPAL_DATATYPE_FLAG_USER_LB 0x0040 /**< has a user defined LB */ +#define OPAL_DATATYPE_FLAG_USER_UB 0x0080 /**< has a user defined UB */ +#define OPAL_DATATYPE_FLAG_DATA 0x0100 /**< data or control structure */ +/* + * We should make the difference here between the predefined contiguous and non contiguous + * datatypes. The OPAL_DATATYPE_FLAG_BASIC is held by all predefined contiguous datatypes. + */ +#define OPAL_DATATYPE_FLAG_BASIC (OPAL_DATATYPE_FLAG_PREDEFINED | \ + OPAL_DATATYPE_FLAG_CONTIGUOUS | \ + OPAL_DATATYPE_FLAG_NO_GAPS | \ + OPAL_DATATYPE_FLAG_DATA | \ + OPAL_DATATYPE_FLAG_COMMITED) + +/* typedefs ***********************************************************/ + +typedef struct opal_object_t opal_object_t; +typedef struct opal_class_t opal_class_t; +typedef void (*opal_construct_t) (opal_object_t *); +typedef void (*opal_destruct_t) (opal_object_t *); + + +/* types **************************************************************/ + +/** +* Class descriptor. +* +* There should be a single instance of this descriptor for each class +* definition. 
+*/ +struct opal_class_t { + const char *cls_name; /**< symbolic name for class */ + opal_class_t *cls_parent; /**< parent class descriptor */ + opal_construct_t cls_construct; /**< class constructor */ + opal_destruct_t cls_destruct; /**< class destructor */ + int cls_initialized; /**< is class initialized */ + int cls_depth; /**< depth of class hierarchy tree */ + opal_construct_t *cls_construct_array; + /**< array of parent class constructors */ + opal_destruct_t *cls_destruct_array; + /**< array of parent class destructors */ + size_t cls_sizeof; /**< size of an object instance */ +}; + +/** + * Base object. + * + * This is special and does not follow the pattern for other classes. + */ +struct opal_object_t { +#if OPAL_ENABLE_DEBUG + /** Magic ID -- want this to be the very first item in the + struct's memory */ + uint64_t obj_magic_id; +#endif + opal_class_t *obj_class; /**< class descriptor */ + volatile int32_t obj_reference_count; /**< reference count */ +#if OPAL_ENABLE_DEBUG + const char* cls_init_file_name; /**< In debug mode store the file where the object get contructed */ + int cls_init_lineno; /**< In debug mode store the line number where the object get contructed */ +#endif /* OPAL_ENABLE_DEBUG */ +}; + +/** + * Declaration for class descriptor + * + * @param NAME Name of class + * + * Put this in NAME.h + */ +#define OBJ_CLASS_DECLARATION(NAME) \ + extern opal_class_t NAME ## _class + +/** + * Return a pointer to the class descriptor associated with a + * class type. + * + * @param NAME Name of class + * @return Pointer to class descriptor + */ +#define OBJ_CLASS(NAME) (&(NAME ## _class)) + +/** + * For static initializations of OBJects. 
+ * + * @param NAME Name of the class to initialize + */ +#if OPAL_ENABLE_DEBUG +#define OPAL_OBJ_STATIC_INIT(BASE_CLASS) { OPAL_OBJ_MAGIC_ID, OBJ_CLASS(BASE_CLASS), 1, __FILE__, __LINE__ } +#else +#define OPAL_OBJ_STATIC_INIT(BASE_CLASS) { OBJ_CLASS(BASE_CLASS), 1 } +#endif + + + +struct ddt_elem_id_description { + uint16_t flags; /**< flags for the record */ + uint16_t type; /**< the basic data type id */ +}; +typedef struct ddt_elem_id_description ddt_elem_id_description; + +/* the basic element. A data description is composed + * by a set of basic elements. + */ +struct ddt_elem_desc { + ddt_elem_id_description common; /**< basic data description and flags */ + uint32_t count; /**< number of blocks */ + uint32_t blocklen; /**< number of elements on each block */ + OPAL_PTRDIFF_TYPE extent; /**< extent of each block (in bytes) */ + OPAL_PTRDIFF_TYPE disp; /**< displacement of the first block */ +}; +typedef struct ddt_elem_desc ddt_elem_desc_t; + +struct ddt_loop_desc { + ddt_elem_id_description common; /**< basic data description and flags */ + uint32_t loops; /**< number of elements */ + uint32_t items; /**< number of items in the loop */ + size_t unused; /**< not used right now */ + OPAL_PTRDIFF_TYPE extent; /**< extent of the whole loop */ +}; +typedef struct ddt_loop_desc ddt_loop_desc_t; + +struct ddt_endloop_desc { + ddt_elem_id_description common; /**< basic data description and flags */ + uint32_t items; /**< number of elements */ + uint32_t unused; /**< not used right now */ + size_t size; /**< real size of the data in the loop */ + OPAL_PTRDIFF_TYPE first_elem_disp; /**< the displacement of the first block in the loop */ +}; +typedef struct ddt_endloop_desc ddt_endloop_desc_t; + +union dt_elem_desc { + ddt_elem_desc_t elem; + ddt_loop_desc_t loop; + ddt_endloop_desc_t end_loop; +}; +typedef union dt_elem_desc dt_elem_desc_t; + +/* dt_type_description */ +typedef uint32_t opal_datatype_count_t; + +struct dt_type_desc_t { + opal_datatype_count_t length; 
/**< the maximum number of elements in the description array */ + opal_datatype_count_t used; /**< the number of used elements in the description array */ + dt_elem_desc_t* desc; +}; +typedef struct dt_type_desc_t dt_type_desc_t; + +/* + * The datatype description. + */ +#define OPAL_DATATYPE_MAX_PREDEFINED 25 +#define OPAL_DATATYPE_MAX_SUPPORTED 47 +#define OPAL_MAX_OBJECT_NAME 64 + +struct opal_datatype_t { + opal_object_t super; /**< basic superclass */ + uint16_t flags; /**< the flags */ + uint16_t id; /**< data id, normally the index in the data array. */ + uint32_t bdt_used; /**< bitset of which basic datatypes are used in the data description */ + size_t size; /**< total size in bytes of the memory used by the data if + the data is put on a contiguous buffer */ + OPAL_PTRDIFF_TYPE true_lb; /**< the true lb of the data without user defined lb and ub */ + OPAL_PTRDIFF_TYPE true_ub; /**< the true ub of the data without user defined lb and ub */ + OPAL_PTRDIFF_TYPE lb; /**< lower bound in memory */ + OPAL_PTRDIFF_TYPE ub; /**< upper bound in memory */ + /* --- cacheline 1 boundary (64 bytes) --- */ + size_t nbElems; /**< total number of elements inside the datatype */ + uint32_t align; /**< data should be aligned to */ + + /* Attribute fields */ + char name[OPAL_MAX_OBJECT_NAME]; /**< name of the datatype */ + /* --- cacheline 2 boundary (128 bytes) was 8-12 bytes ago --- */ + dt_type_desc_t desc; /**< the data description */ + dt_type_desc_t opt_desc; /**< short description of the data used when conversion is useless + or in the send case (without conversion) */ + + uint32_t btypes[OPAL_DATATYPE_MAX_SUPPORTED]; + /**< basic elements count used to compute the size of the + datatype for remote nodes. The length of the array is dependent on + the maximum number of datatypes of all top layers. + Reason being is that Fortran is not at the OPAL layer. 
*/ + /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */ + + /* size: 352, cachelines: 6, members: 15 */ + /* last cacheline: 28-32 bytes */ +}; + +typedef struct opal_datatype_t opal_datatype_t; + +OPAL_DECLSPEC OBJ_CLASS_DECLARATION( opal_datatype_t ); + +/* convertor and stack */ +typedef struct opal_convertor_t opal_convertor_t; + +typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); +typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata ); +typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n, opal_convertor_t* pConvertor ); + +/* The master convertor struct (defined in convertor_internal.h) */ +struct opal_convertor_master_t; + +struct dt_stack_t { + int32_t index; /**< index in the element description */ + int16_t type; /**< the type used for the last pack/unpack (original or OPAL_DATATYPE_UINT1) */ + size_t count; /**< number of times we still have to do it */ + OPAL_PTRDIFF_TYPE disp; /**< actual displacement depending on the count field */ +}; +typedef struct dt_stack_t dt_stack_t; + +typedef int32_t (*conversion_fct_t)( opal_convertor_t* pConvertor, uint32_t count, + const void* from, size_t from_len, OPAL_PTRDIFF_TYPE from_extent, + void* to, size_t to_length, OPAL_PTRDIFF_TYPE to_extent, + OPAL_PTRDIFF_TYPE *advance ); + +typedef struct opal_convertor_master_t { + struct opal_convertor_master_t* next; + uint32_t remote_arch; + uint32_t flags; + uint32_t hetero_mask; + const size_t remote_sizes[OPAL_DATATYPE_MAX_PREDEFINED]; + conversion_fct_t* pFunctions; /**< the convertor functions pointer */ +} opal_convertor_master_t; + +struct opal_convertor_t { + opal_object_t super; /**< basic superclass */ + uint32_t remoteArch; /**< the remote architecture */ + uint32_t flags; /**< the properties of this convertor */ + size_t local_size; /**< overall length data on local machine, compared to bConverted */ + size_t remote_size; /**< 
overall length data on remote machine, compared to bConverted */ + const opal_datatype_t* pDesc; /**< the datatype description associated with the convertor */ + const dt_type_desc_t* use_desc; /**< the version used by the convertor (normal or optimized) */ + opal_datatype_count_t count; /**< the total number of full datatype elements */ + uint32_t stack_size; /**< size of the allocated stack */ + /* --- cacheline 1 boundary (64 bytes) --- */ + unsigned char* pBaseBuf; /**< initial buffer as supplied by the user */ + dt_stack_t* pStack; /**< the local stack for the actual conversion */ + convertor_advance_fct_t fAdvance; /**< pointer to the pack/unpack functions */ + struct opal_convertor_master_t* master; /**< the master convertor */ + + /* All others fields get modified for every call to pack/unpack functions */ + uint32_t stack_pos; /**< the actual position on the stack */ + uint32_t partial_length; /**< amount of data left over from the last unpack */ + size_t bConverted; /**< # of bytes already converted */ + uint32_t checksum; /**< checksum computed by pack/unpack operation */ + uint32_t csum_ui1; /**< partial checksum computed by pack/unpack operation */ + size_t csum_ui2; /**< partial checksum computed by pack/unpack operation */ + /* --- cacheline 2 boundary (128 bytes) --- */ + dt_stack_t static_stack[DT_STATIC_STACK_SIZE]; /**< local stack for small datatypes */ + /* --- cacheline 3 boundary (192 bytes) was 56 bytes ago --- */ + +#if OPAL_CUDA_SUPPORT + memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ + void * stream; /**< CUstream for async copy */ +#endif + /* size: 248, cachelines: 4, members: 20 */ + /* last cacheline: 56 bytes */ +}; + +struct iovec { + void *iov_base; /* Starting address */ + size_t iov_len; /* Length in bytes */ +}; + + +OPAL_DECLSPEC extern union dt_elem_desc opal_datatype_predefined_elem_desc[2 * OPAL_DATATYPE_MAX_PREDEFINED]; + +#define OPAL_DATATYPE_INIT_BTYPES_ARRAY_UNAVAILABLE { 0 } +#define 
OPAL_DATATYPE_INIT_BTYPES_ARRAY(NAME) { [OPAL_DATATYPE_ ## NAME] = 1 } + +#define OPAL_DATATYPE_INIT_NAME(NAME) "OPAL_" #NAME + +/* + * Macro to initialize the main description for basic types, setting the pointer + * into the array opal_datatype_predefined_type_desc, which is initialized at + * runtime in opal_datatype_init(). Each basic type has two desc-elements.... + */ +#define OPAL_DATATYPE_INIT_DESC_PREDEFINED(NAME) \ + { \ + .length = 1, .used = 1, \ + .desc = &(opal_datatype_predefined_elem_desc[2 * OPAL_DATATYPE_ ## NAME]) \ + } +#define OPAL_DATATYPE_INIT_DESC_NULL {.length = 0, .used = 0, .desc = NULL} + +#define OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( NAME, FLAGS ) \ + { \ + .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ + .flags = OPAL_DATATYPE_FLAG_UNAVAILABLE | OPAL_DATATYPE_FLAG_PREDEFINED | (FLAGS), \ + .id = OPAL_DATATYPE_ ## NAME, \ + .bdt_used = 0, \ + .size = 0, \ + .true_lb = 0, .true_ub = 0, .lb = 0, .ub = 0, \ + .align = 0, \ + .nbElems = 1, \ + .name = OPAL_DATATYPE_INIT_NAME(NAME), \ + .desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(UNAVAILABLE), \ + .opt_desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(UNAVAILABLE), \ + .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY_UNAVAILABLE \ + } + +#define OPAL_DATATYPE_INITIALIZER_EMPTY( FLAGS ) \ + { \ + .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ + .flags = OPAL_DATATYPE_FLAG_PREDEFINED | (FLAGS), \ + .id = 0, \ + .bdt_used = 0, \ + .size = 0, \ + .true_lb = 0, .true_ub = 0, .lb = 0, .ub = 0, \ + .align = 0, \ + .nbElems = 1, \ + .name = OPAL_DATATYPE_INIT_NAME(EMPTY), \ + .desc = OPAL_DATATYPE_INIT_DESC_NULL, \ + .opt_desc = OPAL_DATATYPE_INIT_DESC_NULL, \ + .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY_UNAVAILABLE \ + } + +#define OPAL_DATATYPE_INIT_BASIC_TYPE( TYPE, NAME, FLAGS ) \ + { \ + .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ + .flags = OPAL_DATATYPE_FLAG_PREDEFINED | (FLAGS), \ + .id = TYPE, \ + .bdt_used = (((uint32_t)1)<<(TYPE)), \ + .size = 0, \ + .true_lb = 0, .true_ub = 0, .lb = 0, 
.ub = 0, \ + .align = 0, \ + .nbElems = 1, \ + .name = OPAL_DATATYPE_INIT_NAME(NAME), \ + .desc = OPAL_DATATYPE_INIT_DESC_NULL, \ + .opt_desc = OPAL_DATATYPE_INIT_DESC_NULL, \ + .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY(NAME) \ + } + +#define OPAL_DATATYPE_INIT_BASIC_DATATYPE( TYPE, ALIGN, NAME, FLAGS ) \ + { \ + .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ + .flags = OPAL_DATATYPE_FLAG_BASIC | (FLAGS), \ + .id = OPAL_DATATYPE_ ## NAME, \ + .bdt_used = (((uint32_t)1)<<(OPAL_DATATYPE_ ## NAME)), \ + .size = sizeof(TYPE), \ + .true_lb = 0, .true_ub = sizeof(TYPE), .lb = 0, .ub = sizeof(TYPE), \ + .align = (ALIGN), \ + .nbElems = 1, \ + .name = OPAL_DATATYPE_INIT_NAME(NAME), \ + .desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(NAME), \ + .opt_desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(NAME), \ + .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY(NAME) \ + } + +#define OPAL_DATATYPE_INITIALIZER_LOOP(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LOOP, LOOP, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_END_LOOP(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_END_LOOP, END_LOOP, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_LB(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LB, LB, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_UB(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_UB, UB, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_INT1(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int8_t, OPAL_ALIGNMENT_INT8, INT1, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_INT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int16_t, OPAL_ALIGNMENT_INT16, INT2, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_INT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int32_t, OPAL_ALIGNMENT_INT32, INT4, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_INT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int64_t, OPAL_ALIGNMENT_INT64, INT8, FLAGS ) +#ifdef HAVE_INT128_T +#define OPAL_DATATYPE_INITIALIZER_INT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int128_t, OPAL_ALIGNMENT_INT128, INT16, FLAGS ) +#else +#define 
OPAL_DATATYPE_INITIALIZER_INT16(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( INT16, FLAGS ) +#endif +#define OPAL_DATATYPE_INITIALIZER_UINT1(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint8_t, OPAL_ALIGNMENT_INT8, UINT1, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_UINT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint16_t, OPAL_ALIGNMENT_INT16, UINT2, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_UINT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint32_t, OPAL_ALIGNMENT_INT32, UINT4, FLAGS ) +#define OPAL_DATATYPE_INITIALIZER_UINT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint64_t, OPAL_ALIGNMENT_INT64, UINT8, FLAGS ) +#ifdef HAVE_UINT128_T +#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint128_t, OPAL_ALIGNMENT_INT128, UINT16, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( INT16, FLAGS ) +#endif + +#if SIZEOF_FLOAT == 2 +#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT2, FLAGS ) +#elif SIZEOF_DOUBLE == 2 +#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT2, FLAGS ) +#elif SIZEOF_LONG_DOUBLE == 2 +#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT2, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT2, FLAGS ) +#endif + +#if SIZEOF_FLOAT == 4 +#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT4, FLAGS ) +#elif SIZEOF_DOUBLE == 4 +#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT4, FLAGS ) +#elif SIZEOF_LONG_DOUBLE == 4 +#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT4, FLAGS ) 
+#else +#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT4, FLAGS ) +#endif + +#if SIZEOF_FLOAT == 8 +#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT8, FLAGS ) +#elif SIZEOF_DOUBLE == 8 +#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT8, FLAGS ) +#elif SIZEOF_LONG_DOUBLE == 8 +#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT8, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT8, FLAGS ) +#endif + +#if SIZEOF_FLOAT == 12 +#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT12, FLAGS ) +#elif SIZEOF_DOUBLE == 12 +#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT12, FLAGS ) +#elif SIZEOF_LONG_DOUBLE == 12 +#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT12, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT12, FLAGS ) +#endif + +#if SIZEOF_FLOAT == 16 +#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT16, FLAGS ) +#elif SIZEOF_DOUBLE == 16 +#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT16, FLAGS ) +#elif SIZEOF_LONG_DOUBLE == 16 +#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT16, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT16, FLAGS ) +#endif + +#if HAVE_FLOAT__COMPLEX 
+#define OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float _Complex, OPAL_ALIGNMENT_FLOAT_COMPLEX, FLOAT_COMPLEX, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT_COMPLEX, FLAGS) +#endif + +#if HAVE_DOUBLE__COMPLEX +#define OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double _Complex, OPAL_ALIGNMENT_DOUBLE_COMPLEX, DOUBLE_COMPLEX, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( DOUBLE_COMPLEX, FLAGS) +#endif + +#if HAVE_LONG_DOUBLE__COMPLEX +#define OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double _Complex, OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX, LONG_DOUBLE_COMPLEX, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( LONG_DOUBLE_COMPLEX, FLAGS) +#endif + +#define OPAL_DATATYPE_INITIALIZER_BOOL(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( _Bool, OPAL_ALIGNMENT_BOOL, BOOL, FLAGS ) + +#if OPAL_ALIGNMENT_WCHAR != 0 +#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( wchar_t, OPAL_ALIGNMENT_WCHAR, WCHAR, FLAGS ) +#else +#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( WCHAR, FLAGS ) +#endif + +#define SAVE_STACK( PSTACK, INDEX, TYPE, COUNT, DISP) \ +do { \ + (PSTACK)->index = (INDEX); \ + (PSTACK)->type = (TYPE); \ + (PSTACK)->count = (COUNT); \ + (PSTACK)->disp = (DISP); \ +} while(0) + +#define PUSH_STACK( PSTACK, STACK_POS, INDEX, TYPE, COUNT, DISP) \ +do { \ + dt_stack_t* pTempStack = (PSTACK) + 1; \ + if (threadIdx.x == 0) { \ + SAVE_STACK( pTempStack, (INDEX), (TYPE), (COUNT), (DISP) ); \ + } \ + __syncthreads(); \ + (STACK_POS)++; \ + (PSTACK) = pTempStack; \ +} while(0) + +#define UPDATE_INTERNAL_COUNTERS( DESCRIPTION, POSITION, ELEMENT, COUNTER ) \ + do { \ + 
(ELEMENT) = &((DESCRIPTION)[(POSITION)]); \ + (COUNTER) = (ELEMENT)->elem.count; \ + } while (0) + +OPAL_DECLSPEC extern const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PREDEFINED]; + +#define OPAL_DATATYPE_LOOP_SIZE 0 +#define OPAL_DATATYPE_END_LOOP_SIZE 0 +#define OPAL_DATATYPE_LB_SIZE 0 +#define OPAL_DATATYPE_UB_SIZE 0 +#define OPAL_DATATYPE_INT1_SIZE sizeof(int8_t) +#define OPAL_DATATYPE_INT2_SIZE sizeof(int16_t) +#define OPAL_DATATYPE_INT4_SIZE sizeof(int32_t) +#define OPAL_DATATYPE_INT8_SIZE sizeof(int64_t) +#ifdef HAVE_INT128_T +# define OPAL_DATATYPE_INT16_SIZE sizeof(int128_t) /* Yes, double-machine word integers are available */ +#else +# define OPAL_DATATYPE_INT16_SIZE 0 +#endif + +#define OPAL_DATATYPE_UINT1_SIZE sizeof(uint8_t) +#define OPAL_DATATYPE_UINT2_SIZE sizeof(uint16_t) +#define OPAL_DATATYPE_UINT4_SIZE sizeof(uint32_t) +#define OPAL_DATATYPE_UINT8_SIZE sizeof(uint64_t) +#ifdef HAVE_UINT128_T +# define OPAL_DATATYPE_UINT16_SIZE sizeof(uint128_t) /* Yes, double-machine word integers are available */ +#else +# define OPAL_DATATYPE_UINT16_SIZE 0 +#endif + +#if SIZEOF_FLOAT == 2 +# define OPAL_DATATYPE_FLOAT2_SIZE sizeof(float) +#elif SIZEOF_DOUBLE == 2 +# define OPAL_DATATYPE_FLOAT2_SIZE sizeof(double) +#elif SIZEOF_LONG_DOUBLE == 2 +# define OPAL_DATATYPE_FLOAT2_SIZE sizeof(long double) +#else +# define OPAL_DATATYPE_FLOAT2_SIZE 0 +#endif + +#if SIZEOF_FLOAT == 4 +# define OPAL_DATATYPE_FLOAT4_SIZE sizeof(float) +#elif SIZEOF_DOUBLE == 4 +# define OPAL_DATATYPE_FLOAT4_SIZE sizeof(double) +#elif SIZEOF_LONG_DOUBLE == 4 +# define OPAL_DATATYPE_FLOAT4_SIZE sizeof(long double) +#else +# define OPAL_DATATYPE_FLOAT4_SIZE 0 +#endif + +#if SIZEOF_FLOAT == 8 +# define OPAL_DATATYPE_FLOAT8_SIZE sizeof(float) +#elif SIZEOF_DOUBLE == 8 +# define OPAL_DATATYPE_FLOAT8_SIZE sizeof(double) +#elif SIZEOF_LONG_DOUBLE == 8 +# define OPAL_DATATYPE_FLOAT8_SIZE sizeof(long double) +#else +# define OPAL_DATATYPE_FLOAT8_SIZE 0 +#endif + +#if SIZEOF_FLOAT 
== 12 +# define OPAL_DATATYPE_FLOAT12_SIZE sizeof(float) +#elif SIZEOF_DOUBLE == 12 +# define OPAL_DATATYPE_FLOAT12_SIZE sizeof(double) +#elif SIZEOF_LONG_DOUBLE == 12 +# define OPAL_DATATYPE_FLOAT12_SIZE sizeof(long double) +#else +# define OPAL_DATATYPE_FLOAT12_SIZE 0 +#endif + +#if SIZEOF_FLOAT == 16 +# define OPAL_DATATYPE_FLOAT16_SIZE sizeof(float) +#elif SIZEOF_DOUBLE == 16 +# define OPAL_DATATYPE_FLOAT16_SIZE sizeof(double) +#elif SIZEOF_LONG_DOUBLE == 16 +# define OPAL_DATATYPE_FLOAT16_SIZE sizeof(long double) +#else +# define OPAL_DATATYPE_FLOAT16_SIZE 0 +#endif + +#if HAVE_FLOAT__COMPLEX +# define OPAL_DATATYPE_FLOAT_COMPLEX_SIZE sizeof(float _Complex) +#else +# define OPAL_DATATYPE_FLOAT_COMPLEX_SIZE 0 +#endif + +#if HAVE_DOUBLE__COMPLEX +# define OPAL_DATATYPE_DOUBLE_COMPLEX_SIZE sizeof(double _Complex) +#else +# define OPAL_DATATYPE_DOUBLE_COMPLEX_SIZE 0 +#endif + +#if HAVE_LONG_DOUBLE__COMPLEX +# define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX_SIZE sizeof(long double _Complex) +#else +# define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX_SIZE 0 +#endif + +#define OPAL_DATATYPE_BOOL_SIZE sizeof(_Bool) +#if OPAL_ALIGNMENT_WCHAR != 0 +# define OPAL_DATATYPE_WCHAR_SIZE sizeof(wchar_t) +#else +# define OPAL_DATATYPE_WCHAR_SIZE 0 +#endif + +#define OPAL_DATATYPE_UNAVAILABLE_SIZE 0 + +#endif /* OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index d56ebfe6954..98208dc0f39 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -1,4 +1,4 @@ -#include "opal_datatype_cuda_internal.cuh" + #include "opal_datatype_cuda_internal.cuh" #include #include @@ -87,7 +87,6 @@ __device__ void pack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, *(SPACE) -= _copy_loops * _end_loop->size; *(COUNT) -= _copy_loops; - __syncthreads(); } __device__ void 
pack_predefined_data_cuda_kernel( dt_elem_desc_t* ELEM, @@ -118,7 +117,6 @@ __device__ void pack_predefined_data_cuda_kernel( dt_elem_desc_t* ELEM, nb_elements = _copy_blength / 8; _src_disp_tmp = (double*)_src_disp; _destination_tmp = (double*)_destination; - _source_tmp = _src_disp_tmp + tid; _destination_tmp += tid; __syncthreads(); @@ -127,8 +125,8 @@ __device__ void pack_predefined_data_cuda_kernel( dt_elem_desc_t* ELEM, _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; #if defined (OPAL_DATATYPE_CUDA_DEBUG) if (_i == 0 ) { - DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", - tid, _destination_tmp, _source_tmp, (unsigned long)_copy_blength*_copy_count, (unsigned long)(*(SPACE) - _i/nb_elements * _copy_blength), _i/nb_elements, _i ); + DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, count %d\n", + tid, _destination_tmp, _source_tmp, (unsigned long)_copy_blength*_copy_count, (unsigned long)(*(SPACE) - _i/nb_elements * _copy_blength), _i/nb_elements, _copy_count ); } // if (_i / nb_elements ==1 && tid == 0 ) { // DBGPRINT("tid %d, pack 3. 
memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", @@ -148,12 +146,52 @@ __device__ void pack_predefined_data_cuda_kernel( dt_elem_desc_t* ELEM, *(SPACE) -= _copy_blength; *(COUNT) -= _copy_count; - __syncthreads(); +} + +__device__ void pack_predefined_data_cuda_kernel_v2( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char* SOURCE, + unsigned char* DESTINATION, + size_t* SPACE, + uint32_t local_index, + uint32_t dst_offset ) +{ + uint32_t _copy_count = *(COUNT); + size_t _copy_blength; + ddt_elem_desc_t* _elem = &((ELEM)->elem); + unsigned char* _src_disp = (SOURCE) + _elem->disp; + uint32_t local_tid; + unsigned char* _destination = DESTINATION; + double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; + + _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; + // if( (_copy_count * _copy_blength) > *(SPACE) ) { + // _copy_count = (uint32_t)(*(SPACE) / _copy_blength); + // if( 0 == _copy_count ) return; /* nothing to do */ + // } + + local_tid = threadIdx.x + local_index * blockDim.x; + _src_disp_tmp = (double*)_src_disp; + _destination_tmp = (double*)_destination + dst_offset; + + if (local_tid < _copy_count) { + _source_tmp = _src_disp_tmp + local_tid; + _destination_tmp += local_tid; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (local_tid == 0 ) { + DBGPRINT("tid %d, local_index %d, pack 1. memcpy( %p, %p, %lu ) => space %lu, blockIdx %d, count %d, destination %p, offset %d\n", + local_tid, local_index, _destination_tmp, _source_tmp, (unsigned long)_copy_blength*_copy_count, (unsigned long)(*(SPACE) - local_tid * _copy_blength), blockIdx.x, _copy_count, _destination, dst_offset ); + } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + *_destination_tmp = *_source_tmp; +#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ + } } __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) { - dt_stack_t *pStack, *pStack_head; /* pointer to the position on the stack */ + dt_stack_t *pStack; /* pointer to the position on the stack */ uint32_t pos_desc; /* actual position in the description of the derived datatype */ uint32_t count_desc; /* the number of items already done in the actual pos_desc */ size_t total_packed = 0; /* total amount packed this time */ @@ -165,30 +203,26 @@ __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) uint32_t stack_pos; struct iovec* iov; - OPAL_PTRDIFF_TYPE lb; - OPAL_PTRDIFF_TYPE ub; + OPAL_PTRDIFF_TYPE extent; uint32_t out_size; - uint32_t tid; - - tid = threadIdx.x + blockIdx.x * blockDim.x; - __shared__ ddt_cuda_desc_t cuda_desc_b; + // __shared__ ddt_cuda_desc_t cuda_desc_b; + __shared__ dt_stack_t shared_pStack[DT_STATIC_STACK_SIZE]; - if (threadIdx.x == 0) { - memcpy(&cuda_desc_b, cuda_desc, sizeof(ddt_cuda_desc_t)); + if (threadIdx.x < DT_STATIC_STACK_SIZE) { + shared_pStack[threadIdx.x] = cuda_desc->pStack[threadIdx.x]; } __syncthreads(); + // load cuda descriptor from constant memory - iov = cuda_desc_b.iov; - pStack_head = cuda_desc_b.pStack; - pStack = pStack_head; - description = cuda_desc_b.description; - stack_pos = cuda_desc_b.stack_pos; - pBaseBuf = cuda_desc_b.pBaseBuf; - lb = cuda_desc_b.lb; - ub = cuda_desc_b.ub; - out_size = cuda_desc_b.out_size; + iov = cuda_desc->iov; + pStack = shared_pStack; + description = cuda_desc->description; + stack_pos = cuda_desc->stack_pos; + pBaseBuf = cuda_desc->pBaseBuf; + extent = cuda_desc->ub - cuda_desc->lb; + out_size = cuda_desc->out_size; pStack = pStack + stack_pos; pos_desc = pStack->index; @@ -209,7 +243,7 @@ __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ // 
PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - // conv_ptr, iov_ptr, iov_len_local ); + // conv_ptr, iov_ptr, iov_len_local ); pack_predefined_data_cuda_kernel(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); if( 0 == count_desc ) { /* completed */ conv_ptr = pBaseBuf + pStack->disp; @@ -244,7 +278,7 @@ __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) pos_desc = pStack->index + 1; if (threadIdx.x == 0) { if( pStack->index == -1 ) { - pStack->disp += (ub - lb); + pStack->disp += extent; } else { // assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); pStack->disp += description[pStack->index].loop.extent; @@ -290,178 +324,207 @@ __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) total_packed += iov[iov_count].iov_len; } - if (tid == 0) { - cuda_desc->max_data = total_packed; - cuda_desc->out_size = iov_count; - // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ - // if( cuda_desc->bConverted == cuda_desc->local_size ) { - // cuda_desc->stack_pos = stack_pos; - // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); - // return; - // } - // /* Save the global position for the next round */ - // PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, - // conv_ptr - pBaseBuf ); - // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); - // cuda_desc->stack_pos = stack_pos; + // if (tid == 0) { + // cuda_desc->max_data = total_packed; + // cuda_desc->out_size = iov_count; + // // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ + // // if( cuda_desc->bConverted == cuda_desc->local_size ) { + // // cuda_desc->stack_pos = stack_pos; + // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); + // // return; + // // } + // // /* Save the global position for the next round */ + // // PUSH_STACK( pStack, 
stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, + // // conv_ptr - pBaseBuf ); + // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); + // // cuda_desc->stack_pos = stack_pos; + // } + + return; +} + +__global__ void opal_generic_simple_pack_cuda_kernel_v2(ddt_cuda_desc_t* cuda_desc) +{ + dt_stack_t *pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_packed = 0; /* total amount packed this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; + size_t iov_len_local; + uint32_t iov_count; + uint32_t stack_pos; + struct iovec* iov; + ddt_cuda_description_dist_t* description_dist_d; + uint32_t ct = 0, local_index, dst_offset; + + OPAL_PTRDIFF_TYPE extent; + uint32_t out_size; + + // __shared__ ddt_cuda_desc_t cuda_desc_b; + __shared__ dt_stack_t shared_pStack[DT_STATIC_STACK_SIZE]; + + if (threadIdx.x < DT_STATIC_STACK_SIZE) { + shared_pStack[threadIdx.x] = cuda_desc->pStack[threadIdx.x]; } __syncthreads(); + + // load cuda descriptor from constant memory + iov = cuda_desc->iov; + pStack = shared_pStack; + description = cuda_desc->description; + stack_pos = cuda_desc->stack_pos; + pBaseBuf = cuda_desc->pBaseBuf; + extent = cuda_desc->ub - cuda_desc->lb; + out_size = cuda_desc->out_size; + description_dist_d = cuda_desc->description_dist; + + pStack = pStack + stack_pos; + pos_desc = description_dist_d[blockIdx.x].description_index[ct]; + local_index = description_dist_d[blockIdx.x].description_local_index[ct]; + dst_offset = description_dist_d[blockIdx.x].dst_offset[ct]; + pElem = &(description[pos_desc]); + count_desc = pElem->elem.count; + conv_ptr = pBaseBuf + pStack->disp; + pStack--; + stack_pos--; + +// printf("pack start pos_desc %d count_desc %d disp %ld, stack_pos %d pos_desc %d 
count_desc %d disp %ld\n", +// pos_desc, count_desc, (long)(conv_ptr - pBaseBuf), stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp); + + for( iov_count = 0; iov_count < out_size; iov_count++ ) { + iov_ptr = (unsigned char *) iov[iov_count].iov_base; + iov_len_local = iov[iov_count].iov_len; +// DBGPRINT("iov_len_local %lu, flags %d, types %d, count %d\n", iov_len_local, description->elem.common.flags, description->elem.common.type, description->elem.count); + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + // conv_ptr, iov_ptr, iov_len_local ); + pack_predefined_data_cuda_kernel_v2(pElem, &count_desc, conv_ptr, iov_ptr, &iov_len_local, local_index, dst_offset); + count_desc = 0; + if( 0 == count_desc ) { /* completed */ + conv_ptr = pBaseBuf + pStack->disp; + ct ++; + if (ct >= description_dist_d[blockIdx.x].description_used) { + pos_desc = cuda_desc->description_count-1; + } else { + pos_desc = description_dist_d[blockIdx.x].description_index[ct]; /* advance to the next data */ + local_index = description_dist_d[blockIdx.x].description_local_index[ct]; + dst_offset = description_dist_d[blockIdx.x].dst_offset[ct]; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (pos_desc > (cuda_desc->description_count - 1)) { + printf("EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEERROR, block %d, thread %d, pos_desc %d\n", blockIdx.x, threadIdx.x, pos_desc); + } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (pos_desc < (cuda_desc->description_count - 1) && !(pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA)) { + printf("I get a error block %d, thread %d, pos_desc %d\n", blockIdx.x, threadIdx.x, pos_desc); + } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + continue; + } + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == 
pElem->elem.common.type ) { /* end of the current loop */ + // DO_DEBUG( opal_output( 0, "pack end_loop count %d stack_pos %d" + // " pos_desc %d disp %ld space %lu\n", + // (int)pStack->count, pConvertor->stack_pos, + // pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + if (threadIdx.x == 0) { + (pStack->count)--; + } + __syncthreads(); + + if( (pStack->count) == 0 ) { /* end of loop */ + if( 0 == stack_pos ) { + /* we lie about the size of the next element in order to + * make sure we exit the main loop. + */ + out_size = iov_count; + goto complete_loop; /* completed */ + } + stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if (threadIdx.x == 0) { + if( pStack->index == -1 ) { + pStack->disp += extent; + } else { + // assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + __syncthreads(); + } + conv_ptr = pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + // DO_DEBUG( opal_output( 0, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + // (int)pStack->count, pConvertor->stack_pos, pos_desc, + // count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + pack_contiguous_loop_cuda_kernel( pElem, &count_desc, + &conv_ptr, &iov_ptr, &iov_len_local ); + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. 
*/ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + + PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + + pos_desc++; + update_loop_description: /* update the current state */ + conv_ptr = pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); + continue; + } + } + complete_loop: + if (threadIdx.x == 0) { + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + } + __syncthreads(); + total_packed += iov[iov_count].iov_len; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + if (ct != description_dist_d[blockIdx.x].description_used) { + printf("I am at the end, but error,ct %d\n", ct); + } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + + // if (tid == 0) { + // cuda_desc->max_data = total_packed; + // cuda_desc->out_size = iov_count; + // // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ + // // if( cuda_desc->bConverted == cuda_desc->local_size ) { + // // cuda_desc->stack_pos = stack_pos; + // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); + // // return; + // // } + // // /* Save the global position for the next round */ + // // PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, + // // conv_ptr - pBaseBuf ); + // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); + // // cuda_desc->stack_pos = stack_pos; + // } + return; } -// __global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) -// { -// dt_stack_t *pStack, *pStack_head; /* pointer to the position on the stack */ -// uint32_t pos_desc; /* actual position in the description of the derived datatype */ -// uint32_t count_desc; /* the number of items already done in the actual pos_desc */ -// size_t total_packed = 0; /* total amount packed this time */ 
-// dt_elem_desc_t* description; -// dt_elem_desc_t* pElem; -// unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; -// size_t iov_len_local; -// uint32_t iov_count; -// uint32_t stack_pos; -// struct iovec* iov; -// -// OPAL_PTRDIFF_TYPE lb; -// OPAL_PTRDIFF_TYPE ub; -// uint32_t out_size; -// uint32_t tid; -// -// tid = threadIdx.x + blockIdx.x * blockDim.x; -// -// __shared__ ddt_cuda_desc_t cuda_desc_b; -// -// if (threadIdx.x == 0) { -// memcpy(&cuda_desc_b, cuda_desc, sizeof(ddt_cuda_desc_t)); -// } -// __syncthreads(); -// -// -// // load cuda descriptor from constant memory -// iov = cuda_desc_b.iov; -// pStack_head = cuda_desc_b.pStack; -// pStack = pStack_head; -// description = cuda_desc_b.description; -// stack_pos = cuda_desc_b.stack_pos; -// pBaseBuf = cuda_desc_b.pBaseBuf; -// lb = cuda_desc_b.lb; -// ub = cuda_desc_b.ub; -// out_size = cuda_desc_b.out_size; -// -// pStack = pStack + stack_pos; -// pos_desc = pStack->index; -// conv_ptr = pBaseBuf + pStack->disp; -// count_desc = (uint32_t)pStack->count; -// pStack--; -// stack_pos--; -// pElem = &(description[pos_desc]); -// -// // printf("pack start pos_desc %d count_desc %d disp %ld, stack_pos %d pos_desc %d count_desc %d disp %ld\n", -// // pos_desc, count_desc, (long)(conv_ptr - pBaseBuf), stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp); -// -// if (threadIdx.x == 0) { -// for( iov_count = 0; iov_count < out_size; iov_count++ ) { -// iov_ptr = (unsigned char *) iov[iov_count].iov_base; -// iov_len_local = iov[iov_count].iov_len; -// DBGPRINT("iov_len_local %lu, flags %d, types %d, count %d\n", iov_len_local, description->elem.common.flags, description->elem.common.type, description->elem.count); -// while( 1 ) { -// while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { -// /* now here we have a basic datatype */ -// // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, -// // conv_ptr, iov_ptr, iov_len_local ); -// if( 0 == count_desc ) { /* completed */ -// conv_ptr = 
pBaseBuf + pStack->disp; -// pos_desc++; /* advance to the next data */ -// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -// continue; -// } -// goto complete_loop; -// } -// if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ -// // DO_DEBUG( opal_output( 0, "pack end_loop count %d stack_pos %d" -// // " pos_desc %d disp %ld space %lu\n", -// // (int)pStack->count, pConvertor->stack_pos, -// // pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); -// -// if( --(pStack->count) == 0 ) { /* end of loop */ -// if( 0 == stack_pos ) { -// /* we lie about the size of the next element in order to -// * make sure we exit the main loop. -// */ -// out_size = iov_count; -// goto complete_loop; /* completed */ -// } -// stack_pos--; -// pStack--; -// pos_desc++; -// } else { -// pos_desc = pStack->index + 1; -// if( pStack->index == -1 ) { -// pStack->disp += (ub - lb); -// } else { -// // assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); -// pStack->disp += description[pStack->index].loop.extent; -// } -// -// } -// conv_ptr = pBaseBuf + pStack->disp; -// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -// // DO_DEBUG( opal_output( 0, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", -// // (int)pStack->count, pConvertor->stack_pos, pos_desc, -// // count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); -// } -// if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { -// OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; -// if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { -// // pack_contiguous_loop_cuda_kernel( pElem, &count_desc, -// // &conv_ptr, &iov_ptr, &iov_len_local ); -// count_desc = 0; -// if( 0 == count_desc ) { /* completed */ -// pos_desc += pElem->loop.items + 1; -// goto update_loop_description; -// } -// /* Save the stack with the correct last_count value. 
*/ -// } -// local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; -// -// PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, -// pStack->disp + local_disp); -// -// pos_desc++; -// update_loop_description: /* update the current state */ -// conv_ptr = pBaseBuf + pStack->disp; -// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -// // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); -// continue; -// } -// } -// complete_loop: -// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ -// total_packed += iov[iov_count].iov_len; -// } -// -// } -// __syncthreads(); -// if (tid == 0) { -// cuda_desc->max_data = total_packed; -// cuda_desc->out_size = iov_count; -// // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ -// // if( cuda_desc->bConverted == cuda_desc->local_size ) { -// // cuda_desc->stack_pos = stack_pos; -// // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); -// // return; -// // } -// // /* Save the global position for the next round */ -// // PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, -// // conv_ptr - pBaseBuf ); -// // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); -// // cuda_desc->stack_pos = stack_pos; -// } -// return; -// } - __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, @@ -479,7 +542,6 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, nb_elements = size / 8; _src_disp_tmp = (double*)source; _destination_tmp = (double*)destination; - _source_tmp = _src_disp_tmp + tid; _destination_tmp += tid; for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { @@ -499,4 +561,72 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, #endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ _destination_tmp += num_threads; } -} \ No newline at end of file +} + +// __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_description_dist_t* desc_dist_d, +// dt_elem_desc_t* desc_d, +// uint32_t required_blocks, struct iovec* iov, unsigned char* pBaseBuf) +// { +// uint32_t i; +// dt_elem_desc_t* pElem; +// unsigned char *conv_ptr, *iov_ptr; +// uint32_t local_index, dst_offset, pos_desc, count_desc; +// size_t iov_len_local; +// +// iov_ptr = (unsigned char *) iov[0].iov_base; +// iov_len_local = iov[0].iov_len; +// conv_ptr = pBaseBuf; +// for (i = 0; i < desc_dist_d[blockIdx.x].description_used; i++) { +// pos_desc = desc_dist_d[blockIdx.x].description_index[i]; +// local_index = desc_dist_d[blockIdx.x].description_local_index[i]; +// dst_offset = desc_dist_d[blockIdx.x].dst_offset[i]; +// pElem = &(desc_d[pos_desc]); +// count_desc = pElem->elem.count; +// +// // if ( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { +// pack_predefined_data_cuda_kernel_v2(pElem, &count_desc, conv_ptr, iov_ptr, &iov_len_local, local_index, dst_offset); +// // } +// } +// +// } + +__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist) +{ + uint32_t i, _copy_count; + unsigned char *src, *dst; + uint8_t alignment; + unsigned char *_source_tmp, *_destination_tmp; + + __shared__ uint32_t nb_tasks; + + if (threadIdx.x == 0) { + //printf("iov pack kernel \n"); + nb_tasks = cuda_iov_dist[blockIdx.x].nb_tasks; + } + __syncthreads(); + + for (i = 0; i < nb_tasks; i++) { + src = cuda_iov_dist[blockIdx.x].src[i]; + dst = cuda_iov_dist[blockIdx.x].dst[i]; + _copy_count = cuda_iov_dist[blockIdx.x].nb_elements[i]; + alignment = cuda_iov_dist[blockIdx.x].element_alignment[i]; + + // if (threadIdx.x == 0) { + // printf("block %d, ali %d, nb_element %d\n", blockIdx.x, cuda_iov_dist[blockIdx.x].element_alignment[i], _copy_count); + // } + + if (threadIdx.x < _copy_count) { + _source_tmp = src + 
threadIdx.x * alignment; + _destination_tmp = dst + threadIdx.x * alignment; +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + if (alignment == ALIGNMENT_DOUBLE) { + *((double *)_destination_tmp) = *((double *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((float *)_destination_tmp) = *((float *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } +#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + } + } +} diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 3b04bf025e8..f13610fc1bf 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -2,6 +2,7 @@ #include "opal_datatype_cuda.cuh" #include +#include int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, struct iovec* iov, @@ -10,10 +11,13 @@ int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, { uint32_t i; dt_elem_desc_t* description; + dt_elem_desc_t* pElem; const opal_datatype_t *pData = pConvertor->pDesc; - uint32_t tasks_per_block, num_blocks; + uint32_t tasks_per_block, num_blocks, thread_per_block; dt_stack_t* pStack; + //return -99; + description = pConvertor->use_desc->desc; cuda_desc_h->stack_pos = pConvertor->stack_pos; @@ -49,7 +53,8 @@ int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; cuda_desc_h->description_count = pConvertor->use_desc->used+1; } - cudaMemcpy(cuda_desc_h->description, description, sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1), cudaMemcpyHostToDevice); + cudaMemcpy(cuda_desc_h->description, description, sizeof(dt_elem_desc_t)*(cuda_desc_h->description_count), cudaMemcpyHostToDevice); + printf("description ct %d\n", cuda_desc_h->description_count); // for (i = 0; i < pConvertor->use_desc->used+1; i++) { // cuda_desc_h->description[i] = description[i]; @@ -66,19 +71,73 
@@ int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, cuda_desc_h->iov[i].iov_len = iov[i].iov_len; } - cudaMemcpy(cuda_desc_d, cuda_desc_h, sizeof(ddt_cuda_desc_t), cudaMemcpyHostToDevice); - pStack = pConvertor->pStack + pConvertor->stack_pos; - tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + thread_per_block = CUDA_WARP_SIZE * 5; + tasks_per_block = thread_per_block * TASK_PER_THREAD; num_blocks = ((uint32_t)pStack->count + tasks_per_block - 1) / tasks_per_block; - printf("launch kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*2*THREAD_PER_BLOCK); - opal_generic_simple_pack_cuda_kernel<<<192,4*THREAD_PER_BLOCK>>>(cuda_desc_d); + num_blocks = 512; + + /***/ + uint32_t pos_desc, count_desc, current_block, task_iteration, nb_blocks_per_description, j, dst_offset; + pos_desc = pStack->index; + pElem = &(description[pos_desc]); + count_desc = (uint32_t)pStack->count; + current_block = 0; + task_iteration = 0; + dst_offset = 0; + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + for (i = 0; i < nb_blocks_per_description; i++) { + description_dist_h[current_block].description_index[task_iteration] = pos_desc; + description_dist_h[current_block].description_local_index[task_iteration] = i; + description_dist_h[current_block].dst_offset[task_iteration] = dst_offset; + description_dist_h[current_block].description_used = task_iteration + 1; + if ( (i+1) * thread_per_block <= count_desc) { + dst_offset += thread_per_block; + } else { + dst_offset += thread_per_block - ((i+1)*thread_per_block - count_desc); + } + current_block += 1; + if (current_block >= num_blocks) { + current_block = 0; + task_iteration ++; + } + } + pos_desc ++; + pElem = &(description[pos_desc]); + count_desc = pElem->elem.count; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { + break; 
+ } + } + + // for (i = 0; i < num_blocks; i++) { + // printf("block %d\t, used %d\n", i, description_dist_h[i].description_used); + // for (j = 0; j < description_dist_h[i].description_used; j++) { + // pos_desc = description_dist_h[i].description_index[j]; + // pElem = &(description[pos_desc]); + // printf("i %d\t, descp_pos %d\t, local_index %d\t, count %d\t, dst offset %d\n", j, description_dist_h[i].description_index[j], description_dist_h[i].description_local_index[j], pElem->elem.count, description_dist_h[i].dst_offset[j]); + // } + // } + + cudaMemcpy(cuda_desc_h->description_dist, description_dist_h, sizeof(ddt_cuda_description_dist_t)*(num_blocks), cudaMemcpyHostToDevice); + /***/ + + cudaMemcpy(cuda_desc_d, cuda_desc_h, sizeof(ddt_cuda_desc_t), cudaMemcpyHostToDevice); + + printf("launch pack kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*thread_per_block); + opal_generic_simple_pack_cuda_kernel_v2<<>>(cuda_desc_d); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) size_t position = pConvertor->pDesc->size; - opal_convertor_set_position_nocheck(pConvertor, &position); +// opal_convertor_set_position_nocheck(pConvertor, &position); #endif cudaDeviceSynchronize(); + return 1; + + #if defined(OPAL_DATATYPE_CUDA_DRY_RUN) return -99; #else @@ -147,6 +206,346 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, } +// int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, +// struct iovec* iov, +// uint32_t* out_size, +// size_t* max_data ) +// { +// uint32_t i; +// uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, j, dst_offset; +// uint32_t nb_blocks, thread_per_block; +// dt_elem_desc_t* description; +// size_t length; +// +// // return -99; +// +// cuda_iov_count = 4000; +// opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); +// printf("iov count %d, length %d\n", cuda_iov_count, length); +// +// description = 
pConvertor->use_desc->desc; +// current_block = 0; +// task_iteration = 0; +// dst_offset = 0; +// thread_per_block = CUDA_WARP_SIZE * 4; +// nb_blocks = 512; +// for (i = 0; i < cuda_iov_count; i++) { +// count_desc = cuda_iov[i].iov_len / sizeof(double); +// // printf("i = %d\t, iov_base %p\t, iov_len %ld\t, count %d\n", i, cuda_iov[i].iov_base, cuda_iov[i].iov_len, count_desc); +// nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; +// for (j = 0; j < nb_blocks_per_description; j++) { +// description_dist_h[current_block].description_index[task_iteration] = i; +// description_dist_h[current_block].description_local_index[task_iteration] = j; +// description_dist_h[current_block].dst_offset[task_iteration] = dst_offset; +// description_dist_h[current_block].description_used = task_iteration + 1; +// if ( (j+1) * thread_per_block <= count_desc) { +// dst_offset += thread_per_block; +// } else { +// dst_offset += thread_per_block - ((j+1)*thread_per_block - count_desc); +// } +// current_block += 1; +// if (current_block >= nb_blocks) { +// current_block = 0; +// task_iteration ++; +// } +// } +// } +// +// uint32_t pos_desc; +// dt_elem_desc_t* pElem; +// // for (i = 0; i < nb_blocks; i++) { +// // printf("block %d\t, used %d\n", i, description_dist_h[i].description_used); +// // for (j = 0; j < description_dist_h[i].description_used; j++) { +// // pos_desc = description_dist_h[i].description_index[j]; +// // pElem = &(description[pos_desc]); +// // printf("i %d\t, descp_pos %d\t, local_index %d\t, count %d\t, dst offset %d\n", j, description_dist_h[i].description_index[j], description_dist_h[i].description_local_index[j], pElem->elem.count, description_dist_h[i].dst_offset[j]); +// // } +// // } +// +// cudaMemcpy(description_dist_d, description_dist_h, sizeof(ddt_cuda_description_dist_t)*(nb_blocks), cudaMemcpyHostToDevice); +// +// if (cuda_desc_h->description_max_count != 0) { +// if (cuda_desc_h->description_max_count >= 
(pConvertor->use_desc->used+1)) { +// cuda_desc_h->description_count = pConvertor->use_desc->used+1; +// } else { +// cudaFree(cuda_desc_h->description); +// cuda_desc_h->description = NULL; +// cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); +// description_d = cuda_desc_h->description; +// cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; +// cuda_desc_h->description_count = pConvertor->use_desc->used+1; +// } +// +// } else { +// cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); +// description_d = cuda_desc_h->description; +// cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; +// cuda_desc_h->description_count = pConvertor->use_desc->used+1; +// } +// cudaMemcpy(description_d, description, sizeof(dt_elem_desc_t)*(cuda_desc_h->description_count), cudaMemcpyHostToDevice); +// +// unsigned char* pBaseBuf; +// #if defined(OPAL_DATATYPE_CUDA_DRY_RUN) +// pBaseBuf = pConvertor->pBaseBuf; +// #else +// pBaseBuf = pBaseBuf_GPU; +// #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ +// +// for (i = 0; i < *out_size; i++) { +// #if defined (OPAL_DATATYPE_CUDA_DRY_RUN) +// cuda_desc_h->iov[i].iov_base = iov[i].iov_base; +// #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ +// cuda_desc_h->iov[i].iov_len = iov[i].iov_len; +// } +// +// opal_generic_simple_pack_cuda_iov_kernel<<>>(description_dist_d, description_d, current_block, cuda_desc_h->iov, pBaseBuf); +// cudaDeviceSynchronize(); +// +// return 1; +// } + +int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i, j; + uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, residue_desc; + uint32_t nb_blocks, thread_per_block; + size_t length, buffer_size, length_per_iovec, dst_offset; + unsigned char *destination; + size_t total_packed, total_converted; + int32_t 
complete_flag = 0; + uint8_t buffer_isfull = 0; + uint32_t convertor_flags; + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + dt_stack_t* pStack; + uint8_t alignment, orig_alignment; + + ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + + DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype packing using iovec\n"); ); + +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + destination = (unsigned char*)iov[0].iov_base; +#else +// pConvertor->pBaseBuf = pBaseBuf_GPU; + // printf("Pack GPU base %p, iov_buffer %p\n", pConvertor->pBaseBuf, iov[0].iov_base); + destination = ddt_cuda_pack_buffer; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pElem = &(description[pStack->index]); + printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypesSize[pElem->elem.common.type]); + + printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + buffer_size = iov[0].iov_len; + cuda_iov_count = 1000; + total_packed = 0; + total_converted = pConvertor->bConverted; + cuda_streams->current_stream_id = 0; + convertor_flags = pConvertor->flags; + complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); + DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); +#endif + + dst_offset = 0; + thread_per_block = CUDA_WARP_SIZE * 4; + 
nb_blocks = 256; + + while (cuda_iov_count > 0) { + // void* temp_addr; + // size_t temp_size; + // for (i = 1; i < cuda_iov_count/2; i+=2) { + // temp_addr = cuda_iov[i].iov_base; + // temp_size = cuda_iov[i].iov_len; + // cuda_iov[i].iov_base = cuda_iov[cuda_iov_count-i].iov_base; + // cuda_iov[i].iov_len = cuda_iov[cuda_iov_count-i].iov_len; + // cuda_iov[cuda_iov_count-i].iov_base = temp_addr; + // cuda_iov[cuda_iov_count-i].iov_len = temp_size; + // // printf("swap %d, %d, len %d %d\n", i, cuda_iov_count-i, cuda_iov[i].iov_len, cuda_iov[cuda_iov_count-i].iov_len); + // } + + current_block = 0; + task_iteration = 0; + cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; + cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + for (i = 0; i < nb_blocks; i++) { + cuda_iov_dist_h_current[i].nb_tasks = 0; + } + + for (i = 0; i < cuda_iov_count; i++) { + pElem = &(description[pStack->index+i]); + if (buffer_size >= cuda_iov[i].iov_len) { + length_per_iovec = cuda_iov[i].iov_len; + } else { + orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + } + buffer_size -= length_per_iovec; + total_packed += length_per_iovec; + + /* check alignment */ + if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)destination % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_FLOAT == 0 && (uintptr_t)destination % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + + // alignment = ALIGNMENT_CHAR; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(2, "description 
%d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[current_block].src[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; + cuda_iov_dist_h_current[current_block].dst[task_iteration] = destination; + cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = alignment; + cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = thread_per_block;// * sizeof(double); + } else { + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = count_desc - j*thread_per_block; //(thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); + } + destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; + DT_CUDA_DEBUG( opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + current_block += 1; + if (current_block >= nb_blocks) { + current_block = 0; + task_iteration ++; + assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + + /* handle residue */ + if (residue_desc != 0) { + orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + cuda_iov_dist_h_current[current_block].src[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[current_block].dst[task_iteration] = destination; + cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = 
orig_alignment; + cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; + DT_CUDA_DEBUG( opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + current_block += 1; + if (current_block >= nb_blocks) { + current_block = 0; + task_iteration ++; + assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + + if (buffer_isfull) { + break; + } + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: iov is prepared in %ld microsec, cudaMemcpy will be submit to CUDA stream %d\n", total_time, cuda_streams->current_stream_id); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); + + for (i = 0; i < *out_size; i++) { +#if defined (OPAL_DATATYPE_CUDA_DRY_RUN) + cuda_desc_h->iov[i].iov_base = iov[i].iov_base; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + cuda_desc_h->iov[i].iov_len = iov[i].iov_len; + } + + opal_generic_simple_pack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current); + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + + /* buffer is full */ + if (buffer_isfull) { + pConvertor->flags = convertor_flags; + total_converted += total_packed; + 
opal_convertor_set_position_nocheck(pConvertor, &total_converted); + break; + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + convertor_flags = pConvertor->flags; + complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); + DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); +#endif + } + + + cudaDeviceSynchronize(); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + cudaMemcpy(iov[0].iov_base, ddt_cuda_pack_buffer, total_packed, cudaMemcpyDeviceToHost); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: DtoH memcpy in %ld microsec\n", total_time ); +#endif + // float *vtmp = (float *)iov[0].iov_base; + // DT_CUDA_DEBUG ( opal_cuda_output(0, "packed iov buffer, total packed %d\n", total_packed); ); + // for (uint32_t i = 0; i < total_packed/sizeof(float); i++) { + // printf(" %1.f ", *vtmp); + // vtmp ++; + // } + // printf("\n"); + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + DT_CUDA_DEBUG ( opal_cuda_output(0, "total packed %d\n", total_packed); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + printf( "[Timing]: total packing in %ld microsec\n", total_time ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + return 1; + } + return 0; +} + + void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, @@ -157,7 +556,7 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, size_t _copy_blength; ddt_elem_desc_t* _elem = 
&((ELEM)->elem); unsigned char* _source = (*SOURCE) + _elem->disp; - uint32_t num_blocks, tasks_per_block; + uint32_t nb_blocks, tasks_per_block, thread_per_block; unsigned char* _destination = *(DESTINATION); _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; @@ -167,17 +566,26 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, } #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) - _source = pBaseBuf_GPU; + _source = pBaseBuf_GPU + _elem->disp; _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; #endif - tasks_per_block = THREAD_PER_BLOCK*4; - num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE) { + thread_per_block = CUDA_WARP_SIZE; + } else if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE * 2) { + thread_per_block = CUDA_WARP_SIZE * 2; + } else if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE * 3) { + thread_per_block = CUDA_WARP_SIZE * 3; + } else { + thread_per_block = CUDA_WARP_SIZE * 4; + } + tasks_per_block = thread_per_block * TASK_PER_THREAD; + nb_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - DBGPRINT("num_blocks %d, thread %d\n", num_blocks, tasks_per_block); + DBGPRINT("num_blocks %d, thread %d\n", nb_blocks, tasks_per_block); DBGPRINT( "GPU pack 1. 
memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); - pack_contiguous_loop_cuda_kernel_global<<<1, THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); + pack_contiguous_loop_cuda_kernel_global<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); cuda_streams->current_stream_id ++; cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; @@ -189,7 +597,6 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_count; #endif - pBaseBuf_GPU += _elem->extent*_copy_count; cuda_desc_h->iov[0].iov_base = (unsigned char*)cuda_desc_h->iov[0].iov_base + _copy_blength; // cudaDeviceSynchronize(); } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index f59b2bb0e00..0ae85e22eef 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -61,7 +61,7 @@ __device__ void unpack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, __global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) { - dt_stack_t* pStack, *pStack_head; /* pointer to the position on the stack */ + dt_stack_t* pStack; /* pointer to the position on the stack */ uint32_t pos_desc; /* actual position in the description of the derived datatype */ uint32_t count_desc; /* the number of items already done in the actual pos_desc */ size_t total_unpacked = 0; /* total size unpacked this time */ @@ -80,23 +80,23 @@ __global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_des tid = threadIdx.x + blockIdx.x * blockDim.x; - __shared__ ddt_cuda_desc_t cuda_desc_b; - - if (threadIdx.x == 0) { - memcpy(&cuda_desc_b, cuda_desc, sizeof(ddt_cuda_desc_t)); + // 
__shared__ ddt_cuda_desc_t cuda_desc_b; + __shared__ dt_stack_t shared_pStack[DT_STATIC_STACK_SIZE]; + + if (threadIdx.x < DT_STATIC_STACK_SIZE) { + shared_pStack[threadIdx.x] = cuda_desc->pStack[threadIdx.x]; } __syncthreads(); // load cuda descriptor from constant memory - iov = cuda_desc_b.iov; - pStack_head = cuda_desc_b.pStack; - pStack = pStack_head; - description = cuda_desc_b.description; - stack_pos = cuda_desc_b.stack_pos; - pBaseBuf = cuda_desc_b.pBaseBuf; - lb = cuda_desc_b.lb; - ub = cuda_desc_b.ub; - out_size = cuda_desc_b.out_size; + iov = cuda_desc->iov; + pStack = shared_pStack; + description = cuda_desc->description; + stack_pos = cuda_desc->stack_pos; + pBaseBuf = cuda_desc->pBaseBuf; + lb = cuda_desc->lb; + ub = cuda_desc->ub; + out_size = cuda_desc->out_size; /* For the first step we have to add both displacement to the source. After in the * main while loop we will set back the source_base to the correct value. This is @@ -248,6 +248,43 @@ __global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_des } } + +__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist) +{ + uint32_t i, _copy_count; + unsigned char *src, *dst; + uint8_t alignment; + unsigned char *_source_tmp, *_destination_tmp; + + __shared__ uint32_t nb_tasks; + + if (threadIdx.x == 0) { + nb_tasks = cuda_iov_dist[blockIdx.x].nb_tasks; + } + __syncthreads(); + + for (i = 0; i < nb_tasks; i++) { + src = cuda_iov_dist[blockIdx.x].src[i]; + dst = cuda_iov_dist[blockIdx.x].dst[i]; + _copy_count = cuda_iov_dist[blockIdx.x].nb_elements[i]; + alignment = cuda_iov_dist[blockIdx.x].element_alignment[i]; + + if (threadIdx.x < _copy_count) { + _source_tmp = src + threadIdx.x * alignment; + _destination_tmp = dst + threadIdx.x * alignment; +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + if (alignment == ALIGNMENT_DOUBLE) { + *((double *)_destination_tmp) = *((double *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((float 
*)_destination_tmp) = *((float *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } + // printf("src %p, %1.f | dst %p, %1.f\n", _source_tmp, *_source_tmp, _destination_tmp, *_destination_tmp); +#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + } + } +} __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, @@ -285,4 +322,4 @@ __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, #endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ _source_tmp += num_threads; } -} \ No newline at end of file +} diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 7181f3cd362..88a66de5f02 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -2,6 +2,7 @@ #include "opal_datatype_cuda.cuh" #include +#include int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, struct iovec* iov, @@ -11,9 +12,10 @@ int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, uint32_t i; dt_elem_desc_t* description; const opal_datatype_t *pData = pConvertor->pDesc; - uint32_t tasks_per_block, num_blocks; + uint32_t tasks_per_block, num_blocks, thread_per_block; dt_stack_t* pStack; + return -99; description = pConvertor->use_desc->desc; cuda_desc_h->stack_pos = pConvertor->stack_pos; @@ -33,9 +35,23 @@ int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, for (i = 0; i < pConvertor->stack_size; i++) { cuda_desc_h->pStack[i] = pConvertor->pStack[i]; } - for (i = 0; i < pConvertor->use_desc->used+1; i++) { - cuda_desc_h->description[i] = description[i]; + if (cuda_desc_h->description_max_count != 0) { + if (cuda_desc_h->description_max_count >= (pConvertor->use_desc->used+1)) { + cuda_desc_h->description_count = pConvertor->use_desc->used+1; + } else { + cudaFree(cuda_desc_h->description); + 
cuda_desc_h->description = NULL; + cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); + cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; + cuda_desc_h->description_count = pConvertor->use_desc->used+1; + } + + } else { + cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); + cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; + cuda_desc_h->description_count = pConvertor->use_desc->used+1; } + cudaMemcpy(cuda_desc_h->description, description, sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1), cudaMemcpyHostToDevice); DBGPRINT("stack_size %d\n", pConvertor->stack_size); @@ -51,10 +67,11 @@ int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, cudaMemcpy(cuda_desc_d, cuda_desc_h, sizeof(ddt_cuda_desc_t), cudaMemcpyHostToDevice); pStack = pConvertor->pStack + pConvertor->stack_pos; - tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + thread_per_block = CUDA_WARP_SIZE * 3; + tasks_per_block = thread_per_block * TASK_PER_THREAD; num_blocks = ((uint32_t)pStack->count + tasks_per_block - 1) / tasks_per_block; - printf("launch kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*4*THREAD_PER_BLOCK); - opal_generic_simple_unpack_cuda_kernel<<<2*num_blocks,2*THREAD_PER_BLOCK>>>(cuda_desc_d); + printf("launch unpack kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*thread_per_block); + opal_generic_simple_unpack_cuda_kernel<<<192, thread_per_block>>>(cuda_desc_d); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) size_t position = pConvertor->pDesc->size; opal_convertor_set_position_nocheck(pConvertor, &position); @@ -90,6 +107,227 @@ int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ } +int32_t opal_generic_simple_unpack_function_cuda_iov( 
opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i, j; + uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, dst_offset, residue_desc; + uint32_t nb_blocks, thread_per_block; + size_t length, buffer_size, length_per_iovec; + unsigned char *source; + size_t total_unpacked, total_converted; + int32_t complete_flag = 0; + uint8_t buffer_isfull = 0; + uint32_t convertor_flags; + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + dt_stack_t* pStack; + uint8_t alignment, orig_alignment; + + ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + + description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pElem = &(description[pStack->index]); + printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypesSize[pElem->elem.common.type]); + + DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype UNpacking using iovec\n"); ); + +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + source = (unsigned char*)iov[0].iov_base; +#else +// pConvertor->pBaseBuf = pBaseBuf_GPU; + // printf("Unpack GPU base %p, iov buffer %p\n", pConvertor->pBaseBuf, iov[0].iov_base); + source = ddt_cuda_unpack_buffer; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + + // double *vtmp = (double *)iov[0].iov_base; + printf("recevied unpacked iov buffer, len %d\n", iov[0].iov_len); + // for (uint32_t i = 0; i < iov[0].iov_len/sizeof(double); i++) { + // printf(" %1.f ", *vtmp); + // vtmp ++; + // } + // printf("\n"); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = 
ELAPSED_TIME( start, end ); + printf( "[Timing]: HtoD memcpy in %ld microsec\n", total_time ); +#endif + + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + buffer_size = iov[0].iov_len; + cuda_iov_count = 1000; + total_unpacked = 0; + total_converted = pConvertor->bConverted; + cuda_streams->current_stream_id = 0; + convertor_flags = pConvertor->flags; + complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); + DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); +#endif + + dst_offset = 0; + thread_per_block = CUDA_WARP_SIZE * 4; + nb_blocks = 256; + + while (cuda_iov_count > 0) { + + current_block = 0; + task_iteration = 0; + cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; + cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + for (i = 0; i < nb_blocks; i++) { + cuda_iov_dist_h_current[i].nb_tasks = 0; + } + + for (i = 0; i < cuda_iov_count; i++) { + if (buffer_size >= cuda_iov[i].iov_len) { + length_per_iovec = cuda_iov[i].iov_len; + } else { + orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + } + buffer_size -= length_per_iovec; + total_unpacked += length_per_iovec; + + /* check alignment */ + if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)source % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_FLOAT == 0 && (uintptr_t)source % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; 
+ } else { + alignment = ALIGNMENT_CHAR; + } + + // alignment = ALIGNMENT_CHAR; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(2, "description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[current_block].dst[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; + cuda_iov_dist_h_current[current_block].src[task_iteration] = source; + cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = alignment; + cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = thread_per_block;// * sizeof(double); + } else { + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); + } + source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; + DT_CUDA_DEBUG( opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + current_block += 1; + if (current_block >= nb_blocks) { + current_block = 0; + task_iteration ++; + assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + + /* handle residue */ + if (residue_desc != 0) { + orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + cuda_iov_dist_h_current[current_block].dst[task_iteration] = (unsigned char 
*)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment;
+            cuda_iov_dist_h_current[current_block].src[task_iteration] = source;
+            cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment;
+            cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1;
+            cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment;
+            source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment;
+            DT_CUDA_DEBUG( opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); );
+            current_block += 1;
+            if (current_block >= nb_blocks) {
+                current_block = 0;
+                task_iteration ++;
+                assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK);
+            }
+        }
+
+        if (buffer_isfull) {
+            break;
+        }
+    }
+
+#if defined(OPAL_DATATYPE_CUDA_TIMING)
+    GET_TIME( end );
+    total_time = ELAPSED_TIME( start, end );
+    printf( "[Timing]: iov is prepared in %ld microsec, cudaMemcpy will be submit to CUDA stream %d\n", total_time, cuda_streams->current_stream_id);
+#endif
+
+    cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]);
+    opal_generic_simple_unpack_cuda_iov_kernel<<<nb_blocks, thread_per_block, 0, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current);
+    cuda_streams->current_stream_id ++;
+    cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS;
+
+    /* buffer is full */
+    if (buffer_isfull) {
+        pConvertor->flags = convertor_flags;
+        total_converted += total_unpacked;
+        
opal_convertor_set_position_nocheck(pConvertor, &total_converted); + break; + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + convertor_flags = pConvertor->flags; + complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); + DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); +#endif + + } + cudaDeviceSynchronize(); + + iov[0].iov_len = total_unpacked; + *max_data = total_unpacked; + *out_size = 1; + + DT_CUDA_DEBUG ( opal_cuda_output(0, "total unpacked %d\n", total_unpacked); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + printf( "[Timing]: total unpacking in %ld microsec\n", total_time ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + return 1; + } + return 0; +} + void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, @@ -120,4 +358,4 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; cudaDeviceSynchronize(); -} \ No newline at end of file +} diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index d5481283183..d4f0cebb722 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -39,6 +39,7 @@ #include "opal/datatype/opal_convertor_internal.h" #if OPAL_CUDA_SUPPORT #include "opal/datatype/opal_datatype_cuda.h" +#include "opal/datatype/opal_datatype_gpu.h" #define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \ CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) #endif @@ -561,6 +562,11 @@ int32_t opal_convertor_prepare_for_recv( 
opal_convertor_t* convertor, convertor->flags |= CONVERTOR_RECV; #if OPAL_CUDA_SUPPORT mca_cuda_convertor_init(convertor, pUserBuf); +#if defined (OPAL_DATATYPE_CUDA) + if (opal_datatype_gpu_init() != OPAL_SUCCESS) { + opal_datatype_gpu_fini(); + } +#endif /* defined OPAL_DATATYPE_CUDA */ #endif OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -585,7 +591,11 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { convertor->fAdvance = opal_unpack_homogeneous_contig; } else { - convertor->fAdvance = opal_generic_simple_unpack; + if (convertor->flags & CONVERTOR_CUDA ) { + convertor->fAdvance = opal_generic_simple_unpack_cuda; + } else { + convertor->fAdvance = opal_generic_simple_unpack; + } } } return OPAL_SUCCESS; @@ -600,6 +610,11 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, convertor->flags |= CONVERTOR_SEND; #if OPAL_CUDA_SUPPORT mca_cuda_convertor_init(convertor, pUserBuf); +#if defined (OPAL_DATATYPE_CUDA) + if (opal_datatype_gpu_init() != OPAL_SUCCESS) { + opal_datatype_gpu_fini(); + } +#endif /* defined OPAL_DATATYPE_CUDA */ #endif OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -622,7 +637,11 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, else convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps; } else { - convertor->fAdvance = opal_generic_simple_pack; + if (convertor->flags & CONVERTOR_CUDA ) { + convertor->fAdvance = opal_generic_simple_pack_cuda; + } else { + convertor->fAdvance = opal_generic_simple_pack; + } } } return OPAL_SUCCESS; diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 71b60e60801..caaab68208d 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -180,6 +180,7 @@ static void opal_cuda_support_init(void) } initialized = true; + } /** diff --git a/opal/datatype/opal_datatype_gpu.c 
b/opal/datatype/opal_datatype_gpu.c index e77a4f77325..787e86e4f4c 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -52,6 +52,16 @@ int32_t (*opal_generic_simple_unpack_function_cuda_p)( opal_convertor_t* pConver struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; + +int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) = NULL; + +int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) = NULL; void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -114,6 +124,20 @@ int32_t opal_datatype_gpu_init(void) return OPAL_ERROR; } + *(void **)(&opal_generic_simple_pack_function_cuda_iov_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_pack_function_cuda_iov"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_generic_simple_pack_function_cuda_iov error: %s\n", error); + opal_generic_simple_pack_function_cuda_iov_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_generic_simple_unpack_function_cuda_iov_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_unpack_function_cuda_iov"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_generic_simple_unpack_function_cuda_iov error: %s\n", error); + opal_generic_simple_unpack_function_cuda_iov_p = NULL; + return OPAL_ERROR; + } + *(void **)(&pack_contiguous_loop_cuda_p) = dlsym(opal_datatype_cuda_handle, "pack_contiguous_loop_cuda"); if ((error = dlerror()) != NULL) { fprintf(stderr, "pack_contiguous_loop_cuda error: %s\n", error); @@ -157,6 +181,8 @@ int32_t opal_datatype_gpu_fini(void) opal_datatype_cuda_fini_p = NULL; opal_generic_simple_pack_function_cuda_p = NULL; opal_generic_simple_unpack_function_cuda_p = NULL; + opal_generic_simple_pack_function_cuda_iov_p = NULL; + opal_generic_simple_unpack_function_cuda_iov_p 
= NULL; pack_contiguous_loop_cuda_p = NULL; unpack_contiguous_loop_cuda_p = NULL; pack_predefined_data_cuda_p = NULL; diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h index 385d7cdb73c..b8dc828a0df 100644 --- a/opal/datatype/opal_datatype_gpu.h +++ b/opal/datatype/opal_datatype_gpu.h @@ -1,6 +1,8 @@ #ifndef OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED #define OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED +#define OPAL_DATATYPE_CUDA_IOV + int32_t opal_datatype_gpu_init(void); int32_t opal_datatype_gpu_fini(void); @@ -18,6 +20,16 @@ extern int32_t (*opal_generic_simple_unpack_function_cuda_p)( opal_convertor_t* uint32_t* out_size, size_t* max_data ); +extern int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +extern int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + extern void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, @@ -25,10 +37,10 @@ extern void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, size_t* SPACE ); extern void (*unpack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ); + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); extern void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 520105d8de9..307eb001085 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -226,12 +226,6 @@ int32_t opal_datatype_init( void ) datatype->desc.desc[1].end_loop.first_elem_disp = datatype->desc.desc[0].elem.disp; datatype->desc.desc[1].end_loop.size = datatype->size; } - -#if defined (OPAL_DATATYPE_CUDA) 
- if (opal_datatype_gpu_init() != OPAL_SUCCESS) { - opal_datatype_gpu_fini(); - } -#endif /* defined OPAL_DATATYPE_CUDA */ return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 9dc0666eb4e..dbfc1cec12d 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -43,10 +43,12 @@ #define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig_checksum #define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps_checksum #define opal_generic_simple_pack_function opal_generic_simple_pack_checksum +#define opal_generic_simple_pack_cuda_function opal_generic_simple_pack_cuda_checksum #else #define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig #define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps #define opal_generic_simple_pack_function opal_generic_simple_pack +#define opal_generic_simple_pack_cuda_function opal_generic_simple_pack_cuda #endif /* defined(CHECKSUM) */ @@ -288,13 +290,7 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, (void*)pConvertor, (void*)pConvertor->pBaseBuf, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); - if (opal_generic_simple_pack_function_cuda_p != NULL) { - int32_t rvalue = (*opal_generic_simple_pack_function_cuda_p)( pConvertor, iov, out_size, max_data); - if (rvalue != -99) { /* -99 is DRY RUN, to verify the result with CPU packing*/ - return rvalue; - } - } - + printf("I am in simple pack, max_data %lu, iov_len %lu\n", *max_data, iov[0].iov_len); description = pConvertor->use_desc->desc; /* For the first step we have to add both displacement to the source. 
After in the @@ -320,9 +316,9 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ - (*pack_predefined_data_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); - // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - // conv_ptr, iov_ptr, iov_len_local ); +// (*pack_predefined_data_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, + conv_ptr, iov_ptr, iov_len_local ); if( 0 == count_desc ) { /* completed */ conv_ptr = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ @@ -365,9 +361,9 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - (*pack_contiguous_loop_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); - //PACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, - // conv_ptr, iov_ptr, iov_len_local ); + //(*pack_contiguous_loop_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + PACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, + conv_ptr, iov_ptr, iov_len_local ); if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -389,12 +385,18 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_packed += iov[iov_count].iov_len; } - (*opal_cuda_sync_device_p)(); *max_data = total_packed; pConvertor->bConverted += total_packed; /* update the already converted bytes */ *out_size = iov_count; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; + printf("total packed %lu\n", pConvertor->bConverted); + 
// double *vtmp = (double *)iov[0].iov_base; + // for (uint32_t i = 0; i < total_packed/8; i++) { + // printf(" %1.f ", *vtmp); + // vtmp ++; + // } + // printf("\n"); return 1; } /* Save the global position for the next round */ @@ -404,3 +406,17 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } + +int32_t +opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ) +{ +#if defined (OPAL_DATATYPE_CUDA_IOV) + if (opal_generic_simple_pack_function_cuda_iov_p != NULL) { + return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); + + } +#endif + return 0; +} diff --git a/opal/datatype/opal_datatype_pack.h b/opal/datatype/opal_datatype_pack.h index b011f434472..c02ecf86ec5 100644 --- a/opal/datatype/opal_datatype_pack.h +++ b/opal/datatype/opal_datatype_pack.h @@ -51,8 +51,6 @@ static inline void pack_predefined_data( opal_convertor_t* CONVERTOR, (CONVERTOR)->pDesc, (CONVERTOR)->count ); DO_DEBUG( opal_output( 0, "pack 1. memcpy( %p, %p, %lu ) => space %lu\n", *(DESTINATION), _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); ); - printf("pack 1. 
memcpy( %p, %p, %lu ) => space %lu\n", - *(DESTINATION), _source, (unsigned long)_copy_blength, (unsigned long)(*(SPACE)) ); MEMCPY_CSUM( *(DESTINATION), _source, _copy_blength, (CONVERTOR) ); _source += _copy_blength; *(DESTINATION) += _copy_blength; diff --git a/opal/datatype/opal_datatype_prototypes.h b/opal/datatype/opal_datatype_prototypes.h index bcfb59b9b31..0f9099f552f 100644 --- a/opal/datatype/opal_datatype_prototypes.h +++ b/opal/datatype/opal_datatype_prototypes.h @@ -60,6 +60,14 @@ opal_generic_simple_pack_checksum( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t +opal_generic_simple_pack_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); +int32_t +opal_generic_simple_pack_cuda_checksum( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); +int32_t opal_unpack_homogeneous_contig( opal_convertor_t* pConv, struct iovec* iov, uint32_t* out_size, size_t* max_data ); @@ -75,6 +83,14 @@ int32_t opal_generic_simple_unpack_checksum( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +int32_t +opal_generic_simple_unpack_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); +int32_t +opal_generic_simple_unpack_cuda_checksum( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); END_C_DECLS diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index f2c57593bcc..b569b40cd81 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -45,10 +45,12 @@ #define opal_unpack_general_function opal_unpack_general_checksum #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_checksum #define opal_generic_simple_unpack_function opal_generic_simple_unpack_checksum +#define opal_generic_simple_unpack_cuda_function 
opal_generic_simple_unpack_cuda_checksum #else #define opal_unpack_general_function opal_unpack_general #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig #define opal_generic_simple_unpack_function opal_generic_simple_unpack +#define opal_generic_simple_unpack_cuda_function opal_generic_simple_unpack_cuda #endif /* defined(CHECKSUM) */ @@ -273,15 +275,9 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, size_t iov_len_local; uint32_t iov_count; + printf("i am in simple unpack, max_data %lu, iov len %lu\n", *max_data, iov[0].iov_len); DO_DEBUG( opal_output( 0, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u )\n", - (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); - -// if (opal_generic_simple_unpack_function_cuda_p != NULL) { -// int32_t rvalue = (*opal_generic_simple_unpack_function_cuda_p)( pConvertor, iov, out_size, max_data); -// if (rvalue != -99) { /* -99 is DRY RUN, to verify the result with CPU packing*/ -// return rvalue; -// } -// } + (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ) description = pConvertor->use_desc->desc; @@ -387,9 +383,9 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - // UNPACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, - // iov_ptr, conv_ptr, iov_len_local ); - (*unpack_contiguous_loop_cuda_p)(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + UNPACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, + iov_ptr, conv_ptr, iov_len_local ); + // (*unpack_contiguous_loop_cuda_p)(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -417,6 +413,13 @@ opal_generic_simple_unpack_function( 
opal_convertor_t* pConvertor, *out_size = iov_count; if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; + printf("total unpacked %lu\n", pConvertor->bConverted); + // double *vtmp = (double *)iov[0].iov_base; + // for (uint32_t i = 0; i < total_unpacked/8; i++) { + // printf(" %1.f ", *vtmp); + // vtmp ++; + // } + // printf("\n"); return 1; } /* Save the global position for the next round */ @@ -590,3 +593,17 @@ opal_unpack_general_function( opal_convertor_t* pConvertor, pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } + +int32_t +opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ) +{ +#if defined (OPAL_DATATYPE_CUDA_IOV) + if (opal_generic_simple_unpack_function_cuda_iov_p != NULL) { + return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); + + } +#endif + return 0; +} diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index d870e6902e0..d69037c8491 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -14,7 +14,7 @@ # if PROJECT_OMPI - MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw unpack_ooo ddt_pack + MPI_TESTS = checksum position position_noncontig ddt_test ddt_test_old ddt_raw unpack_ooo ddt_pack MPI_CHECKS = to_self endif TESTS = opal_datatype_test $(MPI_TESTS) @@ -28,10 +28,13 @@ unpack_ooo_LDADD = \ $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la ddt_test_SOURCES = ddt_test.c ddt_lib.c ddt_lib.h -ddt_test_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) -ddt_test_LDADD = \ - $(top_builddir)/ompi/libmpi.la \ - $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la +ddt_test_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) +ddt_test_CFLAGS = -I/mnt/scratch/cuda-6.5.14/include -g +ddt_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la 
-L/mnt/scratch/cuda-6.5.14/lib64 -lcudart + +ddt_test_old_SOURCES = ddt_test_old.c ddt_lib.c ddt_lib.h +ddt_test_old_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) +ddt_test_old_LDADD = $(top_builddir)/ompi/libmpi.la ddt_raw_SOURCES = ddt_raw.c ddt_lib.c ddt_lib.h ddt_raw_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) diff --git a/test/datatype/ddt_lib.c b/test/datatype/ddt_lib.c index 9170da0914a..321a5c4be88 100644 --- a/test/datatype/ddt_lib.c +++ b/test/datatype/ddt_lib.c @@ -358,14 +358,20 @@ ompi_datatype_t* upper_matrix( unsigned int mat_size ) disp = (int*)malloc( sizeof(int) * mat_size ); blocklen = (int*)malloc( sizeof(int) * mat_size ); - + for( i = 0; i < mat_size; i++ ) { disp[i] = i * mat_size + i; blocklen[i] = mat_size - i; } - +#if defined (TEST_DOUBLE) ompi_datatype_create_indexed( mat_size, blocklen, disp, &ompi_mpi_double.dt, &upper ); +#elif defined (TEST_FLOAT) + ompi_datatype_create_indexed( mat_size, blocklen, disp, &ompi_mpi_float.dt, &upper ); +#elif defined (TEST_CHAR) + ompi_datatype_create_indexed( mat_size, blocklen, disp, &ompi_mpi_char.dt, &upper ); +#else +#endif ompi_datatype_commit( &upper ); if( outputFlags & DUMP_DATA_AFTER_COMMIT ) { ompi_datatype_dump( upper ); @@ -686,3 +692,26 @@ ompi_datatype_t* create_vector_type( const ompi_datatype_t* data, int count, int return vector; } +ompi_datatype_t* create_struct_type(int count) +{ + ompi_datatype_t* dt_struct; + ompi_datatype_t* dt_struct_vector; + ompi_datatype_t* oldtypes[2]; + MPI_Aint offsets[2], extent, lb; + int blockcounts[2]; + + offsets[0] = 0; + oldtypes[0] = MPI_FLOAT; + blockcounts[0] = 4; + + ompi_datatype_get_extent(MPI_FLOAT, &lb, &extent); + offsets[1] = 4 * extent; + oldtypes[1] = MPI_DOUBLE; + blockcounts[1] = 2; + + ompi_datatype_create_struct( 2, blockcounts, offsets, oldtypes, &dt_struct ); + dt_struct_vector = create_vector_type( dt_struct, 10, 2, 4 ); + ompi_datatype_commit( &dt_struct_vector ); + return dt_struct_vector; +} + diff --git a/test/datatype/ddt_lib.h 
b/test/datatype/ddt_lib.h index d94690047a7..539434f9525 100644 --- a/test/datatype/ddt_lib.h +++ b/test/datatype/ddt_lib.h @@ -34,6 +34,11 @@ #define DUMP_DATA_AFTER_COMMIT 0x00000001 #define CHECK_PACK_UNPACK 0x00000002 +#define TEST_DOUBLE +//#define TEST_FLOAT +//#define TEST_CHAR + + extern uint32_t outputFlags; /** @@ -91,5 +96,5 @@ extern ompi_datatype_t* create_strange_dt( void ); extern ompi_datatype_t* create_contiguous_type( const ompi_datatype_t* data, int count ); extern ompi_datatype_t* create_vector_type( const ompi_datatype_t* data, int count, int length, int stride ); -extern ompi_datatype_t* create_struct_constant_gap_resized_ddt( ompi_datatype_t* type ); +extern ompi_datatype_t* create_struct_type(int count); diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index 12b4b31fc15..e5f58a5b348 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -30,6 +30,14 @@ #include #include +#define DDT_TEST_CUDA + +#if defined (DDT_TEST_CUDA) +#include +#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/runtime/opal_params.h" +#endif + /* Compile with: mpicc -DHAVE_CONFIG_H -I. -I../../include -I../../../ompi-trunk/include -I../.. 
-I../../include -I../../../ompi-trunk/opal -I../../../ompi-trunk/orte -I../../../ompi-trunk/ompi -g ddt_test.c -o ddt_test */ @@ -171,12 +179,64 @@ static int local_copy_ddt_count( ompi_datatype_t* pdt, int count ) return OMPI_SUCCESS; } +static void fill_vectors(double* vp, int itera, int contig, int gap) +{ + int i, j; + for (i = 0; i < itera-1; i++ ){ + for (j = i*gap; j < (i+1)*gap; j++) { + if (j >= i*gap && j < i*gap+contig) { + vp[j] = 1.0; + } else { + vp[j] = 0.0; + } + } + } + for (i = (itera-1)*gap; i < (itera-1)*gap+contig; i++) { + vp[i] = 1.0; + } + + // printf("vector generated:\n"); + // for (i = 0; i < (itera-1)*gap+contig; i++) { + // printf("%1.f ", vp[i]); + // } + // printf("\n"); +} + +static void verify_vectors(double *vp, int itera, int contig, int gap) +{ + int i, j; + int error = 0; + for (i = 0; i < itera-1; i++) { + for (j = i*gap; j < (i+1)*gap; j++) { + if (j >= i*gap && j < i*gap+contig) { + if (vp[j] != 1.0) { + error ++; + } + } + } + } + for (i = (itera-1)*gap; i < (itera-1)*gap+contig; i++) { + if (vp[i] != 1.0) { + error ++; + } + } + // printf("vector received:\n"); + // for (i = 0; i < (itera-1)*gap+contig; i++) { + // printf("%1.f ", vp[i]); + // } + if (error != 0) { + printf("%d error is found\n", error); + } else { + printf("no error is found\n"); + } +} + static int local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count, ompi_datatype_t* recv_type, int recv_count, - int chunk ) + int chunk, int itera, int contig, int gap ) { - void *pdst = NULL, *psrc = NULL, *ptemp = NULL; + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; struct iovec iov; uint32_t iov_count; @@ -188,6 +248,40 @@ local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count rlength = compute_buffer_length(recv_type, recv_count); slength = compute_buffer_length(send_type, send_count); + +#if defined (DDT_TEST_CUDA) + cudaError_t 
error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, slength); + printf("cudamallochost phost %p\n", phost); +#else pdst = malloc( rlength ); psrc = malloc( slength ); ptemp = malloc( chunk ); @@ -196,6 +290,18 @@ local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count for( size_t i = 0; i < slength; i++ ) ((char*)psrc)[i] = i % 128 + 32; memset(pdst, 0, rlength); +#endif + +#if defined (DDT_TEST_CUDA) + if (itera > 0) { + fill_vectors((double *)phost, itera, contig, gap); + } + cudaMemcpy(psrc, phost, slength, cudaMemcpyHostToDevice); +#else + if (itera > 0) { + fill_vectors(psrc, itera, contig, gap); + } +#endif send_convertor = opal_convertor_create( remote_arch, 0 ); if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { @@ -242,6 +348,18 @@ local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count printf( "copying different data-types using convertors in %ld microsec\n", total_time ); printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, slength); + cudaMemcpy(phost, pdst, 
rlength, cudaMemcpyDeviceToHost); + if (itera > 0) { + verify_vectors((double *)phost, itera, contig, gap); + } +#else + if (itera > 0) { + verify_vectors((double *)pdst, itera, contig, gap); + } +#endif clean_and_return: if( send_convertor != NULL ) { OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); @@ -249,15 +367,25 @@ local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count if( recv_convertor != NULL ) { OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); } +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else if( NULL != pdst ) free( pdst ); if( NULL != psrc ) free( psrc ); if( NULL != ptemp ) free( ptemp ); +#endif return OMPI_SUCCESS; } -static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk ) +static int +local_copy_with_convertor_2datatypes_struct( ompi_datatype_t* send_type, int send_count, + ompi_datatype_t* recv_type, int recv_count, + int chunk, int count) { - void *pdst = NULL, *psrc = NULL, *ptemp = NULL; + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; struct iovec iov; uint32_t iov_count; @@ -265,15 +393,295 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk int32_t length = 0, done1 = 0, done2 = 0; TIMER_DATA_TYPE start, end, unpack_start, unpack_end; long total_time, unpack_time = 0; + size_t slength, rlength; - max_data = compute_buffer_length(pdt, count); + rlength = compute_buffer_length(recv_type, recv_count); + slength = compute_buffer_length(send_type, send_count); + +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + 
printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, slength); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc( rlength ); + psrc = malloc( slength ); + ptemp = malloc( chunk ); + + /* initialize the buffers to prevent valgrind from complaining */ + for( size_t i = 0; i < slength; i++ ) + ((char*)psrc)[i] = i % 128 + 32; + memset(pdst, 0, rlength); +#endif + +#if defined (DDT_TEST_CUDA) + + cudaMemcpy(psrc, phost, slength, cudaMemcpyHostToDevice); +#else + +#endif + + send_convertor = opal_convertor_create( remote_arch, 0 ); + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + recv_convertor = opal_convertor_create( remote_arch, 0 ); + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + cache_trash(); /* make sure the cache is useless */ + + GET_TIME( start ); + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor_2datatypes\n", + (done1 ? 
"finish" : "not finish"), + (done2 ? "finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "copying different data-types using convertors in %ld microsec\n", total_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, + total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, slength); + cudaMemcpy(phost, pdst, rlength, cudaMemcpyDeviceToHost); + +#else + +#endif + clean_and_return: + if( send_convertor != NULL ) { + OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); + } + if( recv_convertor != NULL ) { + OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); + } +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else + if( NULL != pdst ) free( pdst ); + if( NULL != psrc ) free( psrc ); + if( NULL != ptemp ) free( ptemp ); +#endif + return OMPI_SUCCESS; +} - pdst = malloc(max_data); - psrc = malloc(max_data); - ptemp = malloc(chunk); +static void fill_upper_matrix(void *matt, int msize) +{ + int i, j, start, end; + int *blklens, *displs; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + blklens = (int *)malloc(sizeof(int)*msize); + displs = (int *)malloc(sizeof(int)*msize); + for (i = 0; i < msize; i++) { + 
blklens[i] = msize - i; + displs[i] = i*msize + i; + } + for (i = 0; i < msize; i++) { + start = displs[i]; + end = start + blklens[i]; + for (j = start; j < end; j++) { +#if defined (TEST_CHAR) + mat[j] = 'a'; +#else + mat[j] = 0.0 + i; +#endif + } + } + free(blklens); + free(displs); + + // printf("matrix generate\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } +} + +static void verify_mat_result(void *matt, int msize) +{ + int *blklens, *displs; + int i, j, error = 0; + int start, end; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + blklens = (int *)malloc(sizeof(int)*msize); + displs = (int *)malloc(sizeof(int)*msize); + for (i = 0; i < msize; i++) { + blklens[i] = msize - i; + displs[i] = i*msize + i; + } + for (i = 0; i < msize; i++) { + start = displs[i]; + end = start + blklens[i]; + for (j = start; j < end; j++) { +#if defined (TEST_CHAR) + if (mat[j] != 'a') { +#else + if (mat[j] != (0.0+i)) { +#endif + error ++; + } + } + } + free(blklens); + free(displs); + + // printf("matrix received\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } + + if (error != 0) { + printf("error is found %d\n", error); + } else { + printf("no error is found\n"); + } +} + +static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk, int msize ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data, dt_length; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0; + + dt_length = 
compute_buffer_length(pdt, count); + printf("length %lu\n", dt_length); + +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, dt_length); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, dt_length); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, dt_length); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc(dt_length); + psrc = malloc(dt_length); + ptemp = malloc(chunk); + for( int i = 0; i < length; ((char*)psrc)[i] = i % 128 + 32, i++ ); memset( pdst, 0, length ); +#endif + +#if defined (DDT_TEST_CUDA) + if (msize > 0) { + fill_upper_matrix(phost, msize); + } + cudaMemcpy(psrc, phost, dt_length, cudaMemcpyHostToDevice); +#else + if (msize > 0) { + fill_upper_matrix(psrc, msize); + } +#endif send_convertor = opal_convertor_create( remote_arch, 0 ); if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(pdt->super), count, psrc ) ) { @@ -321,13 +729,32 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk printf( "copying same data-type using convertors in %ld microsec\n", total_time ); printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, total_time - unpack_time ); - clean_and_return: + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, dt_length); + 
cudaMemcpy(phost, pdst, dt_length, cudaMemcpyDeviceToHost); + if (msize > 0) { + verify_mat_result(phost, msize); + } +#else + if (msize > 0) { + verify_mat_result(pdst, msize); + } +#endif +clean_and_return: if( NULL != send_convertor ) OBJ_RELEASE( send_convertor ); if( NULL != recv_convertor ) OBJ_RELEASE( recv_convertor ); +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else if( NULL != pdst ) free( pdst ); if( NULL != psrc ) free( psrc ); if( NULL != ptemp ) free( ptemp ); +#endif return OMPI_SUCCESS; } @@ -343,7 +770,13 @@ int main( int argc, char* argv[] ) ompi_datatype_t *pdt, *pdt1, *pdt2, *pdt3; int rc, length = 500, i; +#if defined (DDT_TEST_CUDA) + opal_cuda_support = 1; +#endif opal_init_util(&argc, &argv); +#if defined (DDT_TEST_CUDA) + mca_common_cuda_stage_one_init(); +#endif ompi_datatype_init(); /** @@ -365,12 +798,20 @@ int main( int argc, char* argv[] ) } OBJ_RELEASE( pdt ); assert( pdt == NULL ); */ + printf("\n TEST STRUCT \n"); + pdt = create_struct_type(5); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 1; i <= 1; i++) { + // local_copy_with_convertor_2datatypes_struct(pdt, 1, pdt, 1, 1024*1024*100, 5); + } + } + printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); - pdt = upper_matrix(4000); + pdt = upper_matrix(1000); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 1; i <= 4; i++) { + for (i = 1; i <= 3; i++) { // local_copy_ddt_count(pdt, 1); - // local_copy_with_convertor(pdt, 1, 1024*1024*200); + local_copy_with_convertor(pdt, 1, 1024*1024*200, 1000); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -403,7 +844,6 @@ int main( int argc, char* argv[] ) ompi_datatype_create_contiguous(0, &ompi_mpi_datatype_null.dt, &pdt1); ompi_datatype_create_contiguous(0, &ompi_mpi_datatype_null.dt, &pdt2); ompi_datatype_create_contiguous(0, &ompi_mpi_datatype_null.dt, &pdt3); - 
ompi_datatype_add( pdt3, &ompi_mpi_int.dt, 10, 0, -1 ); ompi_datatype_add( pdt3, &ompi_mpi_float.dt, 5, 10 * sizeof(int), -1 ); @@ -429,7 +869,6 @@ int main( int argc, char* argv[] ) OBJ_RELEASE( pdt1 ); assert( pdt1 == NULL ); OBJ_RELEASE( pdt2 ); assert( pdt2 == NULL ); OBJ_RELEASE( pdt3 ); assert( pdt3 == NULL ); - printf( ">>--------------------------------------------<<\n" ); printf( " Contiguous data-type (MPI_DOUBLE)\n" ); pdt = MPI_DOUBLE; @@ -494,7 +933,7 @@ int main( int argc, char* argv[] ) // ompi_datatype_commit(&pdt1); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 10; i++) { - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*30 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*30 ); } } printf( ">>--------------------------------------------<<\n" ); @@ -504,7 +943,7 @@ int main( int argc, char* argv[] ) ompi_datatype_create_contiguous( 1, pdt, &pdt1 ); // ompi_datatype_dump( pdt ); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 0; i < 10; i++) { + for (i = 0; i < 1; i++) { // local_copy_ddt_count(pdt, 1); // local_copy_with_convertor( pdt, 1, 12 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); @@ -513,7 +952,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 , 4000, 384, 512); } } printf( ">>--------------------------------------------<<\n" ); @@ -551,7 +990,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*5 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*5 
); } } printf( ">>--------------------------------------------<<\n" ); @@ -595,7 +1034,6 @@ int main( int argc, char* argv[] ) } printf( ">>--------------------------------------------<<\n" ); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( ">>--------------------------------------------<<\n" ); pdt = test_create_blacs_type(); if( outputFlags & CHECK_PACK_UNPACK ) { @@ -611,7 +1049,6 @@ int main( int argc, char* argv[] ) } printf( ">>--------------------------------------------<<\n" ); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( ">>--------------------------------------------<<\n" ); pdt1 = test_create_blacs_type1( &ompi_mpi_int.dt ); pdt2 = test_create_blacs_type2( &ompi_mpi_int.dt ); From 34f4a3b76a9f9479946e8bf9a5a84db6b743254d Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 9 Apr 2015 03:23:21 -0400 Subject: [PATCH 097/190] RDMA send is now working. Conflicts: test/datatype/Makefile.am --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 74 +++++++- ompi/mca/pml/ob1/pml_ob1_recvreq.c | 7 +- ompi/mca/pml/ob1/pml_ob1_sendreq.c | 18 +- opal/datatype/cuda/Makefile | 2 +- opal/datatype/cuda/opal_config.h | 171 +++++++++++++----- opal/datatype/cuda/opal_datatype_cuda.cu | 34 ++++ opal/datatype/cuda/opal_datatype_cuda.cuh | 4 + .../cuda/opal_datatype_cuda_internal.cuh | 1 - .../cuda/opal_datatype_orig_internal.h | 8 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 40 +++- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 11 +- opal/datatype/opal_convertor.c | 12 +- opal/datatype/opal_convertor.h | 6 + opal/datatype/opal_datatype_gpu.c | 27 ++- opal/datatype/opal_datatype_gpu.h | 5 +- opal/datatype/opal_datatype_module.c | 4 +- opal/datatype/opal_datatype_pack.c | 2 - opal/datatype/opal_datatype_unpack.c | 2 - opal/include/opal_config_top.h | 2 - opal/mca/btl/smcuda/btl_smcuda.c | 52 +++++- opal/mca/common/cuda/common_cuda.c | 64 +++++++ opal/mca/common/cuda/common_cuda.h | 9 + test/datatype/Makefile.am | 14 +- test/datatype/ddt_test.c | 13 +- 24 files changed, 480 
insertions(+), 102 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 12ad396363d..b51fc299cdd 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -37,11 +37,21 @@ #include "ompi/mca/bml/base/base.h" #include "ompi/memchecker.h" +#include "opal/datatype/opal_datatype_gpu.h" +#include "opal/mca/common/cuda/common_cuda.h" + +#define CUDA_DDT_WITH_RDMA 1 + size_t mca_pml_ob1_rdma_cuda_btls( mca_bml_base_endpoint_t* bml_endpoint, unsigned char* base, size_t size, mca_pml_ob1_com_btl_t* rdma_btls); + +int mca_pml_ob1_rdma_cuda_btl_register_events( + mca_pml_ob1_com_btl_t* rdma_btls, + uint32_t num_btls_used, + struct opal_convertor_t* convertor); int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -92,7 +102,45 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, /* Do not send anything with first rendezvous message as copying GPU * memory into RNDV message is expensive. 
*/ sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; - rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); + mca_bml_base_btl_t* bml_endpoint_btl = mca_bml_base_btl_array_get_index(&(sendreq->req_endpoint->btl_send), 0); + if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && CUDA_DDT_WITH_RDMA) { + printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + unsigned char *base; + struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); + base = opal_datatype_get_gpu_buffer(); + sendreq->req_send.req_bytes_packed = convertor->local_size; + printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); + if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( + sendreq->req_endpoint, + base, + sendreq->req_send.req_bytes_packed, + sendreq->req_rdma))) { + + mca_pml_ob1_rdma_cuda_btl_register_events(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor); + struct iovec iov; + int rc_dt = 0; + uint32_t iov_count = 1; + iov.iov_base = NULL; + iov.iov_len = 0; + size_t max_data = 0; + rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + // mca_common_cuda_record_event(&convertor->pipeline_event[0]); + // uint64_t event, *ep; + // ep = &event; + // mca_common_cuda_create_event((uint64_t**)ep); + // // mca_common_cuda_record_event(ep); + // printf("success record event %d\n", event); + rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, + sendreq->req_send.req_bytes_packed); + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_pml_ob1_free_rdma_resources(sendreq); + } + } else { + rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); + } + } else { + rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); + } } return rc; } @@ -152,6 +200,30 @@ size_t mca_pml_ob1_rdma_cuda_btls( return num_btls_used; } +int mca_pml_ob1_rdma_cuda_btl_register_events( + mca_pml_ob1_com_btl_t* rdma_btls, + uint32_t 
num_btls_used, + struct opal_convertor_t* convertor) +{ + // uint32_t i, j; + // for (i = 0; i < num_btls_used; i++) { + // mca_btl_base_registration_handle_t *handle = rdma_btls[i].btl_reg; + // mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t *) + // ((intptr_t) handle - offsetof (mca_mpool_common_cuda_reg_t, data)); + // printf("base %p\n", cuda_reg->base.base); + // for (j = 0; j < MAX_IPC_EVENT_HANDLE; j++) { + // uint64_t *event = &convertor->pipeline_event[j]; + // convertor->pipeline_event[j] = 0; + // mca_common_cuda_geteventhandle(&event, j, (mca_mpool_base_registration_t *)cuda_reg); + // convertor->pipeline_event[j] = *event; + // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); + // } + // cuda_reg->data.pipeline_size = 1000; + // + // } + return 0; +} + int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl) { diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index b7646890d03..15cfe8560ba 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -649,8 +649,11 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq if (mca_pml_ob1_cuda_need_buffers(recvreq, btl)) #endif /* OPAL_CUDA_SUPPORT */ { - mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0); - return; + /* need more careful check here */ + if (! 
(recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA)) { + mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0); + return; + } } } diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index f1f2744b2e3..50b11d36dff 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -675,10 +675,26 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq, MCA_PML_OB1_HDR_FLAGS_PIN); } +#if OPAL_CUDA_SUPPORT + if ( (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA)) { + sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; + if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == true) { + data_ptr = sendreq->req_send.req_base.req_convertor.gpu_buffer_ptr_source; + printf("START RMDA data_ptr %p\n", data_ptr); + } else { + opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr); + } + /* Set flag back */ + sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; + } else { + opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr); + } +#else /* at this time ob1 does not support non-contiguous gets. 
the convertor represents a * contiguous block of memory */ opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr); - +#endif + local_handle = sendreq->req_rdma[0].btl_reg; /* allocate an rdma fragment to keep track of the request size for use in the fin message */ diff --git a/opal/datatype/cuda/Makefile b/opal/datatype/cuda/Makefile index 6be10afd0fd..e76f160fb88 100644 --- a/opal/datatype/cuda/Makefile +++ b/opal/datatype/cuda/Makefile @@ -6,7 +6,7 @@ RANLIB = ranlib STLIB ?= opal_datatype_cuda.a DYLIB ?= opal_datatype_cuda.so CFLAGS = -g -G -O0 -EXTLIB = -L/home/wwu12/ompi/ompi-cuda/opal/datatype/.libs -ldatatype +EXTLIB = -L/home/wwu12/ompi/ompi-gpu/opal/datatype/.libs -ldatatype -L/usr/lib64 -lcuda INC = SRC := \ diff --git a/opal/datatype/cuda/opal_config.h b/opal/datatype/cuda/opal_config.h index 19fa55f52ed..d23f071a86a 100644 --- a/opal/datatype/cuda/opal_config.h +++ b/opal/datatype/cuda/opal_config.h @@ -24,6 +24,10 @@ #ifndef OPAL_CONFIG_H #define OPAL_CONFIG_H +//#include "opal_config_top.h" + + + /* Define if building universal (internal helper macro) */ /* #undef AC_APPLE_UNIVERSAL_BUILD */ @@ -51,6 +55,9 @@ /* Define to 1 if you have the header file. */ #define HAVE_AIO_H 1 +/* Define to 1 if the linker supports alias attribute. */ +/* #undef HAVE_ALIAS_ATTRIBUTE */ + /* Define to 1 if you have the header file. */ #define HAVE_ALLOCA_H 1 @@ -63,6 +70,9 @@ /* Define to 1 if you have the `asprintf' function. */ #define HAVE_ASPRINTF 1 +/* Set to use c11 atomic functions */ +/* #undef HAVE_ATOMICS */ + /* Define to 1 if the system has the type `CACHE_DESCRIPTOR'. */ /* #undef HAVE_CACHE_DESCRIPTOR */ @@ -93,6 +103,9 @@ /* Define to 1 if you have the header file. */ /* #undef HAVE_CRT_EXTERNS_H */ +/* Define to 1 if you have the header file. */ +#define HAVE_CTYPE_H 1 + /* Define to 1 if we have -lcuda */ /* #undef HAVE_CUDA */ @@ -153,18 +166,14 @@ don't. 
*/ /* #undef HAVE_DECL_IBV_ACCESS_SO */ +/* Define to 1 if you have the declaration of `IBV_ATOMIC_HCA', and to 0 if + you don't. */ +/* #undef HAVE_DECL_IBV_ATOMIC_HCA */ + /* Define to 1 if you have the declaration of `IBV_EVENT_CLIENT_REREGISTER', and to 0 if you don't. */ /* #undef HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER */ -/* Define to 1 if you have the declaration of `IBV_EVENT_GID_CHANGE', and to 0 - if you don't. */ -/* #undef HAVE_DECL_IBV_EVENT_GID_CHANGE */ - -/* Define to 1 if you have the declaration of `ibv_event_type_str', and to 0 - if you don't. */ -/* #undef HAVE_DECL_IBV_EVENT_TYPE_STR */ - /* Define to 1 if you have the declaration of `IBV_EXP_ACCESS_ALLOCATE_MR', and to 0 if you don't. */ /* #undef HAVE_DECL_IBV_EXP_ACCESS_ALLOCATE_MR */ @@ -177,17 +186,9 @@ to 0 if you don't. */ /* #undef HAVE_DECL_IBV_LINK_LAYER_ETHERNET */ -/* Define to 1 if you have the declaration of `IBV_NODE_USNIC', and to 0 if - you don't. */ -/* #undef HAVE_DECL_IBV_NODE_USNIC */ - -/* Define to 1 if you have the declaration of `IBV_TRANSPORT_USNIC', and to 0 - if you don't. */ -/* #undef HAVE_DECL_IBV_TRANSPORT_USNIC */ - -/* Define to 1 if you have the declaration of `IBV_TRANSPORT_USNIC_UDP', and - to 0 if you don't. */ -/* #undef HAVE_DECL_IBV_TRANSPORT_USNIC_UDP */ +/* Define to 1 if you have the declaration of `IBV_SRQT_XRC', and to 0 if you + don't. */ +/* #undef HAVE_DECL_IBV_SRQT_XRC */ /* Define to 1 if you have the declaration of `nvmlDeviceGetMaxPcieLinkGeneration', and to 0 if you don't. */ @@ -359,6 +360,9 @@ /* Define to 1 if you have the header file. */ /* #undef HAVE_HWLOC_H */ +/* Define to 1 if you have the `ibv_cmd_open_xrcd' function. */ +/* #undef HAVE_IBV_CMD_OPEN_XRCD */ + /* Define to 1 if you have the `ibv_create_xrc_rcv_qp' function. 
*/ /* #undef HAVE_IBV_CREATE_XRC_RCV_QP */ @@ -437,9 +441,21 @@ /* Define to 1 if we have -llgrp */ /* #undef HAVE_LIBLGRP */ +/* set to 1 if should use libnl v3, set to 0 for libnl v11 */ +#define HAVE_LIBNL3 0 + /* Define to 1 if you have the `pci' library (-lpci). */ /* #undef HAVE_LIBPCI */ +/* Define to 1 if you have the `psm_infinipath' library (-lpsm_infinipath). */ +/* #undef HAVE_LIBPSM_INFINIPATH */ + +/* Define to 1 if you have the `pthread' library (-lpthread). */ +#define HAVE_LIBPTHREAD 1 + +/* Define to 1 if you have the `rt' library (-lrt). */ +#define HAVE_LIBRT 1 + /* Define to 1 if you have the header file. */ /* #undef HAVE_LIBUTIL_H */ @@ -494,12 +510,18 @@ /* Define to 1 if you have the `mmap' function. */ #define HAVE_MMAP 1 +/* Define to 1 if you have the header file. */ +#define HAVE_MNTENT_H 1 + /* Define to 1 if the system has the type `mode_t'. */ #define HAVE_MODE_T 1 /* Define to 1 if you have the header file. */ /* #undef HAVE_MTCP_H */ +/* Define to 1 if you have the header file. */ +/* #undef HAVE_MUNGE_H */ + /* Define to 1 if you have the header file. */ /* #undef HAVE_MXM_API_MXM_API_H */ @@ -515,9 +537,6 @@ /* Define to 1 if you have the header file. */ #define HAVE_NETINET_TCP_H 1 -/* Define to 1 if you have the header file. */ -/* #undef HAVE_NETLINK_NETLINK_H */ - /* Define to 1 if you have the header file. */ #define HAVE_NET_IF_H 1 @@ -545,6 +564,9 @@ /* Define to 1 if you have the `openpty' function. */ #define HAVE_OPENPTY 1 +/* Define to 1 if you have the header file. */ +#define HAVE_PATHS_H 1 + /* Define to 1 if you have the header file. */ /* #undef HAVE_PCI_PCI_H */ @@ -591,6 +613,12 @@ */ /* #undef HAVE_PSAPI_WORKING_SET_EX_INFORMATION */ +/* libfabric: whether to build the PSM provider or not */ +/* #undef HAVE_PSM */ + +/* libfabric: do not build PSM provider as a DL */ +/* #undef HAVE_PSM_DL */ + /* Define to 1 if you have the header file. 
*/ /* #undef HAVE_PSM_H */ @@ -624,6 +652,9 @@ /* Define to 1 if you have the header file. */ #define HAVE_PWD_H 1 +/* Define to 1 if you have the header file. */ +/* #undef HAVE_RDMA_FABRIC_H */ + /* Define to 1 if you have the header file. */ /* #undef HAVE_RDMA_RDMA_CMA_H */ @@ -678,12 +709,15 @@ /* Define to 1 if you have the `snprintf' function. */ #define HAVE_SNPRINTF 1 -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SN_XPMEM_H */ - /* Define to 1 if you have the `socketpair' function. */ #define HAVE_SOCKETPAIR 1 +/* libfabric: do not build sockets provider */ +/* #undef HAVE_SOCKETS */ + +/* libfabric: do not build sockets provider */ +/* #undef HAVE_SOCKETS_DL */ + /* Define to 1 if the system has the type `socklen_t'. */ #define HAVE_SOCKLEN_T 1 @@ -902,6 +936,9 @@ /* Define to 1 if you have the header file. */ /* #undef HAVE_TM_H */ +/* Define to 1 if you have the header file. */ +/* #undef HAVE_TM_TREE_H */ + /* Define to 1 if you have the header file. */ #define HAVE_UCONTEXT_H 1 @@ -939,6 +976,12 @@ /* Define to 1 if you have the `usleep' function. */ #define HAVE_USLEEP 1 +/* libfabric: whether to build the usnic provider or not */ +/* #undef HAVE_USNIC */ + +/* libfabric: do not build usnic provider as a DL */ +/* #undef HAVE_USNIC_DL */ + /* Define to 1 if you have the header file. */ /* #undef HAVE_UTIL_H */ @@ -951,6 +994,12 @@ /* Define to 1 if you have the `vasprintf' function. */ #define HAVE_VASPRINTF 1 +/* libfabric: do not build verbs provider */ +/* #undef HAVE_VERBS */ + +/* libfabric: do not build verbs provider */ +/* #undef HAVE_VERBS_DL */ + /* Define to 1 if you have the `vsnprintf' function. */ #define HAVE_VSNPRINTF 1 @@ -978,6 +1027,9 @@ /* Define to 1 if the system has the type `__float128'. */ #define HAVE___FLOAT128 1 +/* Define to 1 if the system has the type `__int128'. */ +/* #undef HAVE___INT128 */ + /* Define to 1 if you have the `__mmap' function. 
*/ /* #undef HAVE___MMAP */ @@ -1188,7 +1240,7 @@ /* #undef HWLOC_HPUX_SYS */ /* Version of hwloc */ -#define HWLOC_HWLOC191_HWLOC_VERSION "internal v1.9.1" +#define HWLOC_HWLOC191_HWLOC_VERSION "internal v1.9.2" /* Define to 1 on Irix */ /* #undef HWLOC_IRIX_SYS */ @@ -1237,7 +1289,7 @@ #define LT_OBJDIR ".libs/" /* Header to include for event implementation */ -#define MCA_event_IMPLEMENTATION_HEADER "opal/mca/event/libevent2021/libevent2021.h" +#define MCA_event_IMPLEMENTATION_HEADER "opal/mca/event/libevent2022/libevent2022.h" /* Header to include for hwloc implementation */ #define MCA_hwloc_IMPLEMENTATION_HEADER "opal/mca/hwloc/hwloc191/hwloc191.h" @@ -1249,7 +1301,7 @@ /* #undef MCA_hwloc_external_openfabrics_header */ /* Complete set of command line arguments given to ROMIOs configure script */ -#define MCA_io_romio_COMPLETE_CONFIGURE_FLAGS " FROM_OMPI=yes CC='gcc -std=gnu99' CFLAGS='-g -Wall -Wundef -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wstrict-prototypes -Wcomment -pedantic -Werror-implicit-function-declaration -finline-functions -fno-strict-aliasing -pthread' CPPFLAGS=' -I/home/wwu12/ompi/ompi-cuda/opal/mca/hwloc/hwloc191/hwloc/include -I/home/wwu12/ompi/ompi-cuda/opal/mca/event/libevent2021/libevent -I/home/wwu12/ompi/ompi-cuda/opal/mca/event/libevent2021/libevent/include' FFLAGS='' LDFLAGS=' ' --enable-shared --disable-static --prefix=/home/wwu12/ompi/build-cuda --disable-aio" +#define MCA_io_romio_COMPLETE_CONFIGURE_FLAGS " FROM_OMPI=yes CC='gcc -std=gnu99' CFLAGS='-g -Wall -Wundef -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wstrict-prototypes -Wcomment -pedantic -Werror-implicit-function-declaration -finline-functions -fno-strict-aliasing -pthread -D__EXTENSIONS__' CPPFLAGS=' -I/home/wwu12/ompi/ompi-gpu/opal/mca/hwloc/hwloc191/hwloc/include -I/home/wwu12/ompi/ompi-gpu/opal/mca/event/libevent2022/libevent -I/home/wwu12/ompi/ompi-gpu/opal/mca/event/libevent2022/libevent/include' FFLAGS='' LDFLAGS=' ' --enable-shared 
--disable-static --prefix=/home/wwu12/ompi/build-gpu --disable-aio --disable-weak-symbols --enable-strict" /* Set of user-defined configure flags given to ROMIOs configure script via --with-io-romio-flags */ @@ -1436,9 +1488,6 @@ /* Enable contributed software package libompitrace */ #define OMPI_ENABLE_CONTRIB_libompitrace 1 -/* Enable contributed software package vt */ -#define OMPI_ENABLE_CONTRIB_vt 1 - /* Whether we want MPI profiling or not */ #define OMPI_ENABLE_MPI_PROFILING 1 @@ -1490,6 +1539,10 @@ not */ #define OMPI_FORTRAN_HAVE_BIND_C_TYPE_NAME 0 +/* For ompi/mpi/fortran/use-mpi-f08/blah.F90 and blah.h and ompi_info: whether + the compiler supports c_funloc or not */ +#define OMPI_FORTRAN_HAVE_C_FUNLOC 0 + /* For ompi_info: Whether the Fortran compiler supports the Fortran 2008 "assumed rank" syntax or not */ #define OMPI_FORTRAN_HAVE_F08_ASSUMED_RANK 0 @@ -1717,7 +1770,7 @@ #define OMPI_MPI_AINT_TYPE ptrdiff_t /* Contributed software packages built with Open MPI */ -#define OMPI_MPI_CONTRIBS "vt, libompitrace" +#define OMPI_MPI_CONTRIBS "libompitrace" /* Size of the MPI_Count datatype */ #define OMPI_MPI_COUNT_SIZE 8 @@ -1769,7 +1822,7 @@ #define OMPI_RELEASE_VERSION 0 /* The repository version Open MPI */ -#define OMPI_REPO_REV "dev-267-g51b4521" +#define OMPI_REPO_REV "dev-1510-g40fe521" /* Defined to 1 if the OMPI runtime component is ORTE */ #define OMPI_RTE_ORTE 1 @@ -1977,6 +2030,9 @@ /* Format of assembly file */ #define OPAL_ASSEMBLY_FORMAT "default-.text-.globl-:--.L-@-1-0-1-1-1" +/* Whether we have support for RDTSCP instruction */ +#define OPAL_ASSEMBLY_SUPPORTS_RDTSCP 0 + /* Enable flow control for Portals4 BTL */ #define OPAL_BTL_PORTALS4_FLOW_CONTROL 0 @@ -1986,6 +2042,9 @@ /* If knem support can be enabled */ #define OPAL_BTL_SM_HAVE_KNEM 0 +/* Path by which to include fi_ext_usnic.h */ +/* #undef OPAL_BTL_USNIC_FI_EXT_USNIC_H */ + /* define to 1 if usnic BTL unit tests are enabled, 0 otherwise */ #define OPAL_BTL_USNIC_UNIT_TESTS 0 @@ 
-2032,7 +2091,7 @@ #define OPAL_CUDA_GDR_SUPPORT 1 /* Whether we have CUDA cuPointerGetAttributes function available */ -#define OPAL_CUDA_GET_ATTRIBUTES 0 +#define OPAL_CUDA_GET_ATTRIBUTES 1 /* Whether we want cuda device pointer support */ #define OPAL_CUDA_SUPPORT 1 @@ -2079,6 +2138,9 @@ /* Whether C compiler supports XLC style inline assembly */ #define OPAL_C_XLC_INLINE_ASSEMBLY 0 +/* Whether we have lt_dladvise or not */ +#define OPAL_DL_LIBLTDL_HAVE_LT_DLADVISE 0 + /* Whether we want checkpoint/restart enabled debugging functionality or not */ #define OPAL_ENABLE_CRDEBUG 0 @@ -2218,15 +2280,27 @@ /* whether ceil is found and available */ #define OPAL_HAVE_CEIL 1 +/* whether clock_gettime is found and available */ +#define OPAL_HAVE_CLOCK_GETTIME 1 + +/* Whether the processor supports the cmpxchg16b instruction */ +#define OPAL_HAVE_CMPXCHG16B 1 + /* Enable features required for ConnectX XRC support */ #define OPAL_HAVE_CONNECTX_XRC 0 +/* Enable features required for XRC domains support */ +#define OPAL_HAVE_CONNECTX_XRC_DOMAINS 0 + /* whether crs_blcr is found and available */ /* #undef OPAL_HAVE_CRS_BLCR */ /* whether dirname is found and available */ #define OPAL_HAVE_DIRNAME 1 +/* Whether the OPAL DL framework is functional or not */ +#define OPAL_HAVE_DL_SUPPORT 1 + /* whether fbtl_posix is found and available */ #define OPAL_HAVE_FBTL_POSIX 1 @@ -2243,15 +2317,9 @@ long'. 
*/ #define OPAL_HAVE_LONG_LONG 1 -/* Whether libltdl appears to have the lt_dladvise interface */ -#define OPAL_HAVE_LTDL_ADVISE 0 - /* whether openpty is found and available */ #define OPAL_HAVE_OPENPTY 1 -/* Do we have POSIX threads */ -#define OPAL_HAVE_POSIX_THREADS 1 - /* If PTHREADS implementation supports PTHREAD_MUTEX_ERRORCHECK */ #define OPAL_HAVE_PTHREAD_MUTEX_ERRORCHECK 1 @@ -2279,6 +2347,10 @@ /* Whether or not we have solaris */ #define OPAL_HAVE_SOLARIS 0 +/* Whether the __sync builtin atomic compare and swap supports 128-bit values + */ +/* #undef OPAL_HAVE_SYNC_BUILTIN_CSWAP_INT128 */ + /* Do not use outside of mpi.h. Define to 1 if you have the header file. */ /* #undef OPAL_HAVE_SYS_SYNCH_H */ @@ -2316,9 +2388,6 @@ /* ident string for Open MPI */ #define OPAL_IDENT_STRING "1.9.0a1" -/* Whether we are using the internal libltdl or not */ -#define OPAL_LIBLTDL_INTERNAL 1 - /* Major release number of Open Portable Access Layer */ #define OPAL_MAJOR_VERSION 1 @@ -2386,7 +2455,7 @@ #define OPAL_RELEASE_VERSION 0 /* The repository version Open Portable Access Layer */ -#define OPAL_REPO_REV "dev-267-g51b4521" +#define OPAL_REPO_REV "dev-1510-g40fe521" /* Whether we have shared memory support for mmap or not */ #define OPAL_SHMEM_MMAP 1 @@ -2413,9 +2482,6 @@ /* Enable per-user config files */ #define OPAL_WANT_HOME_CONFIG_FILES 1 -/* Whether to include support for libltdl or not */ -#define OPAL_WANT_LIBLTDL 1 - /* if the memory and buffer checking should be enabled */ #define OPAL_WANT_MEMCHECKER 0 @@ -2448,7 +2514,7 @@ #define ORTE_RELEASE_VERSION 0 /* The repository version Open MPI Run-Time Environment */ -#define ORTE_REPO_REV "dev-267-g51b4521" +#define ORTE_REPO_REV "dev-1510-g40fe521" /* Tarball filename version string of Open MPI Run-Time Environment */ #define ORTE_TARBALL_VERSION "gitclone" @@ -2481,7 +2547,7 @@ #define OSHMEM_RELEASE_VERSION 0 /* The repository version Open SHMEM */ -#define OSHMEM_REPO_REV "dev-267-g51b4521" +#define 
OSHMEM_REPO_REV "dev-1510-g40fe521" /* Whether user wants OSHMEM in compatibility mode or not */ #define OSHMEM_SPEC_COMPAT 1 @@ -2522,6 +2588,9 @@ /* Define to the version of this package. */ #define PACKAGE_VERSION "gitclone" +/* Define PT_LOCK_SPIN to 1 if available. */ +/* #undef PT_LOCK_SPIN */ + /* The size of `bool', as computed by sizeof. */ #define SIZEOF_BOOL 1 @@ -2656,7 +2725,7 @@ #define WRAPPER_EXTRA_LDFLAGS " -Wl,-rpath -Wl,@{libdir} -Wl,--enable-new-dtags" /* Additional LIBS to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_LIBS "-lm -ldl -lutil " +#define WRAPPER_EXTRA_LIBS "-lm -ldl -lutil -lrt " /* Whether the wrapper compilers add rpath flags by default */ #define WRAPPER_RPATH_SUPPORT "runpath" @@ -2788,5 +2857,7 @@ # define __restrict__ #endif + +//#include "opal_config_bottom.h" #endif /* OPAL_CONFIG_H */ diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 105ba2bfeba..1debbd221a5 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -1,6 +1,7 @@ #include "opal_datatype_cuda_internal.cuh" #include "opal_datatype_cuda.cuh" #include +#include #include #include @@ -163,6 +164,39 @@ void opal_cuda_sync_device(void) cuda_desc_h->iov[0].iov_base = (void*)gpu_dest_const; } +int32_t opal_cuda_is_gpu_buffer(const void *ptr) +{ + int res; + CUmemorytype memType; + CUdeviceptr dbuf = (CUdeviceptr)ptr; + res = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf); + if (res != CUDA_SUCCESS) { + /* If we cannot determine it is device pointer, + * just assume it is not. 
*/ + printf("!!!!!!!is gpu buffer error\n"); + return 0; + } + if (memType == CU_MEMORYTYPE_DEVICE) { + return 1; + } else if (memType == CU_MEMORYTYPE_HOST){ + return 0; + } else if (memType == 0) { + return 0; + } else { + return 0; + } +} + +unsigned char* opal_cuda_get_gpu_pack_buffer() +{ + if (ddt_cuda_pack_buffer != NULL) { + return ddt_cuda_pack_buffer; + } else { + return NULL; + } +} + +/* from internal.h*/ void opal_cuda_output(int output_id, const char *format, ...) { if (output_id >= 0 && output_id <= OPAL_DATATYPE_CUDA_DEBUG_LEVEL) { diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index ebaad5a06fc..5797ceb55d8 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -47,6 +47,10 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, size_t* SPACE ); void opal_cuda_sync_device(void); + +int32_t opal_cuda_is_gpu_buffer(const void *ptr); + +unsigned char* opal_cuda_get_gpu_pack_buffer(); } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index b510a2f5808..be264484153 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -13,7 +13,6 @@ #define OPAL_DATATYPE_CUDA_DEBUG //#define OPAL_DATATYPE_CUDA_KERNEL_TIME #define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 0 -#define OPAL_DATATYPE_CUDA_IOV #define OPAL_DATATYPE_CUDA_TIMING diff --git a/opal/datatype/cuda/opal_datatype_orig_internal.h b/opal/datatype/cuda/opal_datatype_orig_internal.h index fc30fc87741..37b1d1be51b 100644 --- a/opal/datatype/cuda/opal_datatype_orig_internal.h +++ b/opal/datatype/cuda/opal_datatype_orig_internal.h @@ -292,6 +292,8 @@ typedef struct opal_convertor_master_t { conversion_fct_t* pFunctions; /**< the convertor functions pointer */ } opal_convertor_master_t; +#define 
MAX_IPC_EVENT_HANDLE 10 + struct opal_convertor_t { opal_object_t super; /**< basic superclass */ uint32_t remoteArch; /**< the remote architecture */ @@ -322,6 +324,10 @@ struct opal_convertor_t { #if OPAL_CUDA_SUPPORT memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ void * stream; /**< CUstream for async copy */ + + unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ + unsigned char * gpu_buffer_ptr_source; /**< source address of GPU buffer start to pack, update in packing function */ + uint64_t * pipeline_event[MAX_IPC_EVENT_HANDLE]; /**< cuda event for pipeline */ #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ @@ -643,4 +649,4 @@ OPAL_DECLSPEC extern const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE #define OPAL_DATATYPE_UNAVAILABLE_SIZE 0 -#endif /* OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED */ \ No newline at end of file +#endif /* OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index f13610fc1bf..14fdcfca346 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -316,7 +316,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor unsigned char *destination; size_t total_packed, total_converted; int32_t complete_flag = 0; - uint8_t buffer_isfull = 0; + uint8_t buffer_isfull = 0, transfer_required; uint32_t convertor_flags; dt_elem_desc_t* description; dt_elem_desc_t* pElem; @@ -341,7 +341,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor destination = (unsigned char*)iov[0].iov_base; #else // pConvertor->pBaseBuf = pBaseBuf_GPU; - // printf("Pack GPU base %p, iov_buffer %p\n", pConvertor->pBaseBuf, iov[0].iov_base); + printf("Pack GPU base %p, gpu_buffer %p\n", pConvertor->pBaseBuf, ddt_cuda_pack_buffer); destination = 
ddt_cuda_pack_buffer; #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ @@ -353,9 +353,35 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypesSize[pElem->elem.common.type]); + + assert(opal_datatype_basicDatatypesSize[pElem->elem.common.type] != 0); printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); - buffer_size = iov[0].iov_len; + if ((iov[0].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (iov[0].iov_len == 0) { + buffer_size = DT_CUDA_BUFFER_SIZE; + } else { + buffer_size = iov[0].iov_len; + } + pConvertor->gpu_buffer_ptr = ddt_cuda_pack_buffer; + pConvertor->gpu_buffer_ptr_source = pConvertor->gpu_buffer_ptr + pConvertor->bConverted; + + if (iov[0].iov_base == NULL) { + iov[0].iov_base = ddt_cuda_pack_buffer; + destination = ddt_cuda_pack_buffer; + } else { + destination = (unsigned char *)iov[0].iov_base; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + pConvertor->gpu_buffer_ptr = NULL; + pConvertor->gpu_buffer_ptr_source = NULL; + transfer_required = 1; + } + + printf("start packing from %p\n", destination); + cuda_iov_count = 1000; total_packed = 0; total_converted = pConvertor->bConverted; @@ -371,7 +397,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #endif dst_offset = 0; - thread_per_block = CUDA_WARP_SIZE * 4; + thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; while (cuda_iov_count > 0) { @@ -400,7 +426,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } for (i = 0; i < cuda_iov_count; i++) { - pElem = &(description[pStack->index+i]); + // pElem = &(description[pStack->index+i]); if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { @@ -514,7 +540,9 @@ int32_t 
opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - cudaMemcpy(iov[0].iov_base, ddt_cuda_pack_buffer, total_packed, cudaMemcpyDeviceToHost); + if (transfer_required) { + cudaMemcpy(iov[0].iov_base, ddt_cuda_pack_buffer, total_packed, cudaMemcpyDeviceToHost); + } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 88a66de5f02..dccf9f23e82 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -162,8 +162,12 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert // printf("\n"); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); -#endif - cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); +#endif + if (opal_cuda_is_gpu_buffer(iov[0].iov_base)) { + source = (unsigned char*)iov[0].iov_base; + } else { + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -190,7 +194,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif dst_offset = 0; - thread_per_block = CUDA_WARP_SIZE * 4; + thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; while (cuda_iov_count > 0) { @@ -312,7 +316,6 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert iov[0].iov_len = total_unpacked; *max_data = total_unpacked; *out_size = 1; - DT_CUDA_DEBUG ( opal_cuda_output(0, "total unpacked %d\n", total_unpacked); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index d4f0cebb722..3f35a0e6b41 100644 --- a/opal/datatype/opal_convertor.c +++ 
b/opal/datatype/opal_convertor.c @@ -562,11 +562,11 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, convertor->flags |= CONVERTOR_RECV; #if OPAL_CUDA_SUPPORT mca_cuda_convertor_init(convertor, pUserBuf); -#if defined (OPAL_DATATYPE_CUDA) +#if OPAL_DATATYPE_CUDA_KERNEL if (opal_datatype_gpu_init() != OPAL_SUCCESS) { opal_datatype_gpu_fini(); } -#endif /* defined OPAL_DATATYPE_CUDA */ +#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ #endif OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -591,7 +591,7 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { convertor->fAdvance = opal_unpack_homogeneous_contig; } else { - if (convertor->flags & CONVERTOR_CUDA ) { + if ((convertor->flags & CONVERTOR_CUDA) && OPAL_DATATYPE_CUDA_KERNEL) { convertor->fAdvance = opal_generic_simple_unpack_cuda; } else { convertor->fAdvance = opal_generic_simple_unpack; @@ -610,11 +610,11 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, convertor->flags |= CONVERTOR_SEND; #if OPAL_CUDA_SUPPORT mca_cuda_convertor_init(convertor, pUserBuf); -#if defined (OPAL_DATATYPE_CUDA) +#if OPAL_DATATYPE_CUDA_KERNEL if (opal_datatype_gpu_init() != OPAL_SUCCESS) { opal_datatype_gpu_fini(); } -#endif /* defined OPAL_DATATYPE_CUDA */ +#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ #endif OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -637,7 +637,7 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, else convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps; } else { - if (convertor->flags & CONVERTOR_CUDA ) { + if ((convertor->flags & CONVERTOR_CUDA) && OPAL_DATATYPE_CUDA_KERNEL ) { convertor->fAdvance = opal_generic_simple_pack_cuda; } else { convertor->fAdvance = opal_generic_simple_pack; diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 5b26b7e7d63..6ed9e311d84 100644 --- 
a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -79,6 +79,8 @@ typedef struct dt_stack_t dt_stack_t; */ #define DT_STATIC_STACK_SIZE 5 /**< This should be sufficient for most applications */ +#define MAX_IPC_EVENT_HANDLE 10 + struct opal_convertor_t { opal_object_t super; /**< basic superclass */ uint32_t remoteArch; /**< the remote architecture */ @@ -109,6 +111,10 @@ struct opal_convertor_t { #if OPAL_CUDA_SUPPORT memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ void * stream; /**< CUstream for async copy */ + + unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ + unsigned char * gpu_buffer_ptr_source; /**< source address of GPU buffer start to pack, update in packing function */ + uint64_t * pipeline_event[MAX_IPC_EVENT_HANDLE]; /**< cuda event for pipeline */ #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c index 787e86e4f4c..f8c4785994d 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -83,10 +83,12 @@ void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, void (*opal_cuda_sync_device_p)(void) = NULL; +unsigned char* (*opal_cuda_get_gpu_pack_buffer_p)(void) = NULL; + int32_t opal_datatype_gpu_init(void) { char *error; - char *lib = "/home/wwu12/ompi/ompi-cuda/opal/datatype/cuda/opal_datatype_cuda.so"; + char *lib = "/home/wwu12/ompi/ompi-gpu/opal/datatype/cuda/opal_datatype_cuda.so"; if (opal_datatype_cuda_handle == NULL) { opal_datatype_cuda_handle = dlopen(lib, RTLD_LAZY); @@ -166,11 +168,19 @@ int32_t opal_datatype_gpu_init(void) return OPAL_ERROR; } + *(void **)(&opal_cuda_get_gpu_pack_buffer_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_get_gpu_pack_buffer"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_cuda_get_gpu_pack_buffer error: %s\n", error); + opal_cuda_get_gpu_pack_buffer_p = NULL; + return OPAL_ERROR; + } + 
(*opal_datatype_cuda_init_p)(); printf("cuda init done\n"); } return OPAL_SUCCESS; } + int32_t opal_datatype_gpu_fini(void) { if (opal_datatype_cuda_handle != NULL) { @@ -187,7 +197,22 @@ int32_t opal_datatype_gpu_fini(void) unpack_contiguous_loop_cuda_p = NULL; pack_predefined_data_cuda_p = NULL; opal_cuda_sync_device_p = NULL; + opal_cuda_get_gpu_pack_buffer_p = NULL; printf("cuda fini done\n"); } return OPAL_SUCCESS; } + +unsigned char* opal_datatype_get_gpu_buffer(void) +{ +#if OPAL_DATATYPE_CUDA_KERNEL + if (opal_datatype_gpu_init() != OPAL_SUCCESS) { + opal_datatype_gpu_fini(); + return NULL; + } + return (*opal_cuda_get_gpu_pack_buffer_p)(); +#else + return NULL; +#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ + +} \ No newline at end of file diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h index b8dc828a0df..49060bde8d1 100644 --- a/opal/datatype/opal_datatype_gpu.h +++ b/opal/datatype/opal_datatype_gpu.h @@ -1,10 +1,11 @@ #ifndef OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED #define OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED -#define OPAL_DATATYPE_CUDA_IOV +#define OPAL_DATATYPE_CUDA_KERNEL 1 int32_t opal_datatype_gpu_init(void); int32_t opal_datatype_gpu_fini(void); +unsigned char* opal_datatype_get_gpu_buffer(void); extern void (*opal_datatype_cuda_init_p)(void); @@ -49,4 +50,6 @@ extern void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, size_t* SPACE ); extern void (*opal_cuda_sync_device_p)(void); + +extern unsigned char* (*opal_cuda_get_gpu_pack_buffer_p)(void); #endif /* OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 307eb001085..09940374ab3 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -249,9 +249,9 @@ int32_t opal_datatype_finalize( void ) /* clear all master convertors */ opal_convertor_destroy_masters(); -#if defined (OPAL_DATATYPE_CUDA) +#if 
OPAL_DATATYPE_CUDA_KERNEL opal_datatype_gpu_fini(); -#endif /* defined OPAL_DATATYPE_CUDA */ +#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index dbfc1cec12d..a9aaa6541d7 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -412,11 +412,9 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { -#if defined (OPAL_DATATYPE_CUDA_IOV) if (opal_generic_simple_pack_function_cuda_iov_p != NULL) { return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); } -#endif return 0; } diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index b569b40cd81..cad655000d6 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -599,11 +599,9 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { -#if defined (OPAL_DATATYPE_CUDA_IOV) if (opal_generic_simple_unpack_function_cuda_iov_p != NULL) { return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); } -#endif return 0; } diff --git a/opal/include/opal_config_top.h b/opal/include/opal_config_top.h index 2f5ad1adec2..1ce5267c389 100644 --- a/opal/include/opal_config_top.h +++ b/opal/include/opal_config_top.h @@ -19,8 +19,6 @@ #error "opal_config_top.h should only be included from opal_config.h" #endif -#define OPAL_DATATYPE_CUDA - /* The only purpose of this file is to undef the PACKAGE_ macros that are put in by autoconf/automake projects. 
Specifically, if you include a .h file from another project that defines these diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index bf470f4fb72..2e42d4babc8 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -71,6 +71,9 @@ #include "btl_smcuda_frag.h" #include "btl_smcuda_fifo.h" +#include "ompi/mca/pml/ob1/pml_ob1_recvreq.h" +#include "ompi/mca/pml/ob1/pml_ob1_rdmafrag.h" + #if OPAL_CUDA_SUPPORT static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base, @@ -1107,6 +1110,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, offset = (size_t) ((intptr_t) remote_address - (intptr_t) reg_ptr->base.base); remote_memory_address = (unsigned char *)reg_ptr->base.alloc_base + offset; if (0 != offset) { + printf("!!!!!!offset %d, ra %p, base %p\n", offset, (void*)remote_address, (void*)reg_ptr->base.base); opal_output(-1, "OFFSET=%d", (int)offset); } @@ -1116,18 +1120,48 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, * on the IPC event that we received. Note that we pull it from * rget_reg, not reg_ptr, as we do not cache the event. */ mca_common_wait_stream_synchronize(&rget_reg); - - rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, - "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag, - &done); - if (OPAL_SUCCESS != rc) { - /* Out of resources can be handled by upper layers. 
*/ - if (OPAL_ERR_OUT_OF_RESOURCE != rc) { - opal_output(0, "Failed to cuMemcpy GPU memory, rc=%d", rc); + + /* datatype RDMA */ + mca_pml_ob1_rdma_frag_t *frag_ob1 = cbdata; + mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag_ob1->rdma_req; + mca_bml_base_btl_t *bml_btl = frag_ob1->rdma_bml; + + if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) && + (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) { + recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; + if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { + recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; + printf("RGET NOT IMPLEMENT YET!!!!!!!!!!!!!!\n"); + struct iovec iov; + uint32_t iov_count = 1; + iov.iov_base = remote_memory_address; + iov.iov_len = size; + int rc; + size_t max_data = size; + struct opal_convertor_t *convertor = &(recvreq->req_recv.req_base.req_convertor); + // uint64_t *event = &convertor->pipeline_event[0]; + // mca_common_cuda_openeventhandle(&event, 0, (mca_mpool_common_cuda_reg_data_t*)remote_handle); + // if (mca_common_cuda_query_event(event) == OPAL_SUCCESS){ + // printf("get event\n"); + rc = opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + done = 1; + // } + } else { + recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; + rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, + "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag, + &done); + if (OPAL_SUCCESS != rc) { + /* Out of resources can be handled by upper layers. 
*/ + if (OPAL_ERR_OUT_OF_RESOURCE != rc) { + opal_output(0, "Failed to cuMemcpy GPU memory, rc=%d", rc); + } + return rc; + } } - return rc; } + if (OPAL_UNLIKELY(1 == done)) { cbfunc (btl, ep, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 0afe0dd94a2..87b4c8cce02 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -1046,6 +1046,7 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne "CUDA: cuMemGetAddressRange passed: addr=%p, size=%d, pbase=%p, psize=%d ", base, (int)size, (void *)pbase, (int)psize); } + printf("sizeof memhandle %lu, CUipcMemHandle %lu, cuEvent %lu, char %lu\n", sizeof(memHandle), sizeof(CUipcMemHandle), sizeof(CUevent), sizeof(char)); /* Store all the information in the registration */ cuda_reg->base.base = (void *)pbase; @@ -1638,6 +1639,69 @@ int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag) { return 0; } +int mca_common_cuda_geteventhandle(uint64_t **event, int n, mca_mpool_base_registration_t *newreg) +{ + // CUipcEventHandle evtHandle; + // mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)newreg; + // mca_common_cuda_construct_event_and_handle(event, (void**)&evtHandle); + // memcpy(&cuda_reg->data.pipeline_evtHandle[n], &evtHandle, sizeof(evtHandle)); + return OPAL_SUCCESS; +} + +int mca_common_cuda_create_event(uint64_t **event) +{ + CUresult result; + + result = cuFunc.cuEventCreate((CUevent *)event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", + true, OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + +int mca_common_cuda_record_event(uint64_t *event) +{ + CUresult result; + result = 
cuFunc.cuEventRecord((CUevent)event,0); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + printf("record event error %d\n", result); + return OPAL_ERROR; + } else { + return OPAL_SUCCESS; + } +} + +int mca_common_cuda_query_event(uint64_t *event) +{ + CUresult result; + result = cuFunc.cuEventQuery((CUevent)event); + if (OPAL_UNLIKELY(CUDA_SUCCESS == result)) { + return OPAL_SUCCESS; + } else if (OPAL_UNLIKELY(CUDA_ERROR_NOT_READY == result)) { + return OPAL_ERROR; + } else { + printf("query event error %d\n", result); + return OPAL_ERROR; + } +} + +int mca_common_cuda_openeventhandle(uint64_t **event, int n, mca_mpool_common_cuda_reg_data_t *handle) +{ + // CUipcEventHandle evtHandle; + // CUresult result; + // mca_mpool_common_cuda_reg_data_t *cuda_handle = (mca_mpool_common_cuda_reg_data_t*)handle; + // memcpy(&evtHandle, cuda_handle->pipeline_evtHandle[n], sizeof(evtHandle)); + // result = cuFunc.cuIpcOpenEventHandle((CUevent *)event, evtHandle); + // if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + // opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenEventHandle failed", + // true, result); + // return OPAL_ERROR; + // } + return OPAL_SUCCESS; +} + /** * Need to make sure the handle we are retrieving from the cache is still diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index c0cd59c359b..da6b86d2464 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -28,12 +28,16 @@ #define MEMHANDLE_SIZE 8 #define EVTHANDLE_SIZE 8 +typedef uint64_t cuIPCHandle[EVTHANDLE_SIZE]; + struct mca_mpool_common_cuda_reg_data_t { uint64_t memHandle[MEMHANDLE_SIZE]; uint64_t evtHandle[EVTHANDLE_SIZE]; uint64_t event; opal_ptr_t memh_seg_addr; size_t memh_seg_len; +// cuIPCHandle pipeline_evtHandle[MAX_IPC_EVENT_HANDLE]; + uint32_t pipeline_size; }; typedef struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; @@ -86,6 +90,11 @@ OPAL_DECLSPEC int mca_common_cuda_device_can_access_peer(int 
*access, int dev1, OPAL_DECLSPEC int mca_common_cuda_stage_one_init(void); OPAL_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base); OPAL_DECLSPEC void mca_common_cuda_fini(void); +OPAL_DECLSPEC int mca_common_cuda_geteventhandle(uint64_t **event, int n, mca_mpool_base_registration_t *newreg); +OPAL_DECLSPEC int mca_common_cuda_create_event(uint64_t **event); +OPAL_DECLSPEC int mca_common_cuda_record_event(uint64_t *event); +OPAL_DECLSPEC int mca_common_cuda_query_event(uint64_t *event); +OPAL_DECLSPEC int mca_common_cuda_openeventhandle(uint64_t **event, int n, mca_mpool_common_cuda_reg_data_t *handle); #if OPAL_CUDA_GDR_SUPPORT OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg); OPAL_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg); diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index d69037c8491..97db4bda506 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -14,8 +14,8 @@ # if PROJECT_OMPI - MPI_TESTS = checksum position position_noncontig ddt_test ddt_test_old ddt_raw unpack_ooo ddt_pack - MPI_CHECKS = to_self + MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw unpack_ooo ddt_pack + MPI_CHECKS = to_self ddt_pack endif TESTS = opal_datatype_test $(MPI_TESTS) @@ -29,12 +29,12 @@ unpack_ooo_LDADD = \ ddt_test_SOURCES = ddt_test.c ddt_lib.c ddt_lib.h ddt_test_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) -ddt_test_CFLAGS = -I/mnt/scratch/cuda-6.5.14/include -g -ddt_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/mnt/scratch/cuda-6.5.14/lib64 -lcudart +ddt_test_CFLAGS = -I/mnt/sw/cuda/include -g +ddt_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/mnt/sw/cuda/lib64 -lcudart -ddt_test_old_SOURCES = ddt_test_old.c ddt_lib.c ddt_lib.h -ddt_test_old_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) 
-ddt_test_old_LDADD = $(top_builddir)/ompi/libmpi.la +#ddt_test_old_SOURCES = ddt_test_old.c ddt_lib.c ddt_lib.h +#ddt_test_old_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) +#ddt_test_old_LDADD = $(top_builddir)/ompi/libmpi.la ddt_raw_SOURCES = ddt_raw.c ddt_lib.c ddt_lib.h ddt_raw_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index e5f58a5b348..6a41001a770 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -36,6 +36,7 @@ #include #include "opal/mca/common/cuda/common_cuda.h" #include "opal/runtime/opal_params.h" +#define CONVERTOR_CUDA 0x00400000 #endif /* Compile with: @@ -684,12 +685,18 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk #endif send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(pdt->super), count, psrc ) ) { printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); goto clean_and_return; } recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(pdt->super), count, pdst ) ) { printf( "Unable to create the recv convertor. 
Is the datatype committed ?\n" ); goto clean_and_return; @@ -775,7 +782,7 @@ int main( int argc, char* argv[] ) #endif opal_init_util(&argc, &argv); #if defined (DDT_TEST_CUDA) - mca_common_cuda_stage_one_init(); + // mca_common_cuda_stage_one_init(); #endif ompi_datatype_init(); @@ -807,11 +814,11 @@ int main( int argc, char* argv[] ) } printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); - pdt = upper_matrix(1000); + pdt = upper_matrix(4000); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 3; i++) { // local_copy_ddt_count(pdt, 1); - local_copy_with_convertor(pdt, 1, 1024*1024*200, 1000); + local_copy_with_convertor(pdt, 1, 1024*1024*100, 4000); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From fb1014481309941f2d5ef2a192954c3df885a387 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 22 Apr 2015 00:16:10 -0400 Subject: [PATCH 098/190] Add support for vector datatype. Add pipeline. Improve the GPU memory management. Conflicts: opal/mca/mpool/gpusm/mpool_gpusm.h opal/mca/mpool/gpusm/mpool_gpusm_module.c --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 69 ++-- ompi/mca/pml/ob1/pml_ob1_sendreq.c | 2 +- opal/datatype/cuda/opal_datatype_cuda.cu | 260 +++++++++++++- opal/datatype/cuda/opal_datatype_cuda.cuh | 16 + .../cuda/opal_datatype_cuda_internal.cuh | 25 ++ .../cuda/opal_datatype_orig_internal.h | 12 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 8 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 319 +++++++++++------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 8 +- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 181 +++++++++- opal/datatype/opal_convertor.c | 2 + opal/datatype/opal_convertor.h | 1 - opal/datatype/opal_datatype_gpu.c | 46 +++ opal/datatype/opal_datatype_gpu.h | 20 +- opal/datatype/opal_datatype_pack.c | 21 +- opal/datatype/opal_datatype_unpack.c | 21 +- opal/mca/btl/btl.h | 2 + opal/mca/btl/smcuda/btl_smcuda.c | 106 +++++- opal/mca/btl/smcuda/btl_smcuda.h | 36 ++ opal/mca/btl/smcuda/btl_smcuda_component.c | 65 ++++ 
opal/mca/common/cuda/common_cuda.c | 11 +- opal/mca/common/cuda/common_cuda.h | 9 +- opal/mca/mpool/gpusm/mpool_gpusm_module.c | 2 +- test/datatype/ddt_test.c | 22 +- 24 files changed, 1033 insertions(+), 231 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index b51fc299cdd..4361d2f5918 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -39,6 +39,7 @@ #include "opal/datatype/opal_datatype_gpu.h" #include "opal/mca/common/cuda/common_cuda.h" +#include "opal/mca/btl/smcuda/btl_smcuda.h" #define CUDA_DDT_WITH_RDMA 1 @@ -51,7 +52,7 @@ size_t mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_events( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - struct opal_convertor_t* convertor); + struct opal_convertor_t* convertor, size_t pipeline_size, int lindex); int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -107,7 +108,8 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); unsigned char *base; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); - base = opal_datatype_get_gpu_buffer(); + base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); + convertor->gpu_buffer_ptr = base; sendreq->req_send.req_bytes_packed = convertor->local_size; printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( @@ -116,22 +118,34 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, sendreq->req_send.req_bytes_packed, sendreq->req_rdma))) { - mca_pml_ob1_rdma_cuda_btl_register_events(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor); + size_t pipeline_size = convertor->local_size; struct iovec iov; int rc_dt = 0; uint32_t iov_count = 1; - iov.iov_base = NULL; - iov.iov_len = 
0; + iov.iov_base = base; + iov.iov_len = pipeline_size; size_t max_data = 0; + int seq = 0; + /* the first pack here is used to get the correct size of pipeline_size */ + /* because pack may not use the whole pipeline size */ rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - // mca_common_cuda_record_event(&convertor->pipeline_event[0]); - // uint64_t event, *ep; - // ep = &event; - // mca_common_cuda_create_event((uint64_t**)ep); - // // mca_common_cuda_record_event(ep); - // printf("success record event %d\n", event); + pipeline_size = max_data; + int lindex = mca_btl_smcuda_alloc_cuda_dt_clone(); + assert(lindex >= 0); + mca_pml_ob1_rdma_cuda_btl_register_events(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, pipeline_size, lindex); + mca_btl_smcuda_cuda_dt_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, pipeline_size, lindex); + rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); + + mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); + while (rc_dt != 1) { + iov.iov_base += pipeline_size; + seq ++; + rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); + } + mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, -1); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { mca_pml_ob1_free_rdma_resources(sendreq); } @@ -203,24 +217,23 @@ size_t mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_events( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - struct opal_convertor_t* convertor) + struct opal_convertor_t* convertor, size_t pipeline_size, int lindex) { - // uint32_t i, j; - // for (i = 0; i < num_btls_used; i++) { - // mca_btl_base_registration_handle_t *handle = rdma_btls[i].btl_reg; - // mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t *) - // ((intptr_t) handle - 
offsetof (mca_mpool_common_cuda_reg_t, data)); - // printf("base %p\n", cuda_reg->base.base); - // for (j = 0; j < MAX_IPC_EVENT_HANDLE; j++) { - // uint64_t *event = &convertor->pipeline_event[j]; - // convertor->pipeline_event[j] = 0; - // mca_common_cuda_geteventhandle(&event, j, (mca_mpool_base_registration_t *)cuda_reg); - // convertor->pipeline_event[j] = *event; - // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); - // } - // cuda_reg->data.pipeline_size = 1000; - // - // } + uint32_t i, j; + for (i = 0; i < num_btls_used; i++) { + mca_btl_base_registration_handle_t *handle = rdma_btls[i].btl_reg; + mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t *) + ((intptr_t) handle - offsetof (mca_mpool_common_cuda_reg_t, data)); + // printf("base %p\n", cuda_reg->base.base); + // for (j = 0; j < MAX_IPC_EVENT_HANDLE; j++) { + // mca_common_cuda_geteventhandle(&convertor->pipeline_event[j], j, (mca_mpool_base_registration_t *)cuda_reg); + // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); + // } + printf("i send pipeline %ld\n", pipeline_size); + cuda_reg->data.pipeline_size = pipeline_size; + cuda_reg->data.lindex = lindex; + + } return 0; } diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index 50b11d36dff..78b7188cdbb 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -679,7 +679,7 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq, if ( (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA)) { sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == true) { - data_ptr = sendreq->req_send.req_base.req_convertor.gpu_buffer_ptr_source; + data_ptr = sendreq->req_send.req_base.req_convertor.gpu_buffer_ptr; printf("START RMDA data_ptr %p\n", data_ptr); } else { opal_convertor_get_current_pointer 
(&sendreq->req_send.req_base.req_convertor, &data_ptr); diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 1debbd221a5..387f75583ce 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -3,6 +3,7 @@ #include #include #include +#include #include /* @@ -39,6 +40,9 @@ OPAL_DECLSPEC const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PR /***** my variables ********/ + +ddt_cuda_list_t *cuda_free_list; +ddt_cuda_device_t *cuda_device; ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; unsigned char *pBaseBuf_GPU, *gpu_src_const, *gpu_dest_const; unsigned char *ddt_cuda_pack_buffer, *ddt_cuda_unpack_buffer; @@ -54,12 +58,172 @@ uint8_t opal_datatype_cuda_debug; //uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; + +static inline ddt_cuda_buffer_t* obj_ddt_cuda_buffer_new() +{ + ddt_cuda_buffer_t *p = (ddt_cuda_buffer_t *)malloc(sizeof(ddt_cuda_buffer_t)); + p->next = NULL; + p->prev = NULL; + p->size = 0; + p->gpu_addr = NULL; + return p; +} + +static inline void obj_ddt_cuda_buffer_chop(ddt_cuda_buffer_t *p) +{ + p->next = NULL; + p->prev = NULL; +} + +static inline void obj_ddt_cuda_buffer_reset(ddt_cuda_buffer_t *p) +{ + p->size = 0; + p->gpu_addr = NULL; +} + +static ddt_cuda_list_t* init_cuda_free_list() +{ + ddt_cuda_list_t *list = NULL; + ddt_cuda_buffer_t *p, *prev; + int i; + list = (ddt_cuda_list_t *)malloc(sizeof(ddt_cuda_list_t)); + p = obj_ddt_cuda_buffer_new(); + list->head = p; + prev = p; + for (i = 1; i < DT_CUDA_FREE_LIST_SIZE; i++) { + p = obj_ddt_cuda_buffer_new(); + prev->next = p; + p->prev = prev; + prev = p; + } + list->tail = p; + list->nb_elements = DT_CUDA_FREE_LIST_SIZE; + return list; +} + +static inline ddt_cuda_buffer_t* cuda_list_pop_tail(ddt_cuda_list_t *list) +{ + ddt_cuda_buffer_t *p = NULL; + p = list->tail; + if (p == NULL) { + return p; + } else { + list->nb_elements --; + if (list->head == p) { + list->head = NULL; + 
list->tail = NULL; + } else { + list->tail = p->prev; + p->prev->next = NULL; + obj_ddt_cuda_buffer_chop(p); + } + return p; + } +} + +static inline void cuda_list_push_head(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item) +{ + ddt_cuda_buffer_t * orig_head = list->head; + assert(item->next == NULL && item->prev == NULL); + list->head = item; + item->next = orig_head; + if (orig_head == NULL) { + list->tail = item; + } + list->nb_elements ++; +} + +static inline void cuda_list_push_tail(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item) +{ + ddt_cuda_buffer_t * orig_tail = list->tail; + assert(item->next == NULL && item->prev == NULL); + list->tail = item; + item->prev = orig_tail; + if (orig_tail == NULL) { + list->head = item; + } + list->nb_elements ++; +} + +static inline void cuda_list_delete(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item) +{ + if (item->prev == NULL && item->next == NULL) { + list->head = NULL; + list->tail = NULL; + }else if (item->prev == NULL && item->next != NULL) { + list->head = item->next; + item->next->prev = NULL; + } else if (item->next == NULL && item->prev != NULL) { + list->tail = item->prev; + item->prev->next = NULL; + } else { + item->prev->next = item->next; + item->next->prev = item->prev; + } + list->nb_elements --; + obj_ddt_cuda_buffer_chop(item); +} + +static inline void cuda_list_insert_before(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item, ddt_cuda_buffer_t *next) +{ + assert(item->next == NULL && item->prev == NULL); + item->next = next; + item->prev = next->prev; + next->prev = item; + if (list->head == next) { + list->head = item; + } + list->nb_elements ++; +} + +static inline void cuda_list_item_merge_by_addr(ddt_cuda_list_t *list) +{ + ddt_cuda_buffer_t *ptr = NULL; + ddt_cuda_buffer_t *next = NULL; + ptr = list->head; + while(ptr != NULL) { + next = ptr->next; + if (next == NULL) { + break; + } else if ((ptr->gpu_addr + ptr->size) == next->gpu_addr) { + ptr->size += next->size; + cuda_list_delete(list, next); + } 
else { + ptr = ptr->next; + } + } +} + void opal_datatype_cuda_init(void) { uint32_t i; - int cuda_device = OPAL_GPU_INDEX; - cudaSetDevice(cuda_device); + int device = OPAL_GPU_INDEX; + cudaSetDevice(device); + + cuda_free_list = init_cuda_free_list(); + + /* init device */ + cuda_device = (ddt_cuda_device_t *)malloc(sizeof(ddt_cuda_device_t)*1); + for (i = 0; i < 1; i++) { + unsigned char *gpu_ptr = NULL; + if (cudaMalloc((void **)(&gpu_ptr), sizeof(char)*DT_CUDA_BUFFER_SIZE) != cudaSuccess) { + DT_CUDA_DEBUG( opal_cuda_output( 0, "cudaMalloc is failed in GPU %d\n", i); ); + } + cudaMemset(gpu_ptr, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); + cuda_device[i].gpu_buffer = gpu_ptr; + + cuda_device[i].buffer_free_size = DT_CUDA_BUFFER_SIZE; + ddt_cuda_buffer_t *p = obj_ddt_cuda_buffer_new(); + p->size = DT_CUDA_BUFFER_SIZE; + p->gpu_addr = gpu_ptr; + cuda_device[i].buffer_free.head = p; + cuda_device[i].buffer_free.tail = cuda_device[i].buffer_free.head; + + cuda_device[i].buffer_used.head = NULL; + cuda_device[i].buffer_used.tail = NULL; + cuda_device[i].buffer_used_size = 0; + } cudaMalloc((void **)&cuda_desc_d, sizeof(ddt_cuda_desc_t)); cudaMallocHost((void **)&cuda_desc_h, sizeof(ddt_cuda_desc_t)); @@ -72,11 +236,12 @@ void opal_datatype_cuda_init(void) // cuda_desc_h->iov[i].iov_base = iov_base; // cuda_desc_h->iov[i].iov_len = IOV_LEN; // } - printf("malloc cuda packing buffer\n"); + cudaMalloc((void **)(&ddt_cuda_pack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); + printf("malloc cuda packing buffer, %p\n", ddt_cuda_pack_buffer); cudaMemset(ddt_cuda_pack_buffer, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); - printf("malloc cuda unpacking buffer\n"); cudaMalloc((void **)(&ddt_cuda_unpack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); + printf("malloc cuda unpacking buffer, %p\n", ddt_cuda_unpack_buffer); cudaMemset(ddt_cuda_unpack_buffer, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); cuda_desc_h->iov[0].iov_base = ddt_cuda_pack_buffer; @@ -196,6 +361,93 @@ unsigned char* 
opal_cuda_get_gpu_pack_buffer() } } +void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) +{ + ddt_cuda_device_t *device = &cuda_device[gpu_id]; + if (device->buffer_free_size < size) { + return NULL; + } + ddt_cuda_buffer_t *ptr = NULL; + void *addr = NULL; + ptr = device->buffer_free.head; + while (ptr != NULL) { + if (ptr->size >= size) { + addr = ptr->gpu_addr; + ptr->size -= size; + if (ptr->size == 0) { + cuda_list_delete(&device->buffer_free, ptr); + obj_ddt_cuda_buffer_reset(ptr); + cuda_list_push_head(cuda_free_list, ptr); + } else { + ptr->gpu_addr += size; + } + break; + } + ptr = ptr->next; + } + + if (ptr == NULL) { + return NULL; + } else { + ddt_cuda_buffer_t *p = cuda_list_pop_tail(cuda_free_list); + if (p == NULL) { + p = obj_ddt_cuda_buffer_new(); + } + p->size = size; + p->gpu_addr = (unsigned char*)addr; + cuda_list_push_head(&device->buffer_used, p); + device->buffer_used_size += size; + device->buffer_free_size -= size; + DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc GPU buffer %p.\n", addr); ); + return addr; + } +} + +void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) +{ + ddt_cuda_device_t *device = &cuda_device[gpu_id]; + ddt_cuda_buffer_t *ptr = NULL; + ddt_cuda_buffer_t *ptr_next = NULL; + ptr = device->buffer_used.head; + while (ptr != NULL) { + if (ptr->gpu_addr == addr) { + cuda_list_delete(&device->buffer_used, ptr); + ptr_next = device->buffer_free.head; + while (ptr_next != NULL) { + if (ptr_next->gpu_addr > addr) { + break; + } + ptr_next = ptr_next->next; + } + if (ptr_next == NULL) { + /* buffer_free is empty, or insert to last one */ + cuda_list_push_tail(&device->buffer_free, ptr); + } else { + cuda_list_insert_before(&device->buffer_free, ptr, ptr_next); + } + cuda_list_item_merge_by_addr(&device->buffer_free); + device->buffer_free_size += ptr->size; + break; + } + ptr = ptr->next; + } + if (ptr == NULL) { + DT_CUDA_DEBUG( opal_cuda_output( 0, "addr %p is not managed.\n", addr); ); + } + DT_CUDA_DEBUG( 
opal_cuda_output( 0, "Free GPU buffer %p.\n", addr); ); +} + +void opal_dump_cuda_list(ddt_cuda_list_t *list) +{ + ddt_cuda_buffer_t *ptr = NULL; + ptr = list->head; + DT_CUDA_DEBUG( opal_cuda_output( 0, "DUMP cuda list %p, nb_elements %d\n", list, list->nb_elements); ); + while (ptr != NULL) { + DT_CUDA_DEBUG( opal_cuda_output( 0, "\titem addr %p, size %ld.\n", ptr->gpu_addr, ptr->size); ); + ptr = ptr->next; + } +} + /* from internal.h*/ void opal_cuda_output(int output_id, const char *format, ...) { diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 5797ceb55d8..04dd5f88a26 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -13,6 +13,11 @@ int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, uint32_t* out_size, size_t* max_data ); +int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -27,6 +32,11 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert struct iovec* iov, uint32_t* out_size, size_t* max_data ); + +int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -50,6 +60,12 @@ void opal_cuda_sync_device(void); int32_t opal_cuda_is_gpu_buffer(const void *ptr); +void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); + +void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); + +void opal_dump_cuda_list(ddt_cuda_list_t *list); + unsigned char* opal_cuda_get_gpu_pack_buffer(); } diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 
be264484153..567e81218ec 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -18,6 +18,7 @@ #define IOV_ARRAY_SIZE 1 #define DT_CUDA_BUFFER_SIZE 1024*1024*200 +#define DT_CUDA_FREE_LIST_SIZE 50 #define THREAD_PER_BLOCK 32 #define CUDA_WARP_SIZE 32 @@ -76,6 +77,30 @@ typedef struct { uint32_t nb_tasks; } ddt_cuda_iov_dist_t; +typedef struct ddt_cuda_buffer{ + unsigned char* gpu_addr; + size_t size; + struct ddt_cuda_buffer *next; + struct ddt_cuda_buffer *prev; +} ddt_cuda_buffer_t; + +typedef struct { + ddt_cuda_buffer_t *head; + ddt_cuda_buffer_t *tail; + size_t nb_elements; +} ddt_cuda_list_t; + +typedef struct { + int device_id; + unsigned char* gpu_buffer; + ddt_cuda_list_t buffer_free; + ddt_cuda_list_t buffer_used; + size_t buffer_free_size; + size_t buffer_used_size; +} ddt_cuda_device_t; + +extern ddt_cuda_list_t *cuda_free_list; +extern ddt_cuda_device_t *cuda_device; extern ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; extern unsigned char* pBaseBuf_GPU; extern unsigned char *ddt_cuda_pack_buffer, *ddt_cuda_unpack_buffer; diff --git a/opal/datatype/cuda/opal_datatype_orig_internal.h b/opal/datatype/cuda/opal_datatype_orig_internal.h index 37b1d1be51b..90561359f75 100644 --- a/opal/datatype/cuda/opal_datatype_orig_internal.h +++ b/opal/datatype/cuda/opal_datatype_orig_internal.h @@ -326,7 +326,6 @@ struct opal_convertor_t { void * stream; /**< CUstream for async copy */ unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ - unsigned char * gpu_buffer_ptr_source; /**< source address of GPU buffer start to pack, update in packing function */ uint64_t * pipeline_event[MAX_IPC_EVENT_HANDLE]; /**< cuda event for pipeline */ #endif /* size: 248, cachelines: 4, members: 20 */ @@ -531,13 +530,10 @@ do { \ #define PUSH_STACK( PSTACK, STACK_POS, INDEX, TYPE, COUNT, DISP) \ do { \ - dt_stack_t* pTempStack = (PSTACK) + 1; \ - if (threadIdx.x == 0) { \ - SAVE_STACK( pTempStack, 
(INDEX), (TYPE), (COUNT), (DISP) ); \ - } \ - __syncthreads(); \ - (STACK_POS)++; \ - (PSTACK) = pTempStack; \ + dt_stack_t* pTempStack = (PSTACK) + 1; \ + SAVE_STACK( pTempStack, (INDEX), (TYPE), (COUNT), (DISP) ); \ + (STACK_POS)++; \ + (PSTACK) = pTempStack; \ } while(0) #define UPDATE_INTERNAL_COUNTERS( DESCRIPTION, POSITION, ELEMENT, COUNTER ) \ diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 98208dc0f39..96bdc12d961 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -547,10 +547,10 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (_i % nb_elements == 0 ) { - DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => _i %d, actual _i %d, count %d\n", - tid, _destination_tmp, _source_tmp, (unsigned long)size, _i/nb_elements, _i, copy_loops ); - } + // if (_i % nb_elements == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => _i %d, actual _i %d, count %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)size, _i/nb_elements, _i, copy_loops ); + // } // if (_i / nb_elements ==1 && tid == 0 ) { // DBGPRINT("tid %d, pack 3. 
memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 14fdcfca346..a5963b74d3f 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -168,6 +168,167 @@ int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, } +int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_packed = 0; /* total amount packed this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + const opal_datatype_t *pData = pConvertor->pDesc; + unsigned char *conv_ptr, *iov_ptr; + size_t iov_len_local; + uint32_t iov_count; + uint8_t transfer_required; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack( %p:%p, {%p, %lu}, %d )\n", + (void*)pConvertor, (void*)pConvertor->pBaseBuf, + iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); + + printf("I am in simple pack vector, max_data %lu, iov_len %lu\n", *max_data, iov[0].iov_len); + description = pConvertor->use_desc->desc; + + /* For the first step we have to add both displacement to the source. After in the + * main while loop we will set back the conv_ptr to the correct value. 
This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + pStack--; + pConvertor->stack_pos--; + pElem = &(description[pos_desc]); + + DT_CUDA_DEBUG( opal_cuda_output( 0, "pack start pos_desc %d count_desc %d disp %ld\n" + "stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + + + for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { + if ((iov[0].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + // if (iov[0].iov_len == 0) { + // buffer_size = DT_CUDA_BUFFER_SIZE; + // } else { + // buffer_size = iov[0].iov_len; + // } + pConvertor->gpu_buffer_ptr = ddt_cuda_pack_buffer; + + if (iov[iov_count].iov_base == NULL) { + iov[iov_count].iov_base = ddt_cuda_pack_buffer; + iov_ptr = ddt_cuda_pack_buffer; + } else { + iov_ptr = (unsigned char *)iov[iov_count].iov_base; + } + transfer_required = 0; + } else { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 1; + } + iov_ptr = ddt_cuda_pack_buffer; + iov_len_local = iov[iov_count].iov_len; + printf("original local %d\n", iov_len_local); + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + /* should not go into here */ + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + DT_CUDA_DEBUG( opal_cuda_output( 1, "pack end_loop count %d stack_pos %d" + " pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, + pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + if( --(pStack->count) == 0 ) { /* end of loop */ + if( 0 == pConvertor->stack_pos ) { + /* we lie about 
the size of the next element in order to + * make sure we exit the main loop. + */ + *out_size = iov_count; + goto complete_loop; /* completed */ + } + pConvertor->stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + pStack->disp += (pData->ub - pData->lb); + } else { + assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DT_CUDA_DEBUG( opal_cuda_output( 1, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + count_desc = 0; + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. 
*/ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + } + complete_loop: + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_packed += iov[iov_count].iov_len; + printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[iov_count].iov_base, ddt_cuda_pack_buffer, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: DtoH memcpy in %ld microsec\n", total_time ); +#endif + } + *max_data = total_packed; + pConvertor->bConverted += total_packed; /* update the already converted bytes */ + pConvertor->bConverted = pConvertor->local_size; + *out_size = iov_count; + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + printf("total packed %lu\n", pConvertor->bConverted); + return 1; + } + /* Save the global position for the next round */ + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, + conv_ptr - pConvertor->pBaseBuf ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + return 0; +} + void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, @@ -187,8 +348,8 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); #if 
!defined(OPAL_DATATYPE_CUDA_DRY_RUN) - _source = pBaseBuf_GPU; - _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; + // _source = pBaseBuf_GPU; + // _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; #endif tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; @@ -205,105 +366,6 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, cudaDeviceSynchronize(); } - -// int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, -// struct iovec* iov, -// uint32_t* out_size, -// size_t* max_data ) -// { -// uint32_t i; -// uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, j, dst_offset; -// uint32_t nb_blocks, thread_per_block; -// dt_elem_desc_t* description; -// size_t length; -// -// // return -99; -// -// cuda_iov_count = 4000; -// opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); -// printf("iov count %d, length %d\n", cuda_iov_count, length); -// -// description = pConvertor->use_desc->desc; -// current_block = 0; -// task_iteration = 0; -// dst_offset = 0; -// thread_per_block = CUDA_WARP_SIZE * 4; -// nb_blocks = 512; -// for (i = 0; i < cuda_iov_count; i++) { -// count_desc = cuda_iov[i].iov_len / sizeof(double); -// // printf("i = %d\t, iov_base %p\t, iov_len %ld\t, count %d\n", i, cuda_iov[i].iov_base, cuda_iov[i].iov_len, count_desc); -// nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; -// for (j = 0; j < nb_blocks_per_description; j++) { -// description_dist_h[current_block].description_index[task_iteration] = i; -// description_dist_h[current_block].description_local_index[task_iteration] = j; -// description_dist_h[current_block].dst_offset[task_iteration] = dst_offset; -// description_dist_h[current_block].description_used = task_iteration + 1; -// if ( (j+1) * thread_per_block <= count_desc) { -// dst_offset += thread_per_block; -// } else { -// dst_offset += thread_per_block - ((j+1)*thread_per_block - count_desc); -// } -// 
current_block += 1; -// if (current_block >= nb_blocks) { -// current_block = 0; -// task_iteration ++; -// } -// } -// } -// -// uint32_t pos_desc; -// dt_elem_desc_t* pElem; -// // for (i = 0; i < nb_blocks; i++) { -// // printf("block %d\t, used %d\n", i, description_dist_h[i].description_used); -// // for (j = 0; j < description_dist_h[i].description_used; j++) { -// // pos_desc = description_dist_h[i].description_index[j]; -// // pElem = &(description[pos_desc]); -// // printf("i %d\t, descp_pos %d\t, local_index %d\t, count %d\t, dst offset %d\n", j, description_dist_h[i].description_index[j], description_dist_h[i].description_local_index[j], pElem->elem.count, description_dist_h[i].dst_offset[j]); -// // } -// // } -// -// cudaMemcpy(description_dist_d, description_dist_h, sizeof(ddt_cuda_description_dist_t)*(nb_blocks), cudaMemcpyHostToDevice); -// -// if (cuda_desc_h->description_max_count != 0) { -// if (cuda_desc_h->description_max_count >= (pConvertor->use_desc->used+1)) { -// cuda_desc_h->description_count = pConvertor->use_desc->used+1; -// } else { -// cudaFree(cuda_desc_h->description); -// cuda_desc_h->description = NULL; -// cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); -// description_d = cuda_desc_h->description; -// cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; -// cuda_desc_h->description_count = pConvertor->use_desc->used+1; -// } -// -// } else { -// cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); -// description_d = cuda_desc_h->description; -// cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; -// cuda_desc_h->description_count = pConvertor->use_desc->used+1; -// } -// cudaMemcpy(description_d, description, sizeof(dt_elem_desc_t)*(cuda_desc_h->description_count), cudaMemcpyHostToDevice); -// -// unsigned char* pBaseBuf; -// #if defined(OPAL_DATATYPE_CUDA_DRY_RUN) -// pBaseBuf = 
pConvertor->pBaseBuf; -// #else -// pBaseBuf = pBaseBuf_GPU; -// #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ -// -// for (i = 0; i < *out_size; i++) { -// #if defined (OPAL_DATATYPE_CUDA_DRY_RUN) -// cuda_desc_h->iov[i].iov_base = iov[i].iov_base; -// #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ -// cuda_desc_h->iov[i].iov_len = iov[i].iov_len; -// } -// -// opal_generic_simple_pack_cuda_iov_kernel<<>>(description_dist_d, description_d, current_block, cuda_desc_h->iov, pBaseBuf); -// cudaDeviceSynchronize(); -// -// return 1; -// } - int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -313,10 +375,10 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, residue_desc; uint32_t nb_blocks, thread_per_block; size_t length, buffer_size, length_per_iovec, dst_offset; - unsigned char *destination; + unsigned char *destination, *destination_tmp; size_t total_packed, total_converted; int32_t complete_flag = 0; - uint8_t buffer_isfull = 0, transfer_required; + uint8_t buffer_isfull = 0, transfer_required, free_required; uint32_t convertor_flags; dt_elem_desc_t* description; dt_elem_desc_t* pElem; @@ -337,13 +399,6 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype packing using iovec\n"); ); -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - destination = (unsigned char*)iov[0].iov_base; -#else -// pConvertor->pBaseBuf = pBaseBuf_GPU; - printf("Pack GPU base %p, gpu_buffer %p\n", pConvertor->pBaseBuf, ddt_cuda_pack_buffer); - destination = ddt_cuda_pack_buffer; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -354,7 +409,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor pElem = &(description[pStack->index]); printf("size elem %lu, size %d\n", 
pElem->elem.common.type, opal_datatype_basicDatatypesSize[pElem->elem.common.type]); - assert(opal_datatype_basicDatatypesSize[pElem->elem.common.type] != 0); +// assert(opal_datatype_basicDatatypesSize[pElem->elem.common.type] != 0); printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); if ((iov[0].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[0].iov_base)) { @@ -363,24 +418,34 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } else { buffer_size = iov[0].iov_len; } - pConvertor->gpu_buffer_ptr = ddt_cuda_pack_buffer; - pConvertor->gpu_buffer_ptr_source = pConvertor->gpu_buffer_ptr + pConvertor->bConverted; if (iov[0].iov_base == NULL) { - iov[0].iov_base = ddt_cuda_pack_buffer; - destination = ddt_cuda_pack_buffer; + iov[0].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char *)iov[0].iov_base; + free_required = 1; } else { destination = (unsigned char *)iov[0].iov_base; + free_required = 0; } transfer_required = 0; + pConvertor->gpu_buffer_ptr = destination; } else { buffer_size = iov[0].iov_len; - pConvertor->gpu_buffer_ptr = NULL; - pConvertor->gpu_buffer_ptr_source = NULL; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(buffer_size, 0); + } transfer_required = 1; + free_required = 1; +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + destination = (unsigned char*)iov[0].iov_base; +#else + destination = pConvertor->gpu_buffer_ptr; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ } - printf("start packing from %p\n", destination); + destination_tmp = destination; + + DT_CUDA_DEBUG ( opal_cuda_output(0, "Pack GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); cuda_iov_count = 1000; total_packed = 0; @@ -446,7 +511,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor alignment = ALIGNMENT_CHAR; } - // alignment = ALIGNMENT_CHAR; + alignment = 
ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; @@ -498,18 +563,10 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: iov is prepared in %ld microsec, cudaMemcpy will be submit to CUDA stream %d\n", total_time, cuda_streams->current_stream_id); + printf( "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d\n", destination_tmp, total_time, cuda_streams->current_stream_id); #endif cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); - - for (i = 0; i < *out_size; i++) { -#if defined (OPAL_DATATYPE_CUDA_DRY_RUN) - cuda_desc_h->iov[i].iov_base = iov[i].iov_base; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - cuda_desc_h->iov[i].iov_len = iov[i].iov_len; - } - opal_generic_simple_pack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current); cuda_streams->current_stream_id ++; cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; @@ -541,7 +598,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor GET_TIME(start); #endif if (transfer_required) { - cudaMemcpy(iov[0].iov_base, ddt_cuda_pack_buffer, total_packed, cudaMemcpyDeviceToHost); + cudaMemcpy(iov[0].iov_base, destination_tmp, total_packed, cudaMemcpyDeviceToHost); } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -568,6 +625,10 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); 
+ pConvertor->gpu_buffer_ptr = NULL; + } return 1; } return 0; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 0ae85e22eef..35a4ff73078 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -308,10 +308,10 @@ __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { _destination_tmp = _dst_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (_i % nb_elements == 0 ) { - DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => _i %d, actual _i %d\n", - tid, _destination_tmp, _source_tmp, (unsigned long)size, _i/nb_elements, _i ); - } + // if (_i % nb_elements == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)size, _i/nb_elements, _i ); + // } // if (_i / nb_elements ==1 && tid == 0 ) { // DBGPRINT("tid %d, pack 3. 
memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index dccf9f23e82..e1f96ea6a2f 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -107,6 +107,147 @@ int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ } +int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_unpacked = 0; /* total size unpacked this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + const opal_datatype_t *pData = pConvertor->pDesc; + unsigned char *conv_ptr, *iov_ptr; + size_t iov_len_local; + uint32_t iov_count; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + printf("i am in simple unpack vector, max_data %lu, iov len %lu\n", *max_data, iov[0].iov_len); + DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u )\n", + (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ) + + description = pConvertor->use_desc->desc; + + /* For the first step we have to add both displacement to the source. After in the + * main while loop we will set back the source_base to the correct value. 
This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + pStack--; + pConvertor->stack_pos--; + pElem = &(description[pos_desc]); + + DT_CUDA_DEBUG( opal_cuda_output( 0, "unpack start pos_desc %d count_desc %d disp %ld\n" + "stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); + + for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { + iov_ptr = ddt_cuda_unpack_buffer; +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + } else { + cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: HtoD memcpy in %ld microsec\n", total_time ); +#endif + iov_len_local = iov[iov_count].iov_len; + if( 0 != pConvertor->partial_length ) { + /* not support yet */ + } + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + /* should not go to here */ + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + DT_CUDA_DEBUG( opal_cuda_output( 0, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + (long)pStack->disp, (unsigned long)iov_len_local ); ); + if( --(pStack->count) == 0 ) { /* end of loop */ + if( 0 == pConvertor->stack_pos ) { + /* Do the same thing as when the loop is completed */ + iov[iov_count].iov_len -= 
iov_len_local; /* update the amount of valid data */ + total_unpacked += iov[iov_count].iov_len; + iov_count++; /* go to the next */ + goto complete_conversion; + } + pConvertor->stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + pStack->disp += (pData->ub - pData->lb); + } else { + assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DT_CUDA_DEBUG( opal_cuda_output( 0, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + count_desc = 0; + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. 
*/ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + } + complete_loop: + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_unpacked += iov[iov_count].iov_len; + } + complete_conversion: + *max_data = total_unpacked; + pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ + *out_size = iov_count; + pConvertor->bConverted = pConvertor->local_size; + if( pConvertor->bConverted == pConvertor->remote_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + printf("total unpacked %lu\n", pConvertor->bConverted); + return 1; + } + /* Save the global position for the next round */ + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, + conv_ptr - pConvertor->pBaseBuf ); + DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + return 0; +} + int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -116,10 +257,11 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, dst_offset, residue_desc; uint32_t nb_blocks, thread_per_block; size_t length, buffer_size, length_per_iovec; - unsigned char *source; + unsigned char *source, *source_tmp; size_t total_unpacked, total_converted; int32_t complete_flag = 0; uint8_t buffer_isfull = 0; + uint8_t free_required = 0; uint32_t convertor_flags; dt_elem_desc_t* description; dt_elem_desc_t* 
pElem; @@ -145,16 +287,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype UNpacking using iovec\n"); ); -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - source = (unsigned char*)iov[0].iov_base; -#else -// pConvertor->pBaseBuf = pBaseBuf_GPU; - // printf("Unpack GPU base %p, iov buffer %p\n", pConvertor->pBaseBuf, iov[0].iov_base); - source = ddt_cuda_unpack_buffer; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - // double *vtmp = (double *)iov[0].iov_base; - printf("recevied unpacked iov buffer, len %d\n", iov[0].iov_len); // for (uint32_t i = 0; i < iov[0].iov_len/sizeof(double); i++) { // printf(" %1.f ", *vtmp); // vtmp ++; @@ -165,9 +298,23 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif if (opal_cuda_is_gpu_buffer(iov[0].iov_base)) { source = (unsigned char*)iov[0].iov_base; - } else { + free_required = 0; + } else { +#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) + source = (unsigned char*)iov[0].iov_base; +#else + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + } + source = pConvertor->gpu_buffer_ptr; +#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + free_required = 1; } + + source_tmp = source; + + DT_CUDA_DEBUG ( opal_cuda_output(0, "UNpack GPU base %p, unpack from buffer %p, total size %ld\n", pConvertor->pBaseBuf, source, iov[0].iov_len); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -231,7 +378,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert alignment = ALIGNMENT_CHAR; } - // alignment = ALIGNMENT_CHAR; + alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; @@ -283,7 +430,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( 
opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: iov is prepared in %ld microsec, cudaMemcpy will be submit to CUDA stream %d\n", total_time, cuda_streams->current_stream_id); + printf( "[Timing]: UNpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d\n", source_tmp, total_time, cuda_streams->current_stream_id); #endif cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); @@ -326,6 +473,10 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } return 1; } return 0; @@ -349,8 +500,8 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); - _destination = pBaseBuf_GPU; - _source = (unsigned char*)cuda_desc_h->iov[0].iov_base; + // _destination = pBaseBuf_GPU; + // _source = (unsigned char*)cuda_desc_h->iov[0].iov_base; tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 3f35a0e6b41..e74a1d67883 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -593,6 +593,7 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, } else { if ((convertor->flags & CONVERTOR_CUDA) && OPAL_DATATYPE_CUDA_KERNEL) { convertor->fAdvance = opal_generic_simple_unpack_cuda; + convertor->gpu_buffer_ptr = NULL; } else { convertor->fAdvance = 
opal_generic_simple_unpack; } @@ -639,6 +640,7 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, } else { if ((convertor->flags & CONVERTOR_CUDA) && OPAL_DATATYPE_CUDA_KERNEL ) { convertor->fAdvance = opal_generic_simple_pack_cuda; + convertor->gpu_buffer_ptr = NULL; } else { convertor->fAdvance = opal_generic_simple_pack; } diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 6ed9e311d84..1ee0c010e63 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -113,7 +113,6 @@ struct opal_convertor_t { void * stream; /**< CUstream for async copy */ unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ - unsigned char * gpu_buffer_ptr_source; /**< source address of GPU buffer start to pack, update in packing function */ uint64_t * pipeline_event[MAX_IPC_EVENT_HANDLE]; /**< cuda event for pipeline */ #endif /* size: 248, cachelines: 4, members: 20 */ diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c index f8c4785994d..c136a55ea71 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -62,6 +62,16 @@ int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pCo struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; + +int32_t (*opal_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) = NULL; + +int32_t (*opal_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) = NULL; void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -85,6 +95,10 @@ void (*opal_cuda_sync_device_p)(void) = NULL; unsigned char* (*opal_cuda_get_gpu_pack_buffer_p)(void) = NULL; +void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id) = NULL; + +void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id) = 
NULL; + int32_t opal_datatype_gpu_init(void) { char *error; @@ -140,6 +154,20 @@ int32_t opal_datatype_gpu_init(void) return OPAL_ERROR; } + *(void **)(&opal_generic_simple_pack_function_cuda_vector_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_pack_function_cuda_vector"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_generic_simple_pack_function_cuda_vector error: %s\n", error); + opal_generic_simple_pack_function_cuda_vector_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_generic_simple_unpack_function_cuda_vector_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_unpack_function_cuda_vector"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_generic_simple_unpack_function_cuda_vector error: %s\n", error); + opal_generic_simple_unpack_function_cuda_vector_p = NULL; + return OPAL_ERROR; + } + *(void **)(&pack_contiguous_loop_cuda_p) = dlsym(opal_datatype_cuda_handle, "pack_contiguous_loop_cuda"); if ((error = dlerror()) != NULL) { fprintf(stderr, "pack_contiguous_loop_cuda error: %s\n", error); @@ -175,6 +203,20 @@ int32_t opal_datatype_gpu_init(void) return OPAL_ERROR; } + *(void **)(&opal_cuda_free_gpu_buffer_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_free_gpu_buffer"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_cuda_free_gpu_buffer error: %s\n", error); + opal_cuda_free_gpu_buffer_p = NULL; + return OPAL_ERROR; + } + + *(void **)(&opal_cuda_malloc_gpu_buffer_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_malloc_gpu_buffer"); + if ((error = dlerror()) != NULL) { + fprintf(stderr, "opal_cuda_malloc_gpu_buffer error: %s\n", error); + opal_cuda_malloc_gpu_buffer_p = NULL; + return OPAL_ERROR; + } + (*opal_datatype_cuda_init_p)(); printf("cuda init done\n"); } @@ -193,11 +235,15 @@ int32_t opal_datatype_gpu_fini(void) opal_generic_simple_unpack_function_cuda_p = NULL; opal_generic_simple_pack_function_cuda_iov_p = NULL; opal_generic_simple_unpack_function_cuda_iov_p = NULL; + 
opal_generic_simple_pack_function_cuda_vector_p = NULL; + opal_generic_simple_unpack_function_cuda_vector_p = NULL; pack_contiguous_loop_cuda_p = NULL; unpack_contiguous_loop_cuda_p = NULL; pack_predefined_data_cuda_p = NULL; opal_cuda_sync_device_p = NULL; opal_cuda_get_gpu_pack_buffer_p = NULL; + opal_cuda_free_gpu_buffer_p = NULL; + opal_cuda_malloc_gpu_buffer_p = NULL; printf("cuda fini done\n"); } return OPAL_SUCCESS; diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h index 49060bde8d1..8ae90cde92f 100644 --- a/opal/datatype/opal_datatype_gpu.h +++ b/opal/datatype/opal_datatype_gpu.h @@ -26,10 +26,20 @@ extern int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t uint32_t* out_size, size_t* max_data ); +extern int32_t (*opal_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + extern int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +extern int32_t (*opal_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); extern void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -52,4 +62,8 @@ extern void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, extern void (*opal_cuda_sync_device_p)(void); extern unsigned char* (*opal_cuda_get_gpu_pack_buffer_p)(void); + +extern void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); + +extern void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); #endif /* OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index a9aaa6541d7..7ddefdd1728 100644 --- a/opal/datatype/opal_datatype_pack.c +++ 
b/opal/datatype/opal_datatype_pack.c @@ -412,9 +412,24 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - if (opal_generic_simple_pack_function_cuda_iov_p != NULL) { - return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); - + dt_stack_t* pStack; + uint32_t pos_desc; + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + + description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + pElem = &(description[pos_desc]); + + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + if (opal_generic_simple_pack_function_cuda_vector_p != NULL) { + return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + } + } else { + if (opal_generic_simple_pack_function_cuda_iov_p != NULL) { + return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); + } } return 0; } diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index cad655000d6..ff8dae77971 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -599,9 +599,24 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - if (opal_generic_simple_unpack_function_cuda_iov_p != NULL) { - return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); - + dt_stack_t* pStack; + uint32_t pos_desc; + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + + description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + pElem = &(description[pos_desc]); + + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + if (opal_generic_simple_unpack_function_cuda_vector_p != NULL) { + return 
(*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + } + } else { + if (opal_generic_simple_unpack_function_cuda_iov_p != NULL) { + return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); + } } return 0; } diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 691af933d14..431610ff17f 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -188,6 +188,8 @@ typedef uint8_t mca_btl_base_tag_t; #define MCA_BTL_TAG_IB (MCA_BTL_TAG_BTL + 0) #define MCA_BTL_TAG_UDAPL (MCA_BTL_TAG_BTL + 1) #define MCA_BTL_TAG_SMCUDA (MCA_BTL_TAG_BTL + 2) +#define MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK (MCA_BTL_TAG_BTL + 3) +#define MCA_BTL_TAG_SMCUDA_DATATYPE_PACK (MCA_BTL_TAG_BTL + 4) /* prefered protocol */ #define MCA_BTL_FLAGS_SEND 0x0001 diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 2e42d4babc8..3a711e40cdf 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1131,21 +1131,15 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; - printf("RGET NOT IMPLEMENT YET!!!!!!!!!!!!!!\n"); - struct iovec iov; - uint32_t iov_count = 1; - iov.iov_base = remote_memory_address; - iov.iov_len = size; - int rc; - size_t max_data = size; + printf("RECEIVE REGT!!!!!!!!!!!\n"); + struct opal_convertor_t *convertor = &(recvreq->req_recv.req_base.req_convertor); - // uint64_t *event = &convertor->pipeline_event[0]; - // mca_common_cuda_openeventhandle(&event, 0, (mca_mpool_common_cuda_reg_data_t*)remote_handle); - // if (mca_common_cuda_query_event(event) == OPAL_SUCCESS){ - // printf("get event\n"); - rc = opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); - done = 1; - // } + size_t pipeline_size = 
remote_handle->reg_data.pipeline_size; + uint32_t lindex = remote_handle->reg_data.lindex; + printf("i receive pipeline %ld, lindex %d\n", pipeline_size, lindex); + convertor->gpu_buffer_ptr = remote_memory_address; + mca_btl_smcuda_cuda_dt_clone(convertor, ep, local_address, local_handle, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex); + done = 0; } else { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, @@ -1251,6 +1245,90 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b } +int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq) +{ + mca_btl_smcuda_frag_t* frag; + int rc; + cuda_dt_hdr_t cuda_dt_hdr; + + /* allocate a fragment, giving up if we can't get one */ + MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); + if( OPAL_UNLIKELY(NULL == frag) ) { + return OPAL_ERR_OUT_OF_RESOURCE;; + } + + /* Fill in fragment fields. */ + frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + cuda_dt_hdr.seq = seq; + cuda_dt_hdr.lindex = lindex; + memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); + + rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); + return rc; +} + +int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq) +{ + mca_btl_smcuda_frag_t* frag; + int rc; + cuda_dt_hdr_t cuda_dt_hdr; + + /* allocate a fragment, giving up if we can't get one */ + MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); + if( OPAL_UNLIKELY(NULL == frag) ) { + return OPAL_ERR_OUT_OF_RESOURCE;; + } + + /* Fill in fragment fields. 
*/ + frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + cuda_dt_hdr.seq = seq; + cuda_dt_hdr.lindex = lindex; + memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); + + rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_PACK); + return rc; +} + +int mca_btl_smcuda_alloc_cuda_dt_clone(void) +{ + int i; + for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { + if (smcuda_dt_clone[i].lindex == -1) { + return i; + } + } + return -1; +} + +void mca_btl_smcuda_free_cuda_dt_clone(int lindex) +{ + assert(smcuda_dt_clone[lindex].lindex == lindex); + smcuda_dt_clone[lindex].lindex = -1; +} + +void mca_btl_smcuda_cuda_dt_clone(struct opal_convertor_t *convertor, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_completion_fn_t cbfunc, + void *cbcontext, + void *cbdata, + size_t pipeline_size, + int lindex) +{ + smcuda_dt_clone[lindex].convertor = convertor; + smcuda_dt_clone[lindex].endpoint = endpoint; + smcuda_dt_clone[lindex].local_address = local_address; + smcuda_dt_clone[lindex].local_handle = local_handle; + smcuda_dt_clone[lindex].cbfunc = cbfunc; + smcuda_dt_clone[lindex].cbcontext = cbcontext; + smcuda_dt_clone[lindex].cbdata = cbdata; + smcuda_dt_clone[lindex].pipeline_size = pipeline_size; + smcuda_dt_clone[lindex].lindex = lindex; +} + #endif /* OPAL_CUDA_SUPPORT */ /** diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 7c9d30faded..3e9f2a46db2 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -508,6 +508,42 @@ enum ipcState { IPC_BAD }; +/* cuda datatype control message */ +typedef struct { + int seq; + int lindex; +} cuda_dt_hdr_t; + +/* package save pack/unpack convertor and cbfunc */ +typedef struct { + struct opal_convertor_t *convertor; + struct mca_btl_base_endpoint_t *endpoint; + void *local_address; + struct 
mca_btl_base_registration_handle_t *local_handle; + mca_btl_base_completion_fn_t cbfunc; + void *cbcontext; + void *cbdata; + size_t pipeline_size; + int lindex; +} cuda_dt_clone_t; + +#define SMCUDA_DT_CLONE_SIZE 20 +extern cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; + +int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); +int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); +int mca_btl_smcuda_alloc_cuda_dt_clone(void); +void mca_btl_smcuda_free_cuda_dt_clone(int lindex); +void mca_btl_smcuda_cuda_dt_clone(struct opal_convertor_t *convertor, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_completion_fn_t cbfunc, + void *cbcontext, + void *cbdata, + size_t pipeline_size, + int lindex); + #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index dcbf0ec5180..727308c1df9 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -54,6 +54,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" +#include "opal/datatype/opal_datatype_gpu.h" #endif /* OPAL_CUDA_SUPPORT */ #if OPAL_ENABLE_FT_CR == 1 #include "opal/runtime/opal_cr.h" @@ -846,6 +847,62 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl, } } +cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; + +static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, void* cbdata) +{ + cuda_dt_hdr_t cuda_dt_hdr; + mca_btl_base_segment_t* segments = des->des_segments; + memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); + int seq = cuda_dt_hdr.seq; + int lindex = cuda_dt_hdr.lindex; + cuda_dt_clone_t 
*my_cuda_dt_clone = &smcuda_dt_clone[lindex]; + + assert(my_cuda_dt_clone->lindex == lindex); + + printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); + + if (seq == -2) { + mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t)my_cuda_dt_clone->cbfunc; + cbfunc(btl, my_cuda_dt_clone->endpoint, my_cuda_dt_clone->local_address, my_cuda_dt_clone->local_handle, my_cuda_dt_clone->cbcontext, my_cuda_dt_clone->cbdata, OPAL_SUCCESS); + mca_btl_smcuda_free_cuda_dt_clone(lindex); + } else if (seq == -1) { + mca_btl_smcuda_send_cuda_pack_sig(btl, my_cuda_dt_clone->endpoint, lindex, -1); + } else { + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data; + struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; + iov.iov_base = convertor->gpu_buffer_ptr + seq * my_cuda_dt_clone->pipeline_size; + max_data = my_cuda_dt_clone->pipeline_size; + iov.iov_len = my_cuda_dt_clone->pipeline_size; + opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + } + +} + +static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, void* cbdata) +{ + cuda_dt_hdr_t cuda_dt_hdr; + mca_btl_base_segment_t* segments = des->des_segments; + memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); + int seq = cuda_dt_hdr.seq; + int lindex = cuda_dt_hdr.lindex; + cuda_dt_clone_t *my_cuda_dt_clone = &smcuda_dt_clone[lindex]; + + printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); + + if (seq == -1) { + mca_btl_smcuda_send_cuda_unpack_sig(btl, my_cuda_dt_clone->endpoint, lindex, -2); + opal_cuda_free_gpu_buffer_p(my_cuda_dt_clone->convertor->gpu_buffer_ptr, 0); + mca_btl_smcuda_free_cuda_dt_clone(lindex); + } +} + #endif /* OPAL_CUDA_SUPPORT */ /* @@ -960,6 +1017,14 @@ mca_btl_smcuda_component_init(int *num_btls, /* Register a 
smcuda control function to help setup IPC support */ mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbfunc = btl_smcuda_control; mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbdata = NULL; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK].cbfunc = btl_smcuda_datatype_unpack; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK].cbdata = NULL; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbfunc = btl_smcuda_datatype_pack; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbdata = NULL; + + for (int i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { + smcuda_dt_clone[i].lindex = -1; + } #endif /* OPAL_CUDA_SUPPORT */ return btls; diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 87b4c8cce02..990dc3fc119 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -1641,10 +1641,11 @@ int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag) { int mca_common_cuda_geteventhandle(uint64_t **event, int n, mca_mpool_base_registration_t *newreg) { - // CUipcEventHandle evtHandle; - // mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)newreg; - // mca_common_cuda_construct_event_and_handle(event, (void**)&evtHandle); - // memcpy(&cuda_reg->data.pipeline_evtHandle[n], &evtHandle, sizeof(evtHandle)); + CUipcEventHandle evtHandle; + mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)newreg; + // mca_common_cuda_construct_event_and_handle(event, (void**)&evtHandle); +// printf("0 %p, 1 %p\n",&cuda_reg->data.pipeline_evtHandle[0], &cuda_reg->data.pipeline_evtHandle[EVTHANDLE_SIZE]); + // memcpy(&cuda_reg->data.pipeline_evtHandle[n*EVTHANDLE_SIZE], &evtHandle, sizeof(evtHandle)); return OPAL_SUCCESS; } @@ -1692,7 +1693,7 @@ int mca_common_cuda_openeventhandle(uint64_t **event, int n, mca_mpool_common_cu // CUipcEventHandle evtHandle; // CUresult result; 
// mca_mpool_common_cuda_reg_data_t *cuda_handle = (mca_mpool_common_cuda_reg_data_t*)handle; - // memcpy(&evtHandle, cuda_handle->pipeline_evtHandle[n], sizeof(evtHandle)); + // memcpy(&evtHandle, &cuda_handle->pipeline_evtHandle[n*EVTHANDLE_SIZE], sizeof(evtHandle)); // result = cuFunc.cuIpcOpenEventHandle((CUevent *)event, evtHandle); // if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { // opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenEventHandle failed", diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index da6b86d2464..0b5a724d9dc 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -28,7 +28,9 @@ #define MEMHANDLE_SIZE 8 #define EVTHANDLE_SIZE 8 -typedef uint64_t cuIPCHandle[EVTHANDLE_SIZE]; +typedef struct { + uint64_t evtHandle[EVTHANDLE_SIZE]; +}cuIPCHandle_t; struct mca_mpool_common_cuda_reg_data_t { uint64_t memHandle[MEMHANDLE_SIZE]; @@ -36,8 +38,9 @@ struct mca_mpool_common_cuda_reg_data_t { uint64_t event; opal_ptr_t memh_seg_addr; size_t memh_seg_len; -// cuIPCHandle pipeline_evtHandle[MAX_IPC_EVENT_HANDLE]; - uint32_t pipeline_size; + // uint64_t pipeline_evtHandle[MAX_IPC_EVENT_HANDLE*EVTHANDLE_SIZE]; + size_t pipeline_size; + uint32_t lindex; }; typedef struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; diff --git a/opal/mca/mpool/gpusm/mpool_gpusm_module.c b/opal/mca/mpool/gpusm/mpool_gpusm_module.c index 98740bbdcde..50dcbc859fb 100644 --- a/opal/mca/mpool/gpusm/mpool_gpusm_module.c +++ b/opal/mca/mpool/gpusm/mpool_gpusm_module.c @@ -49,7 +49,7 @@ static void mca_mpool_gpusm_registration_constructor( mca_mpool_gpusm_registration_t *item ) { mca_common_cuda_construct_event_and_handle(&item->event, - (void *)&item->evtHandle); + (void *)item->evtHandle); } /** diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index 6a41001a770..3e6a2a531ff 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -305,11 +305,17 @@ 
local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count #endif send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); goto clean_and_return; } recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); goto clean_and_return; @@ -450,11 +456,17 @@ local_copy_with_convertor_2datatypes_struct( ompi_datatype_t* send_type, int sen #endif send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); goto clean_and_return; } recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { printf( "Unable to create the recv convertor. 
Is the datatype committed ?\n" ); goto clean_and_return; @@ -816,9 +828,9 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); pdt = upper_matrix(4000); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 1; i <= 3; i++) { + for (i = 1; i <= 1; i++) { // local_copy_ddt_count(pdt, 1); - local_copy_with_convertor(pdt, 1, 1024*1024*100, 4000); + local_copy_with_convertor(pdt, 1, 1024*1024*10, 4000); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -959,7 +971,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 , 4000, 384, 512); + local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 , 4000, 384, 512); } } printf( ">>--------------------------------------------<<\n" ); @@ -969,7 +981,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, 4000, 256, 384 ); // ompi_datatype_dump( pdt ); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 0; i < 10; i++) { + for (i = 0; i < 1; i++) { // local_copy_ddt_count(pdt, 1); // local_copy_with_convertor( pdt, 1, 12 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); @@ -978,7 +990,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*10 ); + local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*10, 4000, 256, 384 ); } } printf( ">>--------------------------------------------<<\n" ); From ae49135dd4ceada61a61b62cf3355e73194aca4c Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 1 May 2015 19:41:58 -0400 Subject: [PATCH 099/190] fix gpu memory and vector datatype --- 
opal/datatype/cuda/opal_datatype_cuda.cu | 6 +++ .../cuda/opal_datatype_pack_cuda_wrapper.cu | 49 ++++++++++++------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 32 +++++++++--- test/datatype/ddt_test.c | 6 +-- 4 files changed, 66 insertions(+), 27 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 387f75583ce..3ec7b9e53ce 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -129,6 +129,8 @@ static inline void cuda_list_push_head(ddt_cuda_list_t *list, ddt_cuda_buffer_t item->next = orig_head; if (orig_head == NULL) { list->tail = item; + } else { + orig_head->prev = item; } list->nb_elements ++; } @@ -141,6 +143,8 @@ static inline void cuda_list_push_tail(ddt_cuda_list_t *list, ddt_cuda_buffer_t item->prev = orig_tail; if (orig_tail == NULL) { list->head = item; + } else { + orig_tail->next = item; } list->nb_elements ++; } @@ -219,10 +223,12 @@ void opal_datatype_cuda_init(void) p->gpu_addr = gpu_ptr; cuda_device[i].buffer_free.head = p; cuda_device[i].buffer_free.tail = cuda_device[i].buffer_free.head; + cuda_device[i].buffer_free.nb_elements = 1; cuda_device[i].buffer_used.head = NULL; cuda_device[i].buffer_used.tail = NULL; cuda_device[i].buffer_used_size = 0; + cuda_device[i].buffer_used.nb_elements = 0; } cudaMalloc((void **)&cuda_desc_d, sizeof(ddt_cuda_desc_t)); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index a5963b74d3f..636e413bc21 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -184,6 +184,8 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert size_t iov_len_local; uint32_t iov_count; uint8_t transfer_required; + uint8_t free_required; + uint32_t count_desc_tmp; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, 
end_total; @@ -216,32 +218,42 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { - if ((iov[0].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { - // if (iov[0].iov_len == 0) { - // buffer_size = DT_CUDA_BUFFER_SIZE; - // } else { - // buffer_size = iov[0].iov_len; - // } - pConvertor->gpu_buffer_ptr = ddt_cuda_pack_buffer; + if ((iov[iov_count].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + if (iov[iov_count].iov_len == 0) { + iov_len_local = DT_CUDA_BUFFER_SIZE; + } else { + iov_len_local = iov[iov_count].iov_len; + } if (iov[iov_count].iov_base == NULL) { - iov[iov_count].iov_base = ddt_cuda_pack_buffer; - iov_ptr = ddt_cuda_pack_buffer; + iov[iov_count].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + iov_ptr = (unsigned char *)iov[iov_count].iov_base; + free_required = 1; } else { iov_ptr = (unsigned char *)iov[iov_count].iov_base; + free_required = 0; } transfer_required = 0; + pConvertor->gpu_buffer_ptr = iov_ptr; } else { - pConvertor->gpu_buffer_ptr = NULL; + iov_len_local = iov[iov_count].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + } transfer_required = 1; + free_required = 1; + iov_ptr = pConvertor->gpu_buffer_ptr; } - iov_ptr = ddt_cuda_pack_buffer; - iov_len_local = iov[iov_count].iov_len; printf("original local %d\n", iov_len_local); while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ /* should not go into here */ + pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ @@ -279,7 +291,6 @@ int32_t 
opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); - count_desc = 0; if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -291,7 +302,8 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert pStack->disp + local_disp); pos_desc++; update_loop_description: /* update the current state */ - conv_ptr = pConvertor->pBaseBuf + pStack->disp; + // conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc_tmp = count_desc; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } @@ -304,7 +316,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert GET_TIME(start); #endif if (transfer_required) { - cudaMemcpy(iov[iov_count].iov_base, ddt_cuda_pack_buffer, total_packed, cudaMemcpyDeviceToHost); + cudaMemcpy(iov[iov_count].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -314,11 +326,14 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } *max_data = total_packed; pConvertor->bConverted += total_packed; /* update the already converted bytes */ - pConvertor->bConverted = pConvertor->local_size; *out_size = iov_count; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; printf("total packed %lu\n", pConvertor->bConverted); + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } return 1; } /* Save the global position for the next round */ @@ -598,7 +613,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor 
GET_TIME(start); #endif if (transfer_required) { - cudaMemcpy(iov[0].iov_base, destination_tmp, total_packed, cudaMemcpyDeviceToHost); + cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index e1f96ea6a2f..fd4fec00a73 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -121,6 +121,8 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv unsigned char *conv_ptr, *iov_ptr; size_t iov_len_local; uint32_t iov_count; + uint8_t free_required; + uint32_t count_desc_tmp; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -151,14 +153,19 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { - iov_ptr = ddt_cuda_unpack_buffer; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { iov_ptr = (unsigned char*)iov[iov_count].iov_base; - } else { + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); + } + iov_ptr = pConvertor->gpu_buffer_ptr; cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); + free_required = 1; } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -173,6 +180,11 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ /* should not go to here */ + 
pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ @@ -209,7 +221,6 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); - count_desc = 0; if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -221,7 +232,8 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv pStack->disp + local_disp); pos_desc++; update_loop_description: /* update the current state */ - conv_ptr = pConvertor->pBaseBuf + pStack->disp; + // conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc_tmp = count_desc; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } @@ -234,10 +246,13 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; - pConvertor->bConverted = pConvertor->local_size; if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; printf("total unpacked %lu\n", pConvertor->bConverted); + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } return 1; } /* Save the global position for the next round */ @@ -506,10 +521,13 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; unpack_contiguous_loop_cuda_kernel_global<<<192, 
4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); - - *(DESTINATION) = _destination - _end_loop->first_elem_disp; + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(SOURCE) = *(SOURCE) + _copy_loops * _end_loop->size; *(SPACE) -= _copy_loops * _end_loop->size; *(COUNT) -= _copy_loops; +#endif cudaDeviceSynchronize(); } diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index 3e6a2a531ff..98aa6f1347a 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -830,7 +830,7 @@ int main( int argc, char* argv[] ) if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { // local_copy_ddt_count(pdt, 1); - local_copy_with_convertor(pdt, 1, 1024*1024*10, 4000); + // local_copy_with_convertor(pdt, 1, 1024*1024*10, 4000); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -971,7 +971,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 , 4000, 384, 512); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 , 4000, 384, 512); } } printf( ">>--------------------------------------------<<\n" ); @@ -990,7 +990,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*10, 4000, 256, 384 ); + local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*2000, 4000, 256, 384 ); } } printf( ">>--------------------------------------------<<\n" ); From ef54b4d93df508498e4875bdd668ad66b96c37d1 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 7 May 2015 00:43:19 -0400 Subject: [PATCH 
100/190] unrestricted GPU. Instead of forcing everything to go on device 0, we now use the devices already opened. --- opal/datatype/cuda/opal_datatype_cuda.cu | 30 ++++++++----------- .../cuda/opal_datatype_cuda_internal.cuh | 1 - .../cuda/opal_datatype_pack_cuda_wrapper.cu | 2 +- opal/datatype/opal_datatype_cuda.c | 9 ++---- 4 files changed, 16 insertions(+), 26 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 3ec7b9e53ce..8451b143487 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -201,10 +201,15 @@ static inline void cuda_list_item_merge_by_addr(ddt_cuda_list_t *list) void opal_datatype_cuda_init(void) { uint32_t i; - - int device = OPAL_GPU_INDEX; - cudaSetDevice(device); - + int device; + cudaError res; + + res = cudaGetDevice(&device); + if( cudaSuccess != res ) { + opal_cuda_output(0, "Cannot retrieve the device being used. Drop CUDA support!\n"); + return; + } + cuda_free_list = init_cuda_free_list(); /* init device */ @@ -245,10 +250,8 @@ void opal_datatype_cuda_init(void) cudaMalloc((void **)(&ddt_cuda_pack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); printf("malloc cuda packing buffer, %p\n", ddt_cuda_pack_buffer); - cudaMemset(ddt_cuda_pack_buffer, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); cudaMalloc((void **)(&ddt_cuda_unpack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); printf("malloc cuda unpacking buffer, %p\n", ddt_cuda_unpack_buffer); - cudaMemset(ddt_cuda_unpack_buffer, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); cuda_desc_h->iov[0].iov_base = ddt_cuda_pack_buffer; cuda_desc_h->iov[0].iov_len = DT_CUDA_BUFFER_SIZE; @@ -285,8 +288,6 @@ void opal_datatype_cuda_init(void) // ALIGNMENT_DOUBLE = sizeof(double); // ALIGNMENT_FLOAT = sizeof(float); // ALIGNMENT_CHAR = sizeof(char); - - } void opal_datatype_cuda_fini(void) @@ -344,18 +345,11 @@ int32_t opal_cuda_is_gpu_buffer(const void *ptr) if (res != CUDA_SUCCESS) { /* If we cannot determine 
it is device pointer, * just assume it is not. */ - printf("!!!!!!!is gpu buffer error\n"); - return 0; - } - if (memType == CU_MEMORYTYPE_DEVICE) { - return 1; - } else if (memType == CU_MEMORYTYPE_HOST){ - return 0; - } else if (memType == 0) { - return 0; - } else { + printf("!!!!!!! %p is not a gpu buffer. Take no-CUDA path!\n", ptr); return 0; } + /* Anything but CU_MEMORYTYPE_DEVICE is not a GPU memory */ + return (memType == CU_MEMORYTYPE_DEVICE) ? 1 : 0; } unsigned char* opal_cuda_get_gpu_pack_buffer() diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 567e81218ec..e9359209c01 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -23,7 +23,6 @@ #define THREAD_PER_BLOCK 32 #define CUDA_WARP_SIZE 32 #define TASK_PER_THREAD 2 -#define OPAL_GPU_INDEX 0 #define NB_STREAMS 4 #define CUDA_NB_IOV 4096 #define CUDA_IOV_LEN 1024*1204 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 636e413bc21..b55c59a5c1e 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -462,7 +462,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor DT_CUDA_DEBUG ( opal_cuda_output(0, "Pack GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - cuda_iov_count = 1000; + cuda_iov_count = CUDA_NB_IOV; total_packed = 0; total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index caaab68208d..e09618e747b 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -80,9 +80,8 @@ bool opal_cuda_check_bufs(char *dest, char *src) if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) { return 
true; - } else { - return false; } + return false; } /* @@ -109,9 +108,8 @@ void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_ opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, (int)size); abort(); - } else { - return dest; } + return dest; } /* @@ -127,9 +125,8 @@ void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size) opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, (int)size); abort(); - } else { - return dest; } + return dest; } /* From 653f54de6d25c4aba4cad5e8a852fba3254ac785 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 18 Jun 2015 11:17:30 -0400 Subject: [PATCH 101/190] Using globally defined indexes lead to several synchronization issues, when 2 peers were doing a send/recv or when multiple senders were targetting the same receiver. Rolf provided a patch to solve this issue, by moving the IPC communication index from a global location onto each endpoint. 
--- ompi/mca/pml/ob1/pml_ob1_cuda.c | 4 +- opal/mca/btl/smcuda/btl_smcuda.c | 88 ++++++++++++++++------ opal/mca/btl/smcuda/btl_smcuda.h | 33 +++++--- opal/mca/btl/smcuda/btl_smcuda_component.c | 32 ++++---- opal/mca/btl/smcuda/btl_smcuda_endpoint.h | 2 + 5 files changed, 109 insertions(+), 50 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 4361d2f5918..f9f3a2cbe02 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -130,10 +130,10 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, /* because pack may not use the whole pipeline size */ rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); pipeline_size = max_data; - int lindex = mca_btl_smcuda_alloc_cuda_dt_clone(); + int lindex = mca_btl_smcuda_alloc_cuda_dt_pack_clone(bml_btl->btl_endpoint); assert(lindex >= 0); mca_pml_ob1_rdma_cuda_btl_register_events(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, pipeline_size, lindex); - mca_btl_smcuda_cuda_dt_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, pipeline_size, lindex); + mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, pipeline_size, lindex); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 3a711e40cdf..4814b6c996a 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -491,6 +491,10 @@ create_sm_endpoint(int local_proc, struct opal_proc_t *proc) ep->mpool = mca_mpool_base_module_create("rgpusm", NULL, &resources); + for (int i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { + ep->smcuda_dt_pack_clone[i].lindex = -1; + ep->smcuda_dt_unpack_clone[i].lindex = -1; + } } #endif /* OPAL_CUDA_SUPPORT */ return ep; @@ -1138,7 +1142,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, uint32_t lindex 
= remote_handle->reg_data.lindex; printf("i receive pipeline %ld, lindex %d\n", pipeline_size, lindex); convertor->gpu_buffer_ptr = remote_memory_address; - mca_btl_smcuda_cuda_dt_clone(convertor, ep, local_address, local_handle, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex); + mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex); done = 0; } else { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; @@ -1291,42 +1295,78 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, return rc; } -int mca_btl_smcuda_alloc_cuda_dt_clone(void) +int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint) +{ + int i; + for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { + if (endpoint->smcuda_dt_pack_clone[i].lindex == -1) { + return i; + } + } + return -1; +} +int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint) { int i; for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - if (smcuda_dt_clone[i].lindex == -1) { + if (endpoint->smcuda_dt_unpack_clone[i].lindex == -1) { return i; } } return -1; } -void mca_btl_smcuda_free_cuda_dt_clone(int lindex) +void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) +{ + assert(endpoint->smcuda_dt_pack_clone[lindex].lindex == lindex); + endpoint->smcuda_dt_pack_clone[lindex].lindex = -1; +} +void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) +{ + assert(endpoint->smcuda_dt_unpack_clone[lindex].lindex == lindex); + endpoint->smcuda_dt_unpack_clone[lindex].lindex = -1; +} + +void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_completion_fn_t cbfunc, + void *cbcontext, + 
void *cbdata, + size_t pipeline_size, + int lindex) { - assert(smcuda_dt_clone[lindex].lindex == lindex); - smcuda_dt_clone[lindex].lindex = -1; + endpoint->smcuda_dt_pack_clone[lindex].convertor = convertor; + endpoint->smcuda_dt_pack_clone[lindex].endpoint = endpoint; + endpoint->smcuda_dt_pack_clone[lindex].local_address = local_address; + endpoint->smcuda_dt_pack_clone[lindex].local_handle = local_handle; + endpoint->smcuda_dt_pack_clone[lindex].cbfunc = cbfunc; + endpoint->smcuda_dt_pack_clone[lindex].cbcontext = cbcontext; + endpoint->smcuda_dt_pack_clone[lindex].cbdata = cbdata; + endpoint->smcuda_dt_pack_clone[lindex].pipeline_size = pipeline_size; + endpoint->smcuda_dt_pack_clone[lindex].lindex = lindex; } -void mca_btl_smcuda_cuda_dt_clone(struct opal_convertor_t *convertor, - struct mca_btl_base_endpoint_t *endpoint, - void *local_address, - struct mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_completion_fn_t cbfunc, - void *cbcontext, - void *cbdata, - size_t pipeline_size, - int lindex) +void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_completion_fn_t cbfunc, + void *cbcontext, + void *cbdata, + size_t pipeline_size, + int lindex) { - smcuda_dt_clone[lindex].convertor = convertor; - smcuda_dt_clone[lindex].endpoint = endpoint; - smcuda_dt_clone[lindex].local_address = local_address; - smcuda_dt_clone[lindex].local_handle = local_handle; - smcuda_dt_clone[lindex].cbfunc = cbfunc; - smcuda_dt_clone[lindex].cbcontext = cbcontext; - smcuda_dt_clone[lindex].cbdata = cbdata; - smcuda_dt_clone[lindex].pipeline_size = pipeline_size; - smcuda_dt_clone[lindex].lindex = lindex; + endpoint->smcuda_dt_unpack_clone[lindex].convertor = convertor; + endpoint->smcuda_dt_unpack_clone[lindex].endpoint = endpoint; + endpoint->smcuda_dt_unpack_clone[lindex].local_address = local_address; 
+ endpoint->smcuda_dt_unpack_clone[lindex].local_handle = local_handle; + endpoint->smcuda_dt_unpack_clone[lindex].cbfunc = cbfunc; + endpoint->smcuda_dt_unpack_clone[lindex].cbcontext = cbcontext; + endpoint->smcuda_dt_unpack_clone[lindex].cbdata = cbdata; + endpoint->smcuda_dt_unpack_clone[lindex].pipeline_size = pipeline_size; + endpoint->smcuda_dt_unpack_clone[lindex].lindex = lindex; } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 3e9f2a46db2..00765f0a276 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -532,17 +532,28 @@ extern cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); -int mca_btl_smcuda_alloc_cuda_dt_clone(void); -void mca_btl_smcuda_free_cuda_dt_clone(int lindex); -void mca_btl_smcuda_cuda_dt_clone(struct opal_convertor_t *convertor, - struct mca_btl_base_endpoint_t *endpoint, - void *local_address, - struct mca_btl_base_registration_handle_t *local_handle, - mca_btl_base_completion_fn_t cbfunc, - void *cbcontext, - void *cbdata, - size_t pipeline_size, - int lindex); +int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint); +int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint); +void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); +void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); +void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + 
mca_btl_base_completion_fn_t cbfunc, + void *cbcontext, + void *cbdata, + size_t pipeline_size, + int lindex); +void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, + struct mca_btl_base_endpoint_t *endpoint, + void *local_address, + struct mca_btl_base_registration_handle_t *local_handle, + mca_btl_base_completion_fn_t cbfunc, + void *cbcontext, + void *cbdata, + size_t pipeline_size, + int lindex); #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 727308c1df9..f035578bd5d 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -847,29 +847,32 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl, } } -cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; - static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata) { + struct mca_btl_base_endpoint_t *endpoint; cuda_dt_hdr_t cuda_dt_hdr; mca_btl_base_segment_t* segments = des->des_segments; memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); int seq = cuda_dt_hdr.seq; int lindex = cuda_dt_hdr.lindex; - cuda_dt_clone_t *my_cuda_dt_clone = &smcuda_dt_clone[lindex]; - + mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; + cuda_dt_clone_t *my_cuda_dt_clone; + + /* We can find the endpoint back from the rank embedded in the header */ + endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; + my_cuda_dt_clone = &endpoint->smcuda_dt_unpack_clone[lindex]; assert(my_cuda_dt_clone->lindex == lindex); printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); if (seq == -2) { mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t)my_cuda_dt_clone->cbfunc; - cbfunc(btl, my_cuda_dt_clone->endpoint, my_cuda_dt_clone->local_address, 
my_cuda_dt_clone->local_handle, my_cuda_dt_clone->cbcontext, my_cuda_dt_clone->cbdata, OPAL_SUCCESS); - mca_btl_smcuda_free_cuda_dt_clone(lindex); + cbfunc(btl, endpoint, my_cuda_dt_clone->local_address, my_cuda_dt_clone->local_handle, my_cuda_dt_clone->cbcontext, my_cuda_dt_clone->cbdata, OPAL_SUCCESS); + mca_btl_smcuda_free_cuda_dt_unpack_clone(endpoint, lindex); } else if (seq == -1) { - mca_btl_smcuda_send_cuda_pack_sig(btl, my_cuda_dt_clone->endpoint, lindex, -1); + mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, lindex, -1); } else { struct iovec iov; uint32_t iov_count = 1; @@ -887,19 +890,25 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata) { + struct mca_btl_base_endpoint_t *endpoint; cuda_dt_hdr_t cuda_dt_hdr; mca_btl_base_segment_t* segments = des->des_segments; memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); int seq = cuda_dt_hdr.seq; int lindex = cuda_dt_hdr.lindex; - cuda_dt_clone_t *my_cuda_dt_clone = &smcuda_dt_clone[lindex]; + mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; + cuda_dt_clone_t *my_cuda_dt_clone; + + /* We can find the endpoint back from the rank embedded in the header */ + endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; + my_cuda_dt_clone = &endpoint->smcuda_dt_pack_clone[lindex]; printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); if (seq == -1) { mca_btl_smcuda_send_cuda_unpack_sig(btl, my_cuda_dt_clone->endpoint, lindex, -2); opal_cuda_free_gpu_buffer_p(my_cuda_dt_clone->convertor->gpu_buffer_ptr, 0); - mca_btl_smcuda_free_cuda_dt_clone(lindex); + mca_btl_smcuda_free_cuda_dt_pack_clone(my_cuda_dt_clone->endpoint, lindex); } } @@ -1021,10 +1030,7 @@ mca_btl_smcuda_component_init(int *num_btls, 
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbfunc = btl_smcuda_datatype_pack; mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbdata = NULL; - - for (int i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - smcuda_dt_clone[i].lindex = -1; - } + #endif /* OPAL_CUDA_SUPPORT */ return btls; diff --git a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h index cead5ec7a5c..e4df5ee56d0 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h +++ b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h @@ -49,6 +49,8 @@ struct mca_btl_base_endpoint_t { opal_proc_t *proc_opal; /**< Needed for adding CUDA IPC support dynamically */ enum ipcState ipcstate; /**< CUDA IPC connection status */ int ipctries; /**< Number of times CUDA IPC connect was sent */ + cuda_dt_clone_t smcuda_dt_pack_clone[SMCUDA_DT_CLONE_SIZE]; + cuda_dt_clone_t smcuda_dt_unpack_clone[SMCUDA_DT_CLONE_SIZE]; #endif /* OPAL_CUDA_SUPPORT */ }; From 10f554328a91ad9130f2557d7bef2fa5a0a4b9f6 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 18 Jun 2015 15:15:47 -0400 Subject: [PATCH 102/190] Generate the Makefile. It will now be placed in the bindir and will be populated with all the known information. Beware: one still has to manually set the CUDA lib and path as they are not available after configure (unlike the include which is). 
Conflicts: opal/datatype/cuda/Makefile --- configure.ac | 4 +++ opal/datatype/cuda/Makefile | 40 ------------------------ opal/datatype/cuda/Makefile.in | 57 ++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 40 deletions(-) delete mode 100644 opal/datatype/cuda/Makefile create mode 100644 opal/datatype/cuda/Makefile.in diff --git a/configure.ac b/configure.ac index 7bb7cbe8eb7..3e87ec6276e 100644 --- a/configure.ac +++ b/configure.ac @@ -1357,6 +1357,10 @@ m4_ifdef([project_oshmem], opal_show_subtitle "Final output" +if test "$OPAL_cuda_support" != "0"; then + AC_CONFIG_FILES([opal/datatype/cuda/Makefile]) +fi + AC_CONFIG_FILES([ Makefile diff --git a/opal/datatype/cuda/Makefile b/opal/datatype/cuda/Makefile deleted file mode 100644 index e76f160fb88..00000000000 --- a/opal/datatype/cuda/Makefile +++ /dev/null @@ -1,40 +0,0 @@ -CC = gcc -NVCC = nvcc -ARCH = ar -ARCHFLAGS = cr -RANLIB = ranlib -STLIB ?= opal_datatype_cuda.a -DYLIB ?= opal_datatype_cuda.so -CFLAGS = -g -G -O0 -EXTLIB = -L/home/wwu12/ompi/ompi-gpu/opal/datatype/.libs -ldatatype -L/usr/lib64 -lcuda -INC = - -SRC := \ - opal_datatype_cuda.cu \ - opal_datatype_pack_cuda_kernel.cu \ - opal_datatype_pack_cuda_wrapper.cu \ - opal_datatype_unpack_cuda_kernel.cu \ - opal_datatype_unpack_cuda_wrapper.cu \ - -OBJ := $(SRC:.cu=.o) - -.PHONY: all clean cleanall - -all: $(STLIB) $(DYLIB) - -$(STLIB): $(OBJ) - $(ARCH) $(ARCHFLAGS) $@ $(OBJ) - $(RANLIB) $@ - -$(DYLIB): $(OBJ) - $(NVCC) $(CFLAGS) $(EXTLIB) -shared --compiler-options '-fPIC' -o $(DYLIB) $(OBJ) - -%.o: %.cu - $(NVCC) $(CFLAGS) $(EXTLIB) -gencode arch=compute_35,code=sm_35 $(INC) -c --compiler-options '-fPIC' $< -o $@ - -clean: - rm -f *.o - -cleanall: clean - rm -f $(STLIB) - rm -f $(DYLIB) diff --git a/opal/datatype/cuda/Makefile.in b/opal/datatype/cuda/Makefile.in new file mode 100644 index 00000000000..519de6100ae --- /dev/null +++ b/opal/datatype/cuda/Makefile.in @@ -0,0 +1,57 @@ +@SET_MAKE@ + +AM_CPPFLAGS = 
@common_cuda_CPPFLAGS@ +srcdir = @srcdir@ +top_builddir = @top_builddir@ +VPATH = @srcdir@ + +NVCC = nvcc +ARCH = ar +ARCHFLAGS = cr +STLIB ?= opal_datatype_cuda.a +DYLIB ?= opal_datatype_cuda.so +EXTLIB = -L$(top_builddir)/opal/datatype/.libs -ldatatype -L$(top_builddir)/opal/.libs -lopen-pal -L/usr/local/cuda/lib -lcuda +subdir = opal/datatype/cuda + +CC = nvcc +CFLAGS = -gencode arch=compute_35,code=sm_35 --compiler-options '-fPIC @CFLAGS@' +LDFLAGS += -shared --compiler-options '-fPIC @LDFLAGS@' + +SRC := \ + opal_datatype_cuda.cu \ + opal_datatype_pack_cuda_kernel.cu \ + opal_datatype_pack_cuda_wrapper.cu \ + opal_datatype_unpack_cuda_kernel.cu \ + opal_datatype_unpack_cuda_wrapper.cu + +OBJ := $(SRC:.cu=.o) + +.PHONY: all clean cleanall + +all: Makefile $(STLIB) $(DYLIB) + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(STLIB): $(OBJ) + $(ARCH) $(ARCHFLAGS) $@ $(OBJ) + @RANLIB@ $@ + +$(DYLIB): $(OBJ) + $(NVCC) $(LDFLAGS) $(EXTLIB) -o $(DYLIB) $(OBJ) + +%.o: %.cu + $(NVCC) $(CFLAGS) $(EXTLIB) $(INC) -c $< -o $@ + +clean: + rm -f *.o + +cleanall: clean + rm -f $(STLIB) + rm -f $(DYLIB) From 96ec3c55d48177eb8d04a920e2d4acfae92da687 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 18 Jun 2015 19:47:14 -0400 Subject: [PATCH 103/190] This file was certainly not supposed to be here. There is NO valid reason to have a copy of a locally generated file in the source. 
--- opal/datatype/cuda/opal_config.h | 2863 ------------------------------ 1 file changed, 2863 deletions(-) delete mode 100644 opal/datatype/cuda/opal_config.h diff --git a/opal/datatype/cuda/opal_config.h b/opal/datatype/cuda/opal_config.h deleted file mode 100644 index d23f071a86a..00000000000 --- a/opal/datatype/cuda/opal_config.h +++ /dev/null @@ -1,2863 +0,0 @@ -/* opal/include/opal_config.h. Generated from opal_config.h.in by configure. */ -/* opal/include/opal_config.h.in. Generated from configure.ac by autoheader. */ - -/* -*- c -*- - * - * Copyright (c) 2004-2005 The Trustees of Indiana University. - * All rights reserved. - * Copyright (c) 2004-2005 The Trustees of the University of Tennessee. - * All rights reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2014 Intel, Inc. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - * - * Function: - OS, CPU and compiler dependent configuration - */ - -#ifndef OPAL_CONFIG_H -#define OPAL_CONFIG_H - -//#include "opal_config_top.h" - - - -/* Define if building universal (internal helper macro) */ -/* #undef AC_APPLE_UNIVERSAL_BUILD */ - -/* enable openib BTL failover */ -#define BTL_OPENIB_FAILOVER_ENABLED 0 - -/* Whether the openib BTL malloc hooks are enabled */ -#define BTL_OPENIB_MALLOC_HOOKS_ENABLED 1 - -/* rdmacm without IB_AF addressing support */ -/* #undef BTL_OPENIB_RDMACM_IB_ADDR */ - -/* BLCR cr_request_file check */ -/* #undef CRS_BLCR_HAVE_CR_REQUEST */ - -/* BLCR cr_request_checkpoint check */ -/* #undef CRS_BLCR_HAVE_CR_REQUEST_CHECKPOINT */ - -/* BLCRs cr_checkpoint_info.requester member availability */ -/* #undef CRS_BLCR_HAVE_INFO_REQUESTER */ - -/* Version of event */ -/* #undef EVENT_EXTERNAL_EVENT_VERSION */ - -/* Define to 1 if you have the header file. */ -#define HAVE_AIO_H 1 - -/* Define to 1 if the linker supports alias attribute. */ -/* #undef HAVE_ALIAS_ATTRIBUTE */ - -/* Define to 1 if you have the header file. */ -#define HAVE_ALLOCA_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_ALPS_APINFO_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_ARPA_INET_H 1 - -/* Define to 1 if you have the `asprintf' function. */ -#define HAVE_ASPRINTF 1 - -/* Set to use c11 atomic functions */ -/* #undef HAVE_ATOMICS */ - -/* Define to 1 if the system has the type `CACHE_DESCRIPTOR'. */ -/* #undef HAVE_CACHE_DESCRIPTOR */ - -/* Define to 1 if the system has the type `CACHE_RELATIONSHIP'. */ -/* #undef HAVE_CACHE_RELATIONSHIP */ - -/* Define to 1 if you have the `clz' function. */ -/* #undef HAVE_CLZ */ - -/* Define to 1 if you have the `clzl' function. */ -/* #undef HAVE_CLZL */ - -/* Define to 1 if you have the header file. */ -#define HAVE_CL_CL_EXT_H 1 - -/* Define to 1 if you have the header file. 
*/ -#define HAVE_COMPLEX_H 1 - -/* Define to 1 if you have the `cpuset_setaffinity' function. */ -/* #undef HAVE_CPUSET_SETAFFINITY */ - -/* Define to 1 if you have the `cpuset_setid' function. */ -/* #undef HAVE_CPUSET_SETID */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_CRIU_CRIU_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_CRT_EXTERNS_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_CTYPE_H 1 - -/* Define to 1 if we have -lcuda */ -/* #undef HAVE_CUDA */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_CUDA_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_CUDA_RUNTIME_API_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_CURL_CURL_H */ - -/* Define to 1 if you have the `dbm_open' function. */ -/* #undef HAVE_DBM_OPEN */ - -/* Define to 1 if you have the `dbopen' function. */ -/* #undef HAVE_DBOPEN */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_DB_H */ - -/* Define to 1 if you have the declaration of `AF_INET6', and to 0 if you - don't. */ -#define HAVE_DECL_AF_INET6 1 - -/* Define to 1 if you have the declaration of `AF_UNSPEC', and to 0 if you - don't. */ -#define HAVE_DECL_AF_UNSPEC 1 - -/* Define to 1 if you have the declaration of `CL_DEVICE_TOPOLOGY_AMD', and to - 0 if you don't. */ -#define HAVE_DECL_CL_DEVICE_TOPOLOGY_AMD 0 - -/* Define to 1 if you have the declaration of `CTL_HW', and to 0 if you don't. - */ -#define HAVE_DECL_CTL_HW 0 - -/* Define to 1 if you have the declaration of `fabsf', and to 0 if you don't. - */ -#define HAVE_DECL_FABSF 1 - -/* Define to 1 if you have the declaration of `HW_NCPU', and to 0 if you - don't. */ -#define HAVE_DECL_HW_NCPU 0 - -/* Define to 1 if you have the declaration of `HZ', and to 0 if you don't. */ -#define HAVE_DECL_HZ 1 - -/* Define to 1 if you have the declaration of `IBV_ACCESS_ALLOCATE_MR', and to - 0 if you don't. 
*/ -/* #undef HAVE_DECL_IBV_ACCESS_ALLOCATE_MR */ - -/* Define to 1 if you have the declaration of - `IBV_ACCESS_SHARED_MR_USER_READ', and to 0 if you don't. */ -/* #undef HAVE_DECL_IBV_ACCESS_SHARED_MR_USER_READ */ - -/* Define to 1 if you have the declaration of `IBV_ACCESS_SO', and to 0 if you - don't. */ -/* #undef HAVE_DECL_IBV_ACCESS_SO */ - -/* Define to 1 if you have the declaration of `IBV_ATOMIC_HCA', and to 0 if - you don't. */ -/* #undef HAVE_DECL_IBV_ATOMIC_HCA */ - -/* Define to 1 if you have the declaration of `IBV_EVENT_CLIENT_REREGISTER', - and to 0 if you don't. */ -/* #undef HAVE_DECL_IBV_EVENT_CLIENT_REREGISTER */ - -/* Define to 1 if you have the declaration of `IBV_EXP_ACCESS_ALLOCATE_MR', - and to 0 if you don't. */ -/* #undef HAVE_DECL_IBV_EXP_ACCESS_ALLOCATE_MR */ - -/* Define to 1 if you have the declaration of - `IBV_EXP_ACCESS_SHARED_MR_USER_READ', and to 0 if you don't. */ -/* #undef HAVE_DECL_IBV_EXP_ACCESS_SHARED_MR_USER_READ */ - -/* Define to 1 if you have the declaration of `IBV_LINK_LAYER_ETHERNET', and - to 0 if you don't. */ -/* #undef HAVE_DECL_IBV_LINK_LAYER_ETHERNET */ - -/* Define to 1 if you have the declaration of `IBV_SRQT_XRC', and to 0 if you - don't. */ -/* #undef HAVE_DECL_IBV_SRQT_XRC */ - -/* Define to 1 if you have the declaration of - `nvmlDeviceGetMaxPcieLinkGeneration', and to 0 if you don't. */ -/* #undef HAVE_DECL_NVMLDEVICEGETMAXPCIELINKGENERATION */ - -/* Define to 1 if you have the declaration of `PCI_LOOKUP_NO_NUMBERS', and to - 0 if you don't. */ -/* #undef HAVE_DECL_PCI_LOOKUP_NO_NUMBERS */ - -/* Define to 1 if you have the declaration of `PF_INET6', and to 0 if you - don't. */ -#define HAVE_DECL_PF_INET6 1 - -/* Define to 1 if you have the declaration of `PF_UNSPEC', and to 0 if you - don't. */ -#define HAVE_DECL_PF_UNSPEC 1 - -/* Define to 1 if you have the declaration of `pthread_getaffinity_np', and to - 0 if you don't. 
*/ -#define HAVE_DECL_PTHREAD_GETAFFINITY_NP 1 - -/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to - 0 if you don't. */ -#define HAVE_DECL_PTHREAD_SETAFFINITY_NP 1 - -/* Define to 1 if you have the declaration of `RLIMIT_AS', and to 0 if you - don't. */ -#define HAVE_DECL_RLIMIT_AS 1 - -/* Define to 1 if you have the declaration of `RLIMIT_CORE', and to 0 if you - don't. */ -#define HAVE_DECL_RLIMIT_CORE 1 - -/* Define to 1 if you have the declaration of `RLIMIT_FSIZE', and to 0 if you - don't. */ -#define HAVE_DECL_RLIMIT_FSIZE 1 - -/* Define to 1 if you have the declaration of `RLIMIT_MEMLOCK', and to 0 if - you don't. */ -#define HAVE_DECL_RLIMIT_MEMLOCK 1 - -/* Define to 1 if you have the declaration of `RLIMIT_NOFILE', and to 0 if you - don't. */ -#define HAVE_DECL_RLIMIT_NOFILE 1 - -/* Define to 1 if you have the declaration of `RLIMIT_NPROC', and to 0 if you - don't. */ -#define HAVE_DECL_RLIMIT_NPROC 1 - -/* Define to 1 if you have the declaration of `RLIMIT_STACK', and to 0 if you - don't. */ -#define HAVE_DECL_RLIMIT_STACK 1 - -/* Define to 1 if you have the declaration of `sbrk', and to 0 if you don't. - */ -#define HAVE_DECL_SBRK 1 - -/* Define to 1 if you have the declaration of `strtoull', and to 0 if you - don't. */ -#define HAVE_DECL_STRTOULL 1 - -/* Define to 1 if you have the declaration of `_SC_LARGE_PAGESIZE', and to 0 - if you don't. */ -#define HAVE_DECL__SC_LARGE_PAGESIZE 0 - -/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_CONF', and to 0 - if you don't. */ -#define HAVE_DECL__SC_NPROCESSORS_CONF 1 - -/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_ONLN', and to 0 - if you don't. */ -#define HAVE_DECL__SC_NPROCESSORS_ONLN 1 - -/* Define to 1 if you have the declaration of `_SC_NPROC_CONF', and to 0 if - you don't. */ -#define HAVE_DECL__SC_NPROC_CONF 0 - -/* Define to 1 if you have the declaration of `_SC_NPROC_ONLN', and to 0 if - you don't. 
*/ -#define HAVE_DECL__SC_NPROC_ONLN 0 - -/* Define to 1 if you have the declaration of `_SC_PAGESIZE', and to 0 if you - don't. */ -#define HAVE_DECL__SC_PAGESIZE 1 - -/* Define to 1 if you have the declaration of `_SC_PAGE_SIZE', and to 0 if you - don't. */ -#define HAVE_DECL__SC_PAGE_SIZE 1 - -/* Define to 1 if you have the declaration of `__func__', and to 0 if you - don't. */ -#define HAVE_DECL___FUNC__ 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_DIRENT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_DLFCN_H 1 - -/* Define to 1 if you have the `dlsym' function. */ -#define HAVE_DLSYM 1 - -/* Define to 1 if the system has the type `double _Complex'. */ -#define HAVE_DOUBLE__COMPLEX 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_ERR_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_EVENT_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_EXECINFO_H 1 - -/* Define to 1 if you have the `execve' function. */ -#define HAVE_EXECVE 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_FCA_API_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_FCNTL_H 1 - -/* Define to 1 if you have the `ffs' function. */ -#define HAVE_FFS 1 - -/* Define to 1 if you have the `ffsl' function. */ -#define HAVE_FFSL 1 - -/* Define to 1 if the system has the type `float _Complex'. */ -#define HAVE_FLOAT__COMPLEX 1 - -/* Define to 1 if you have the `fls' function. */ -/* #undef HAVE_FLS */ - -/* Define to 1 if you have the `flsl' function. */ -/* #undef HAVE_FLSL */ - -/* Define to 1 if you have the `fork' function. */ -#define HAVE_FORK 1 - -/* Define to 1 if you have the `getpagesize' function. */ -#define HAVE_GETPAGESIZE 1 - -/* Define to 1 if you have the `getpwuid' function. */ -#define HAVE_GETPWUID 1 - -/* Define to 1 if you have the `GNI_GetJobResInfo' function. 
*/ -/* #undef HAVE_GNI_GETJOBRESINFO */ - -/* Define to 1 if the system has the type `GROUP_AFFINITY'. */ -/* #undef HAVE_GROUP_AFFINITY */ - -/* Define to 1 if the system has the type `GROUP_RELATIONSHIP'. */ -/* #undef HAVE_GROUP_RELATIONSHIP */ - -/* Define to 1 if you have the header file. */ -#define HAVE_GRP_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_HCOLL_API_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_HOSTLIB_H */ - -/* Define to 1 if you have the `host_info' function. */ -/* #undef HAVE_HOST_INFO */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_HWLOC_H */ - -/* Define to 1 if you have the `ibv_cmd_open_xrcd' function. */ -/* #undef HAVE_IBV_CMD_OPEN_XRCD */ - -/* Define to 1 if you have the `ibv_create_xrc_rcv_qp' function. */ -/* #undef HAVE_IBV_CREATE_XRC_RCV_QP */ - -/* Define to 1 if you have the `ibv_fork_init' function. */ -/* #undef HAVE_IBV_FORK_INIT */ - -/* Define to 1 if you have the `ibv_get_device_list' function. */ -/* #undef HAVE_IBV_GET_DEVICE_LIST */ - -/* Define to 1 if you have the `ibv_resize_cq' function. */ -/* #undef HAVE_IBV_RESIZE_CQ */ - -/* Define to 1 if you have the header file. */ -#define HAVE_IFADDRS_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_INFINIBAND_DRIVER_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_INFINIBAND_VERBS_H */ - -/* Define to 1 if the system has the type `int128_t'. */ -/* #undef HAVE_INT128_T */ - -/* Define to 1 if the system has the type `int16_t'. */ -#define HAVE_INT16_T 1 - -/* Define to 1 if the system has the type `int32_t'. */ -#define HAVE_INT32_T 1 - -/* Define to 1 if the system has the type `int64_t'. */ -#define HAVE_INT64_T 1 - -/* Define to 1 if the system has the type `int8_t'. */ -#define HAVE_INT8_T 1 - -/* Define to 1 if the system has the type `intptr_t'. */ -#define HAVE_INTPTR_T 1 - -/* Define to 1 if you have the header file. 
*/ -#define HAVE_INTTYPES_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_IOLIB_H */ - -/* Define to 1 if you have the `isatty' function. */ -#define HAVE_ISATTY 1 - -/* Define to 1 if the system has the type `KAFFINITY'. */ -/* #undef HAVE_KAFFINITY */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_KNEM_IO_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_KSTAT_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_LIBCR_H */ - -/* Define to 1 if you have the `event' library (-levent). */ -/* #undef HAVE_LIBEVENT */ - -/* Define to 1 if you have the `event_pthreads' library (-levent_pthreads). */ -/* #undef HAVE_LIBEVENT_PTHREADS */ - -/* Define to 1 if we have -lgdi32 */ -/* #undef HAVE_LIBGDI32 */ - -/* Define to 1 if you have the header file. */ -#define HAVE_LIBGEN_H 1 - -/* Define to 1 if we have -lkstat */ -/* #undef HAVE_LIBKSTAT */ - -/* Define to 1 if we have -llgrp */ -/* #undef HAVE_LIBLGRP */ - -/* set to 1 if should use libnl v3, set to 0 for libnl v11 */ -#define HAVE_LIBNL3 0 - -/* Define to 1 if you have the `pci' library (-lpci). */ -/* #undef HAVE_LIBPCI */ - -/* Define to 1 if you have the `psm_infinipath' library (-lpsm_infinipath). */ -/* #undef HAVE_LIBPSM_INFINIPATH */ - -/* Define to 1 if you have the `pthread' library (-lpthread). */ -#define HAVE_LIBPTHREAD 1 - -/* Define to 1 if you have the `rt' library (-lrt). */ -#define HAVE_LIBRT 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_LIBUTIL_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_LIMITS_H 1 - -/* Define to 1 if the system has the type `LOGICAL_PROCESSOR_RELATIONSHIP'. */ -/* #undef HAVE_LOGICAL_PROCESSOR_RELATIONSHIP */ - -/* Define to 1 if the system has the type `long double'. */ -#define HAVE_LONG_DOUBLE 1 - -/* Define to 1 if the system has the type `long double _Complex'. 
*/ -#define HAVE_LONG_DOUBLE__COMPLEX 1 - -/* Define to 1 if the system has the type `long long'. */ -#define HAVE_LONG_LONG 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_LSF_LSBATCH_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_LSF_LSF_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_LTDL_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_LUSTRE_LIBLUSTREAPI_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_MACH_MACH_HOST_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_MACH_MACH_INIT_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_MACH_MACH_TIME_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_MALLOC_H 1 - -/* Define to 1 if you have the `memalign' function. */ -#define HAVE_MEMALIGN 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_MEMORY_H 1 - -/* Define to 1 if you have the `mkfifo' function. */ -#define HAVE_MKFIFO 1 - -/* Define to 1 if you have the `mmap' function. */ -#define HAVE_MMAP 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_MNTENT_H 1 - -/* Define to 1 if the system has the type `mode_t'. */ -#define HAVE_MODE_T 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_MTCP_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_MUNGE_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_MXM_API_MXM_API_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_NDBM_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_NETDB_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_NETINET_IN_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_NETINET_TCP_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_NET_IF_H 1 - -/* Define to 1 if you have the header file. 
*/ -/* #undef HAVE_NET_UIO_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_NUMAIF_H */ - -/* Define to 1 if the system has the type `NUMA_NODE_RELATIONSHIP'. */ -/* #undef HAVE_NUMA_NODE_RELATIONSHIP */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_NVCTRL_NVCTRL_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_NVML_H */ - -/* Define to 1 if you have the `on_exit' function. */ -#define HAVE_ON_EXIT 1 - -/* Define to 1 if you have the `openat' function. */ -#define HAVE_OPENAT 1 - -/* Define to 1 if you have the `openpty' function. */ -#define HAVE_OPENPTY 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_PATHS_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PCI_PCI_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PICL_H */ - -/* Define to 1 if you have the `pipe' function. */ -#define HAVE_PIPE 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PLFS_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PMAPI_H */ - -/* Define to 1 if you have the `pm_cycles' function. */ -/* #undef HAVE_PM_CYCLES */ - -/* Define to 1 if you have the header file. */ -#define HAVE_POLL_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PORTALS4_H */ - -/* Define to 1 if you have the `posix_memalign' function. */ -#define HAVE_POSIX_MEMALIGN 1 - -/* Define to 1 if you have the `printstack' function. */ -/* #undef HAVE_PRINTSTACK */ - -/* Define to 1 if the system has the type `PROCESSOR_CACHE_TYPE'. */ -/* #undef HAVE_PROCESSOR_CACHE_TYPE */ - -/* Define to 1 if the system has the type `PROCESSOR_GROUP_INFO'. */ -/* #undef HAVE_PROCESSOR_GROUP_INFO */ - -/* Define to 1 if the system has the type `PROCESSOR_RELATIONSHIP'. */ -/* #undef HAVE_PROCESSOR_RELATIONSHIP */ - -/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_BLOCK'. 
*/ -/* #undef HAVE_PSAPI_WORKING_SET_EX_BLOCK */ - -/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_INFORMATION'. - */ -/* #undef HAVE_PSAPI_WORKING_SET_EX_INFORMATION */ - -/* libfabric: whether to build the PSM provider or not */ -/* #undef HAVE_PSM */ - -/* libfabric: do not build PSM provider as a DL */ -/* #undef HAVE_PSM_DL */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PSM_H */ - -/* Define to 1 if you have the `pthread_condattr_setpshared' function. */ -#define HAVE_PTHREAD_CONDATTR_SETPSHARED 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_PTHREAD_H 1 - -/* Define to 1 if you have the `pthread_mutexattr_setpshared' function. */ -#define HAVE_PTHREAD_MUTEXATTR_SETPSHARED 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PTHREAD_NP_H */ - -/* Define to 1 if the system has the type `pthread_t'. */ -#define HAVE_PTHREAD_T 1 - -/* Define to 1 if the system has the type `ptrdiff_t'. */ -#define HAVE_PTRDIFF_T 1 - -/* Define to 1 if you have the `ptsname' function. */ -#define HAVE_PTSNAME 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_PTY_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_PVFS2_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_PWD_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_RDMA_FABRIC_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_RDMA_RDMA_CMA_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_RDMA_RSOCKET_H */ - -/* Define to 1 if you have the `regcmp' function. */ -/* #undef HAVE_REGCMP */ - -/* Define to 1 if you have the `regexec' function. */ -#define HAVE_REGEXEC 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_REGEX_H 1 - -/* Define to 1 if you have the `regfree' function. */ -#define HAVE_REGFREE 1 - -/* Define to 1 if the system has the type `RelationProcessorPackage'. 
*/ -/* #undef HAVE_RELATIONPROCESSORPACKAGE */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SCHED_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SCIF_H 1 - -/* Define to 1 if you have the `setenv' function. */ -#define HAVE_SETENV 1 - -/* Define to 1 if you have the `setlocale' function. */ -#define HAVE_SETLOCALE 1 - -/* Define to 1 if you have the `setpgid' function. */ -#define HAVE_SETPGID 1 - -/* Define to 1 if you have the `setsid' function. */ -#define HAVE_SETSID 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SHLWAPI_H */ - -/* Define to 1 if `si_band' is a member of `siginfo_t'. */ -#define HAVE_SIGINFO_T_SI_BAND 1 - -/* Define to 1 if `si_fd' is a member of `siginfo_t'. */ -#define HAVE_SIGINFO_T_SI_FD 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SIGNAL_H 1 - -/* Define to 1 if you have the `snprintf' function. */ -#define HAVE_SNPRINTF 1 - -/* Define to 1 if you have the `socketpair' function. */ -#define HAVE_SOCKETPAIR 1 - -/* libfabric: do not build sockets provider */ -/* #undef HAVE_SOCKETS */ - -/* libfabric: do not build sockets provider */ -/* #undef HAVE_SOCKETS_DL */ - -/* Define to 1 if the system has the type `socklen_t'. */ -#define HAVE_SOCKLEN_T 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SOCKLIB_H */ - -/* Define to 1 if the system has the type `ssize_t'. */ -#define HAVE_SSIZE_T 1 - -/* Define to 1 if you have the `statfs' function. */ -#define HAVE_STATFS 1 - -/* Define to 1 if you have the `statvfs' function. */ -#define HAVE_STATVFS 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDARG_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDBOOL_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDDEF_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STDINT_H 1 - -/* Define to 1 if you have the header file. 
*/ -#define HAVE_STDLIB_H 1 - -/* Define to 1 if you have the `strftime' function. */ -#define HAVE_STRFTIME 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRINGS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_STRING_H 1 - -/* Define to 1 if you have the `strncasecmp' function. */ -#define HAVE_STRNCASECMP 1 - -/* Define to 1 if you have the `strncpy_s' function. */ -/* #undef HAVE_STRNCPY_S */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_STROPTS_H */ - -/* Define to 1 if you have the `strsignal' function. */ -#define HAVE_STRSIGNAL 1 - -/* Define to 1 if `d_type' is a member of `struct dirent'. */ -#define HAVE_STRUCT_DIRENT_D_TYPE 1 - -/* Define to 1 if `transport_type' is a member of `struct ibv_device'. */ -/* #undef HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE */ - -/* Define to 1 if `ifr_hwaddr' is a member of `struct ifreq'. */ -#define HAVE_STRUCT_IFREQ_IFR_HWADDR 1 - -/* Define to 1 if `ifr_mtu' is a member of `struct ifreq'. */ -#define HAVE_STRUCT_IFREQ_IFR_MTU 1 - -/* Define to 1 if the system has the type `struct sockaddr_in'. */ -#define HAVE_STRUCT_SOCKADDR_IN 1 - -/* Define to 1 if the system has the type `struct sockaddr_in6'. */ -#define HAVE_STRUCT_SOCKADDR_IN6 1 - -/* Define to 1 if `sa_len' is a member of `struct sockaddr'. */ -/* #undef HAVE_STRUCT_SOCKADDR_SA_LEN */ - -/* Define to 1 if the system has the type `struct sockaddr_storage'. */ -#define HAVE_STRUCT_SOCKADDR_STORAGE 1 - -/* Define to 1 if the system has the type `struct sockaddr_un'. */ -#define HAVE_STRUCT_SOCKADDR_UN 1 - -/* Define to 1 if `f_fstypename' is a member of `struct statfs'. */ -/* #undef HAVE_STRUCT_STATFS_F_FSTYPENAME */ - -/* Define to 1 if `f_type' is a member of `struct statfs'. */ -#define HAVE_STRUCT_STATFS_F_TYPE 1 - -/* Define to 1 if `f_basetype' is a member of `struct statvfs'. */ -/* #undef HAVE_STRUCT_STATVFS_F_BASETYPE */ - -/* Define to 1 if `f_fstypename' is a member of `struct statvfs'. 
*/ -/* #undef HAVE_STRUCT_STATVFS_F_FSTYPENAME */ - -/* Define to 1 if you have the `syscall' function. */ -#define HAVE_SYSCALL 1 - -/* Define to 1 if you have the `sysconf' function. */ -#define HAVE_SYSCONF 1 - -/* Define to '1' if sysctl is present and usable */ -#define HAVE_SYSCTL 1 - -/* Define to '1' if sysctlbyname is present and usable */ -/* #undef HAVE_SYSCTLBYNAME */ - -/* Define to 1 if you have the `syslog' function. */ -#define HAVE_SYSLOG 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYSLOG_H 1 - -/* Define to 1 if the system has the type - `SYSTEM_LOGICAL_PROCESSOR_INFORMATION'. */ -/* #undef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION */ - -/* Define to 1 if the system has the type - `SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX'. */ -/* #undef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_CPUSET_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_FCNTL_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_IOCTL_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_IPC_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_LGRP_USER_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_MMAN_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_MOUNT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_PARAM_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_POLL_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_PRCTL_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_QUEUE_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_RESOURCE_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_SELECT_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_SHM_H 1 - -/* Define to 1 if you have the header file. 
*/ -#define HAVE_SYS_SOCKET_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_SOCKIO_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_STATFS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_STATVFS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_STAT_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_SYNCH_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_SYSCTL_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_TIME_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_SYS_TREE_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_TYPES_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_UIO_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_UN_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_UTSNAME_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_VFS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_SYS_WAIT_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_TARGETCONDITIONALS_H */ - -/* Define to 1 if you have the `tcgetpgrp' function. */ -#define HAVE_TCGETPGRP 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_TERMIOS_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_TIME_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_TM_H */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_TM_TREE_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_UCONTEXT_H 1 - -/* Define to 1 if the system has the type `uint128_t'. */ -/* #undef HAVE_UINT128_T */ - -/* Define to 1 if the system has the type `uint16_t'. */ -#define HAVE_UINT16_T 1 - -/* Define to 1 if the system has the type `uint32_t'. */ -#define HAVE_UINT32_T 1 - -/* Define to 1 if the system has the type `uint64_t'. 
*/ -#define HAVE_UINT64_T 1 - -/* Define to 1 if the system has the type `uint8_t'. */ -#define HAVE_UINT8_T 1 - -/* Define to 1 if the system has the type `uintptr_t'. */ -#define HAVE_UINTPTR_T 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_ULIMIT_H 1 - -/* Define to 1 if you have the `uname' function. */ -#define HAVE_UNAME 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_UNISTD_H 1 - -/* whether unix byteswap routines -- htonl, htons, nothl, ntohs -- are - available */ -#define HAVE_UNIX_BYTESWAP 1 - -/* Define to 1 if you have the `usleep' function. */ -#define HAVE_USLEEP 1 - -/* libfabric: whether to build the usnic provider or not */ -/* #undef HAVE_USNIC */ - -/* libfabric: do not build usnic provider as a DL */ -/* #undef HAVE_USNIC_DL */ - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_UTIL_H */ - -/* Define to 1 if you have the header file. */ -#define HAVE_UTMP_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_VALGRIND_VALGRIND_H */ - -/* Define to 1 if you have the `vasprintf' function. */ -#define HAVE_VASPRINTF 1 - -/* libfabric: do not build verbs provider */ -/* #undef HAVE_VERBS */ - -/* libfabric: do not build verbs provider */ -/* #undef HAVE_VERBS_DL */ - -/* Define to 1 if you have the `vsnprintf' function. */ -#define HAVE_VSNPRINTF 1 - -/* Define to 1 if you have the `vsyslog' function. */ -#define HAVE_VSYSLOG 1 - -/* Define to 1 if you have the `waitpid' function. */ -#define HAVE_WAITPID 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_X11_KEYSYM_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_X11_XLIB_H 1 - -/* Define to 1 if you have the header file. */ -#define HAVE_X11_XUTIL_H 1 - -/* Define to 1 if you have the header file. */ -/* #undef HAVE_XPMEM_H */ - -/* Define to 1 if you have the `_NSGetEnviron' function. */ -/* #undef HAVE__NSGETENVIRON */ - -/* Define to 1 if the system has the type `__float128'. 
*/ -#define HAVE___FLOAT128 1 - -/* Define to 1 if the system has the type `__int128'. */ -/* #undef HAVE___INT128 */ - -/* Define to 1 if you have the `__mmap' function. */ -/* #undef HAVE___MMAP */ - -/* Define to 1 if you have the `__munmap' function. */ -/* #undef HAVE___MUNMAP */ - -/* Define to 1 on AIX */ -/* #undef HWLOC_AIX_SYS */ - -/* Define to 1 on BlueGene/Q */ -/* #undef HWLOC_BGQ_SYS */ - -/* Whether C compiler supports symbol visibility or not */ -#define HWLOC_C_HAVE_VISIBILITY 1 - -/* Define to 1 on Darwin */ -/* #undef HWLOC_DARWIN_SYS */ - -/* Whether we are in debugging mode or not */ -/* #undef HWLOC_DEBUG */ - -/* Version of hwloc */ -/* #undef HWLOC_EXTERNAL_HWLOC_VERSION */ - -/* Define to 1 on *FREEBSD */ -/* #undef HWLOC_FREEBSD_SYS */ - -/* Whether your compiler has __attribute__ or not */ -#define HWLOC_HAVE_ATTRIBUTE 1 - -/* Whether your compiler has __attribute__ aligned or not */ -#define HWLOC_HAVE_ATTRIBUTE_ALIGNED 1 - -/* Whether your compiler has __attribute__ always_inline or not */ -#define HWLOC_HAVE_ATTRIBUTE_ALWAYS_INLINE 1 - -/* Whether your compiler has __attribute__ cold or not */ -#define HWLOC_HAVE_ATTRIBUTE_COLD 1 - -/* Whether your compiler has __attribute__ const or not */ -#define HWLOC_HAVE_ATTRIBUTE_CONST 1 - -/* Whether your compiler has __attribute__ deprecated or not */ -#define HWLOC_HAVE_ATTRIBUTE_DEPRECATED 1 - -/* Whether your compiler has __attribute__ format or not */ -#define HWLOC_HAVE_ATTRIBUTE_FORMAT 1 - -/* Whether your compiler has __attribute__ hot or not */ -#define HWLOC_HAVE_ATTRIBUTE_HOT 1 - -/* Whether your compiler has __attribute__ malloc or not */ -#define HWLOC_HAVE_ATTRIBUTE_MALLOC 1 - -/* Whether your compiler has __attribute__ may_alias or not */ -#define HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 1 - -/* Whether your compiler has __attribute__ nonnull or not */ -#define HWLOC_HAVE_ATTRIBUTE_NONNULL 1 - -/* Whether your compiler has __attribute__ noreturn or not */ -#define 
HWLOC_HAVE_ATTRIBUTE_NORETURN 1 - -/* Whether your compiler has __attribute__ no_instrument_function or not */ -#define HWLOC_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION 1 - -/* Whether your compiler has __attribute__ packed or not */ -#define HWLOC_HAVE_ATTRIBUTE_PACKED 1 - -/* Whether your compiler has __attribute__ pure or not */ -#define HWLOC_HAVE_ATTRIBUTE_PURE 1 - -/* Whether your compiler has __attribute__ sentinel or not */ -#define HWLOC_HAVE_ATTRIBUTE_SENTINEL 1 - -/* Whether your compiler has __attribute__ unused or not */ -#define HWLOC_HAVE_ATTRIBUTE_UNUSED 1 - -/* Whether your compiler has __attribute__ warn unused result or not */ -#define HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 1 - -/* Whether your compiler has __attribute__ weak alias or not */ -#define HWLOC_HAVE_ATTRIBUTE_WEAK_ALIAS 1 - -/* Define to 1 if your `ffs' function is known to be broken. */ -/* #undef HWLOC_HAVE_BROKEN_FFS */ - -/* Define to 1 if you have the `clz' function. */ -/* #undef HWLOC_HAVE_CLZ */ - -/* Define to 1 if you have the `clzl' function. */ -/* #undef HWLOC_HAVE_CLZL */ - -/* Define to 1 if the CPU_SET macro works */ -#define HWLOC_HAVE_CPU_SET 1 - -/* Define to 1 if the CPU_SET_S macro works */ -#define HWLOC_HAVE_CPU_SET_S 1 - -/* Define to 1 if you have the `cudart' SDK. 
*/ -/* #undef HWLOC_HAVE_CUDART */ - -/* Define to 1 if function `clz' is declared by system headers */ -/* #undef HWLOC_HAVE_DECL_CLZ */ - -/* Define to 1 if function `clzl' is declared by system headers */ -/* #undef HWLOC_HAVE_DECL_CLZL */ - -/* Define to 1 if function `ffs' is declared by system headers */ -#define HWLOC_HAVE_DECL_FFS 1 - -/* Define to 1 if function `ffsl' is declared by system headers */ -#define HWLOC_HAVE_DECL_FFSL 1 - -/* Define to 1 if function `fls' is declared by system headers */ -/* #undef HWLOC_HAVE_DECL_FLS */ - -/* Define to 1 if function `flsl' is declared by system headers */ -/* #undef HWLOC_HAVE_DECL_FLSL */ - -/* Define to 1 if function `strncasecmp' is declared by system headers */ -#define HWLOC_HAVE_DECL_STRNCASECMP 1 - -/* Define to 1 if you have the `ffs' function. */ -#define HWLOC_HAVE_FFS 1 - -/* Define to 1 if you have the `ffsl' function. */ -#define HWLOC_HAVE_FFSL 1 - -/* Define to 1 if you have the `fls' function. */ -/* #undef HWLOC_HAVE_FLS */ - -/* Define to 1 if you have the `flsl' function. */ -/* #undef HWLOC_HAVE_FLSL */ - -/* Define to 1 if you have the GL module components. */ -/* #undef HWLOC_HAVE_GL */ - -/* Define to 1 if you have the `libpciaccess' library. */ -/* #undef HWLOC_HAVE_LIBPCIACCESS */ - -/* Define to 1 if you have the `libxml2' library. */ -/* #undef HWLOC_HAVE_LIBXML2 */ - -/* Define to 1 if building the Linux PCI component */ -#define HWLOC_HAVE_LINUXPCI 1 - -/* Define to 1 if mbind is available. */ -/* #undef HWLOC_HAVE_MBIND */ - -/* Define to 1 if migrate_pages is available. */ -/* #undef HWLOC_HAVE_MIGRATE_PAGES */ - -/* Define to 1 if you have the `NVML' library. */ -/* #undef HWLOC_HAVE_NVML */ - -/* Define to 1 if glibc provides the old prototype (without length) of - sched_setaffinity() */ -/* #undef HWLOC_HAVE_OLD_SCHED_SETAFFINITY */ - -/* Define to 1 if you have the `OpenCL' library. 
*/ -/* #undef HWLOC_HAVE_OPENCL */ - -/* Define to 1 if `libpci' struct pci_dev has a `device_class' field. */ -/* #undef HWLOC_HAVE_PCIDEV_DEVICE_CLASS */ - -/* Define to 1 if `libpci' struct pci_dev has a `domain' field. */ -/* #undef HWLOC_HAVE_PCIDEV_DOMAIN */ - -/* Define to 1 if you have the pciutils `libpci' library. */ -/* #undef HWLOC_HAVE_PCIUTILS */ - -/* Define to 1 if `libpci' has the `pci_find_cap' function. */ -/* #undef HWLOC_HAVE_PCI_FIND_CAP */ - -/* Define to 1 if the hwloc library should support dynamically-loaded plugins - */ -/* #undef HWLOC_HAVE_PLUGINS */ - -/* `Define to 1 if you have pthread_getthrds_np' */ -/* #undef HWLOC_HAVE_PTHREAD_GETTHRDS_NP */ - -/* Define to 1 if pthread mutexes are available */ -#define HWLOC_HAVE_PTHREAD_MUTEX 1 - -/* Define to 1 if glibc provides a prototype of sched_setaffinity() */ -#define HWLOC_HAVE_SCHED_SETAFFINITY 1 - -/* Define to 1 if set_mempolicy is available. */ -/* #undef HWLOC_HAVE_SET_MEMPOLICY */ - -/* Define to 1 if you have the header file. */ -#define HWLOC_HAVE_STDINT_H 1 - -/* Define to 1 if you have the `windows.h' header. */ -/* #undef HWLOC_HAVE_WINDOWS_H */ - -/* Define to 1 if X11 headers including Xutil.h and keysym.h are available. 
*/ -#define HWLOC_HAVE_X11_KEYSYM 1 - -/* Define to 1 if you have x86 cpuid */ -#define HWLOC_HAVE_X86_CPUID 1 - -/* Define to 1 if the _syscall3 macro works */ -/* #undef HWLOC_HAVE__SYSCALL3 */ - -/* Define to 1 on HP-UX */ -/* #undef HWLOC_HPUX_SYS */ - -/* Version of hwloc */ -#define HWLOC_HWLOC191_HWLOC_VERSION "internal v1.9.2" - -/* Define to 1 on Irix */ -/* #undef HWLOC_IRIX_SYS */ - -/* Define to 1 on Linux */ -#define HWLOC_LINUX_SYS 1 - -/* Define to 1 on *NETBSD */ -/* #undef HWLOC_NETBSD_SYS */ - -/* Define to 1 on OSF */ -/* #undef HWLOC_OSF_SYS */ - -/* The size of `unsigned int', as computed by sizeof */ -#define HWLOC_SIZEOF_UNSIGNED_INT 4 - -/* The size of `unsigned long', as computed by sizeof */ -#define HWLOC_SIZEOF_UNSIGNED_LONG 8 - -/* Define to 1 on Solaris */ -/* #undef HWLOC_SOLARIS_SYS */ - -/* The hwloc symbol prefix */ -#define HWLOC_SYM_PREFIX opal_hwloc191_ - -/* The hwloc symbol prefix in all caps */ -#define HWLOC_SYM_PREFIX_CAPS OPAL_HWLOC191_ - -/* Whether we need to re-define all the hwloc public symbols or not */ -#define HWLOC_SYM_TRANSFORM 1 - -/* Define to 1 on unsupported systems */ -/* #undef HWLOC_UNSUPPORTED_SYS */ - -/* Define to 1 on WINDOWS */ -/* #undef HWLOC_WIN_SYS */ - -/* Define to 1 on x86_32 */ -/* #undef HWLOC_X86_32_ARCH */ - -/* Define to 1 on x86_64 */ -#define HWLOC_X86_64_ARCH 1 - -/* Define to the sub-directory in which libtool stores uninstalled libraries. 
- */ -#define LT_OBJDIR ".libs/" - -/* Header to include for event implementation */ -#define MCA_event_IMPLEMENTATION_HEADER "opal/mca/event/libevent2022/libevent2022.h" - -/* Header to include for hwloc implementation */ -#define MCA_hwloc_IMPLEMENTATION_HEADER "opal/mca/hwloc/hwloc191/hwloc191.h" - -/* Location of external hwloc header */ -/* #undef MCA_hwloc_external_header */ - -/* Location of external hwloc header */ -/* #undef MCA_hwloc_external_openfabrics_header */ - -/* Complete set of command line arguments given to ROMIOs configure script */ -#define MCA_io_romio_COMPLETE_CONFIGURE_FLAGS " FROM_OMPI=yes CC='gcc -std=gnu99' CFLAGS='-g -Wall -Wundef -Wno-long-long -Wsign-compare -Wmissing-prototypes -Wstrict-prototypes -Wcomment -pedantic -Werror-implicit-function-declaration -finline-functions -fno-strict-aliasing -pthread -D__EXTENSIONS__' CPPFLAGS=' -I/home/wwu12/ompi/ompi-gpu/opal/mca/hwloc/hwloc191/hwloc/include -I/home/wwu12/ompi/ompi-gpu/opal/mca/event/libevent2022/libevent -I/home/wwu12/ompi/ompi-gpu/opal/mca/event/libevent2022/libevent/include' FFLAGS='' LDFLAGS=' ' --enable-shared --disable-static --prefix=/home/wwu12/ompi/build-gpu --disable-aio --disable-weak-symbols --enable-strict" - -/* Set of user-defined configure flags given to ROMIOs configure script via - --with-io-romio-flags */ -#define MCA_io_romio_USER_CONFIGURE_FLAGS "" - -/* Header to include for memcpy implementation */ -#define MCA_memcpy_IMPLEMENTATION_HEADER "opal/mca/memcpy/base/memcpy_base_default.h" - -/* Header to include for parts of the memory implementation */ -#define MCA_memory_IMPLEMENTATION_HEADER "opal/mca/memory/base/empty.h" - -/* Defined to 1 if ompi:mtl should use direct calls instead of components */ -#define MCA_ompi_mtl_DIRECT_CALL 0 - -/* name of component to use for direct calls, if MCA_ompi_mtl_DIRECT_CALL is 1 - */ -#define MCA_ompi_mtl_DIRECT_CALL_COMPONENT - -/* Header ompi:mtl includes to be direct called */ -#define MCA_ompi_mtl_DIRECT_CALL_HEADER 
"" - -/* Defined to 1 if ompi:pml should use direct calls instead of components */ -#define MCA_ompi_pml_DIRECT_CALL 0 - -/* name of component to use for direct calls, if MCA_ompi_pml_DIRECT_CALL is 1 - */ -#define MCA_ompi_pml_DIRECT_CALL_COMPONENT - -/* Header ompi:pml includes to be direct called */ -#define MCA_ompi_pml_DIRECT_CALL_HEADER "" - -/* Defined to 1 if oshmem:memheap should use direct calls instead of - components */ -#define MCA_oshmem_memheap_DIRECT_CALL 0 - -/* name of component to use for direct calls, if - MCA_oshmem_memheap_DIRECT_CALL is 1 */ -#define MCA_oshmem_memheap_DIRECT_CALL_COMPONENT - -/* Header oshmem:memheap includes to be direct called */ -#define MCA_oshmem_memheap_DIRECT_CALL_HEADER "" - -/* Defined to 1 if oshmem:spml should use direct calls instead of components - */ -#define MCA_oshmem_spml_DIRECT_CALL 0 - -/* name of component to use for direct calls, if MCA_oshmem_spml_DIRECT_CALL - is 1 */ -#define MCA_oshmem_spml_DIRECT_CALL_COMPONENT - -/* Header oshmem:spml includes to be direct called */ -#define MCA_oshmem_spml_DIRECT_CALL_HEADER "" - -/* Header to include for rte implementation */ -#define MCA_rte_IMPLEMENTATION_HEADER "ompi/mca/rte/orte/rte_orte.h" - -/* Header to include for timer implementation */ -#define MCA_timer_IMPLEMENTATION_HEADER "opal/mca/timer/linux/timer_linux.h" - -/* Whether ptmalloc2 is supported on this system or not */ -#define MEMORY_LINUX_PTMALLOC2 1 - -/* Whether ummunotify is supported on this system or not */ -#define MEMORY_LINUX_UMMUNOTIFY 0 - -/* Whether we can use M-PAGE supported since MOFED 1.8 */ -#define MPAGE_ENABLE 0 - -/* create_flags field is part of ibv_exp_reg_mr_in */ -#define MPAGE_HAVE_IBV_EXP_REG_MR_CREATE_FLAGS 0 - -/* exp_access field is part of ibv_exp_reg_shared_mr_in */ -#define MPAGE_HAVE_SMR_EXP_ACCESS 0 - -/* Maximum value for an MPI_Count */ -#define MPI_COUNT_MAX 0x7fffffffffffffffll - -/* Whether we want to check MPI parameters always, never, or decide at - run-time 
*/ -#define MPI_PARAM_CHECK ompi_mpi_param_check - -/* Alignment of Fortran CHARACTER */ -#define OMPI_ALIGNMENT_FORTRAN_CHARACTER 1 - -/* Alignment of Fortran COMPLEX */ -#define OMPI_ALIGNMENT_FORTRAN_COMPLEX 4 - -/* Alignment of Fortran COMPLEX*16 */ -#define OMPI_ALIGNMENT_FORTRAN_COMPLEX16 8 - -/* Alignment of Fortran COMPLEX*32 */ -#define OMPI_ALIGNMENT_FORTRAN_COMPLEX32 4 - -/* Alignment of Fortran COMPLEX*4 */ -#define OMPI_ALIGNMENT_FORTRAN_COMPLEX4 4 - -/* Alignment of Fortran COMPLEX*8 */ -#define OMPI_ALIGNMENT_FORTRAN_COMPLEX8 4 - -/* Alignment of Fortran DOUBLE COMPLEX */ -#define OMPI_ALIGNMENT_FORTRAN_DOUBLE_COMPLEX 8 - -/* Alignment of Fortran DOUBLE PRECISION */ -#define OMPI_ALIGNMENT_FORTRAN_DOUBLE_PRECISION 8 - -/* Alignment of Fortran INTEGER */ -#define OMPI_ALIGNMENT_FORTRAN_INTEGER 4 - -/* Alignment of Fortran INTEGER*1 */ -#define OMPI_ALIGNMENT_FORTRAN_INTEGER1 1 - -/* Alignment of Fortran INTEGER*16 */ -#define OMPI_ALIGNMENT_FORTRAN_INTEGER16 4 - -/* Alignment of Fortran INTEGER*2 */ -#define OMPI_ALIGNMENT_FORTRAN_INTEGER2 2 - -/* Alignment of Fortran INTEGER*4 */ -#define OMPI_ALIGNMENT_FORTRAN_INTEGER4 4 - -/* Alignment of Fortran INTEGER*8 */ -#define OMPI_ALIGNMENT_FORTRAN_INTEGER8 8 - -/* Alignment of Fortran LOGICAL */ -#define OMPI_ALIGNMENT_FORTRAN_LOGICAL 4 - -/* Alignment of Fortran LOGICAL*1 */ -#define OMPI_ALIGNMENT_FORTRAN_LOGICAL1 1 - -/* Alignment of Fortran LOGICAL*2 */ -#define OMPI_ALIGNMENT_FORTRAN_LOGICAL2 2 - -/* Alignment of Fortran LOGICAL*4 */ -#define OMPI_ALIGNMENT_FORTRAN_LOGICAL4 4 - -/* Alignment of Fortran LOGICAL*8 */ -#define OMPI_ALIGNMENT_FORTRAN_LOGICAL8 8 - -/* Alignment of Fortran REAL */ -#define OMPI_ALIGNMENT_FORTRAN_REAL 4 - -/* Alignment of Fortran REAL*16 */ -#define OMPI_ALIGNMENT_FORTRAN_REAL16 4 - -/* Alignment of Fortran REAL*2 */ -#define OMPI_ALIGNMENT_FORTRAN_REAL2 4 - -/* Alignment of Fortran REAL*4 */ -#define OMPI_ALIGNMENT_FORTRAN_REAL4 4 - -/* Alignment of Fortran REAL*8 */ 
-#define OMPI_ALIGNMENT_FORTRAN_REAL8 8 - -/* Whether we want MPI C++ support or not */ -#define OMPI_BUILD_CXX_BINDINGS 0 - -/* Whether we built the 'use mpi_f08' prototype subarray-based implementation - or not (i.e., whether to build the use-mpi-f08-desc prototype or the - regular use-mpi-f08 implementation) */ -#define OMPI_BUILD_FORTRAN_F08_SUBARRAYS 0 - -/* Whether we will build the MPI Fortran mpif.h bindings or not */ -#define OMPI_BUILD_FORTRAN_MPIFH_BINDINGS 1 - -/* For ompi_info: Whether we will build the MPI Fortran "use mpi_f08" bindings - or not */ -#define OMPI_BUILD_FORTRAN_USEMPIF08_BINDINGS 0 - -/* Whether we will build the MPI Fortran "use mpi" bindings or not */ -#define OMPI_BUILD_FORTRAN_USEMPI_BINDINGS 1 - -/* OMPI underlying C++ compiler */ -#define OMPI_CXX "g++" - -/* Whether C++ compiler supports __builtin_expect */ -#define OMPI_CXX_HAVE_BUILTIN_EXPECT 0 - -/* Whether C++ compiler supports __builtin_prefetch */ -#define OMPI_CXX_HAVE_BUILTIN_PREFETCH 0 - -/* Whether a const_cast on a 2-d array will work with the C++ compiler */ -#define OMPI_CXX_SUPPORTS_2D_CONST_CAST 0 - -/* Enable contributed software package libompitrace */ -#define OMPI_ENABLE_CONTRIB_libompitrace 1 - -/* Whether we want MPI profiling or not */ -#define OMPI_ENABLE_MPI_PROFILING 1 - -/* Enable MPI_THREAD_MULTIPLE */ -#define OMPI_ENABLE_THREAD_MULTIPLE 0 - -/* Underlying Fortran compiler */ -#define OMPI_FC "gfortran" - -/* Absolutey path to the underlying Fortran compiler found by configure */ -#define OMPI_FC_ABSOLUTE "/usr/bin/gfortran" - -/* Whether the mpif.h interface supports the MPI_SIZEOF interface or not */ -#define OMPI_FORTRAN_BUILD_SIZEOF 0 - -/* Whether fortran symbols are all caps or not */ -#define OMPI_FORTRAN_CAPS 0 - -/* Whether fortran symbols have a trailing double underscore or not */ -#define OMPI_FORTRAN_DOUBLE_UNDERSCORE 0 - -/* How many bytes the mpi_f08 TYPE(MPI_) handles will be */ -#define OMPI_FORTRAN_F08_HANDLE_SIZE 4 - -/* Max handle 
value for fortran MPI handles, effectively min(INT_MAX, max - fortran INTEGER value) */ -#define OMPI_FORTRAN_HANDLE_MAX 2147483647 - -/* For mpi-f08-interfaces-callbacks.f90 and ompi_info: whether the compiler - supports the "abstract" keyword or not */ -#define OMPI_FORTRAN_HAVE_ABSTRACT 0 - -/* For ompi/mpi/fortran/use-mpi-f08/blah.F90 and blah.h and ompi_info: whether - the compiler supports the "asynchronous" keyword or not */ -#define OMPI_FORTRAN_HAVE_ASYNCHRONOUS 0 - -/* For ompi_info: Whether the compiler supports all forms of BIND(C) that we - need */ -#define OMPI_FORTRAN_HAVE_BIND_C 0 - -/* For ompi_info: Whether the compiler supports SUBROUTINE ... BIND(C) or not - */ -#define OMPI_FORTRAN_HAVE_BIND_C_SUB 0 - -/* For ompi_info: Whether the compiler supports TYPE, BIND(C) or not */ -#define OMPI_FORTRAN_HAVE_BIND_C_TYPE 0 - -/* For ompi_info: Whether the compiler supports TYPE, BIND(C, NAME="name") or - not */ -#define OMPI_FORTRAN_HAVE_BIND_C_TYPE_NAME 0 - -/* For ompi/mpi/fortran/use-mpi-f08/blah.F90 and blah.h and ompi_info: whether - the compiler supports c_funloc or not */ -#define OMPI_FORTRAN_HAVE_C_FUNLOC 0 - -/* For ompi_info: Whether the Fortran compiler supports the Fortran 2008 - "assumed rank" syntax or not */ -#define OMPI_FORTRAN_HAVE_F08_ASSUMED_RANK 0 - -/* Whether the Fortran compiler supports ignore TKR functionality or not */ -#define OMPI_FORTRAN_HAVE_IGNORE_TKR 0 - -/* Whether the compiler supports INTERFACE or not */ -#define OMPI_FORTRAN_HAVE_INTERFACE 1 - -/* For ompi_info: Whether the compiler supports ISO_C_BINDING or not */ -#define OMPI_FORTRAN_HAVE_ISO_C_BINDING 1 - -/* Whether the compiler supports ISO_FORTRAN_ENV or not */ -#define OMPI_FORTRAN_HAVE_ISO_FORTRAN_ENV 0 - -/* For ompi_info: whether the Fortran compiler supports optional arguments or - not */ -#define OMPI_FORTRAN_HAVE_OPTIONAL_ARGS 0 - -/* For mpi-f08-types.f90 and ompi_info: whether the compiler supports the - "private" keyword or not (used in MPI_Status) 
*/ -#define OMPI_FORTRAN_HAVE_PRIVATE 0 - -/* For ompi/mpi/fortran/use-mpi-f08/blah.F90 and blah.h and ompi_info: whether - the compiler supports the "procedure" keyword or not */ -#define OMPI_FORTRAN_HAVE_PROCEDURE 0 - -/* For mpi-f08-types.f90 and .F90 and ompi_info: whether the compiler supports - the "protected" keyword or not */ -#define OMPI_FORTRAN_HAVE_PROTECTED 0 - -/* Whether the compiler supports STORAGE_SIZE on relevant types */ -#define OMPI_FORTRAN_HAVE_STORAGE_SIZE 0 - -/* Pre declaration for FORTRAN ignore parameter TKR behavior */ -#define OMPI_FORTRAN_IGNORE_TKR_PREDECL "" - -/* Type declaration for FORTRAN ignore parameter TKR behavior */ -#define OMPI_FORTRAN_IGNORE_TKR_TYPE - -/* Max dimension rank of Fortran arrays */ -#define OMPI_FORTRAN_MAX_ARRAY_RANK 7 - -/* Whether the mpi_f08 implementation is using wrapper routines ("bad" Fortran - compiler) or weak symbols ("good" Fortran compiler) for the F08 interface - definition implementations */ -#define OMPI_FORTRAN_NEED_WRAPPER_ROUTINES 0 - -/* Whether fortran symbols have no trailing underscore or not */ -#define OMPI_FORTRAN_PLAIN 0 - -/* Whether fortran symbols have a trailing underscore or not */ -#define OMPI_FORTRAN_SINGLE_UNDERSCORE 1 - -/* Value to load to the MPI_SUBARRAYS_SUPPORTED compile-time constant */ -#define OMPI_FORTRAN_SUBARRAYS_SUPPORTED .FALSE. - -/* Fortran value for LOGICAL .TRUE. 
value */ -#define OMPI_FORTRAN_VALUE_TRUE 1 - -/* Greek - alpha, beta, etc - release number of Open MPI */ -#define OMPI_GREEK_VERSION "a1" - -/* Wether we want sparse process groups */ -#define OMPI_GROUP_SPARSE 0 - -/* Whether or not we have compiled with C++ exceptions support */ -#define OMPI_HAVE_CXX_EXCEPTION_SUPPORT 0 - -/* Whether we have Fortran CHARACTER or not */ -#define OMPI_HAVE_FORTRAN_CHARACTER 1 - -/* Whether we have Fortran COMPLEX or not */ -#define OMPI_HAVE_FORTRAN_COMPLEX 1 - -/* Whether we have Fortran COMPLEX*16 or not */ -#define OMPI_HAVE_FORTRAN_COMPLEX16 1 - -/* Whether we have Fortran COMPLEX*32 or not */ -#define OMPI_HAVE_FORTRAN_COMPLEX32 0 - -/* Whether we have Fortran COMPLEX*4 or not */ -#define OMPI_HAVE_FORTRAN_COMPLEX4 0 - -/* Whether we have Fortran COMPLEX*8 or not */ -#define OMPI_HAVE_FORTRAN_COMPLEX8 1 - -/* Whether we have Fortran DOUBLE COMPLEX or not */ -#define OMPI_HAVE_FORTRAN_DOUBLE_COMPLEX 1 - -/* Whether we have Fortran DOUBLE PRECISION or not */ -#define OMPI_HAVE_FORTRAN_DOUBLE_PRECISION 1 - -/* Whether we have Fortran INTEGER or not */ -#define OMPI_HAVE_FORTRAN_INTEGER 1 - -/* Whether we have Fortran INTEGER*1 or not */ -#define OMPI_HAVE_FORTRAN_INTEGER1 1 - -/* Whether we have Fortran INTEGER*16 or not */ -#define OMPI_HAVE_FORTRAN_INTEGER16 0 - -/* Whether we have Fortran INTEGER*2 or not */ -#define OMPI_HAVE_FORTRAN_INTEGER2 1 - -/* Whether we have Fortran INTEGER*4 or not */ -#define OMPI_HAVE_FORTRAN_INTEGER4 1 - -/* Whether we have Fortran INTEGER*8 or not */ -#define OMPI_HAVE_FORTRAN_INTEGER8 1 - -/* Whether we have Fortran LOGICAL or not */ -#define OMPI_HAVE_FORTRAN_LOGICAL 1 - -/* Whether we have Fortran LOGICAL*1 or not */ -#define OMPI_HAVE_FORTRAN_LOGICAL1 1 - -/* Whether we have Fortran LOGICAL*2 or not */ -#define OMPI_HAVE_FORTRAN_LOGICAL2 1 - -/* Whether we have Fortran LOGICAL*4 or not */ -#define OMPI_HAVE_FORTRAN_LOGICAL4 1 - -/* Whether we have Fortran LOGICAL*8 or not */ -#define 
OMPI_HAVE_FORTRAN_LOGICAL8 1 - -/* Whether we have Fortran REAL or not */ -#define OMPI_HAVE_FORTRAN_REAL 1 - -/* Whether we have Fortran REAL*16 or not */ -#define OMPI_HAVE_FORTRAN_REAL16 0 - -/* Whether we have Fortran REAL*2 or not */ -#define OMPI_HAVE_FORTRAN_REAL2 0 - -/* Whether we have Fortran REAL*4 or not */ -#define OMPI_HAVE_FORTRAN_REAL4 1 - -/* Whether we have Fortran REAL*8 or not */ -#define OMPI_HAVE_FORTRAN_REAL8 1 - -/* Fortrn KIND number for CHARACTER */ -#define OMPI_KIND_FORTRAN_CHARACTER C_SIGNED_CHAR - -/* Fortrn KIND number for COMPLEX */ -#define OMPI_KIND_FORTRAN_COMPLEX C_FLOAT_COMPLEX - -/* Fortrn KIND number for COMPLEX*16 */ -#define OMPI_KIND_FORTRAN_COMPLEX16 C_DOUBLE_COMPLEX - -/* Fortrn KIND number for COMPLEX*32 */ -#define OMPI_KIND_FORTRAN_COMPLEX32 0 - -/* Fortrn KIND number for COMPLEX*4 */ -#define OMPI_KIND_FORTRAN_COMPLEX4 0 - -/* Fortrn KIND number for COMPLEX*8 */ -#define OMPI_KIND_FORTRAN_COMPLEX8 C_FLOAT_COMPLEX - -/* Fortrn KIND number for DOUBLE COMPLEX */ -#define OMPI_KIND_FORTRAN_DOUBLE_COMPLEX C_DOUBLE_COMPLEX - -/* Fortrn KIND number for DOUBLE PRECISION */ -#define OMPI_KIND_FORTRAN_DOUBLE_PRECISION C_DOUBLE - -/* Fortrn KIND number for INTEGER */ -#define OMPI_KIND_FORTRAN_INTEGER C_INT - -/* Fortrn KIND number for INTEGER*1 */ -#define OMPI_KIND_FORTRAN_INTEGER1 C_SIGNED_CHAR - -/* Fortrn KIND number for INTEGER*16 */ -#define OMPI_KIND_FORTRAN_INTEGER16 0 - -/* Fortrn KIND number for INTEGER*2 */ -#define OMPI_KIND_FORTRAN_INTEGER2 C_SHORT - -/* Fortrn KIND number for INTEGER*4 */ -#define OMPI_KIND_FORTRAN_INTEGER4 C_INT - -/* Fortrn KIND number for INTEGER*8 */ -#define OMPI_KIND_FORTRAN_INTEGER8 C_LONG_LONG - -/* Fortrn KIND number for LOGICAL */ -#define OMPI_KIND_FORTRAN_LOGICAL C_INT - -/* Fortrn KIND number for LOGICAL*1 */ -#define OMPI_KIND_FORTRAN_LOGICAL1 C_SIGNED_CHAR - -/* Fortrn KIND number for LOGICAL*2 */ -#define OMPI_KIND_FORTRAN_LOGICAL2 C_SHORT - -/* Fortrn KIND number for LOGICAL*4 */ 
-#define OMPI_KIND_FORTRAN_LOGICAL4 C_INT - -/* Fortrn KIND number for LOGICAL*8 */ -#define OMPI_KIND_FORTRAN_LOGICAL8 C_LONG_LONG - -/* Fortrn KIND number for REAL */ -#define OMPI_KIND_FORTRAN_REAL C_FLOAT - -/* Fortrn KIND number for REAL*16 */ -#define OMPI_KIND_FORTRAN_REAL16 0 - -/* Fortrn KIND number for REAL*2 */ -#define OMPI_KIND_FORTRAN_REAL2 0 - -/* Fortrn KIND number for REAL*4 */ -#define OMPI_KIND_FORTRAN_REAL4 C_FLOAT - -/* Fortrn KIND number for REAL*8 */ -#define OMPI_KIND_FORTRAN_REAL8 C_DOUBLE - -/* Major release number of Open MPI */ -#define OMPI_MAJOR_VERSION 1 - -/* Minor release number of Open MPI */ -#define OMPI_MINOR_VERSION 9 - -/* MPI Extensions included in libmpi */ -#define OMPI_MPIEXT_COMPONENTS "" - -/* Type of MPI_Aint */ -#define OMPI_MPI_AINT_TYPE ptrdiff_t - -/* Contributed software packages built with Open MPI */ -#define OMPI_MPI_CONTRIBS "libompitrace" - -/* Size of the MPI_Count datatype */ -#define OMPI_MPI_COUNT_SIZE 8 - -/* Type of the MPI_Count datatype */ -#define OMPI_MPI_COUNT_TYPE long long - -/* Size of the MPI_Offset */ -#define OMPI_MPI_OFFSET_SIZE 8 - -/* Type of MPI_Offset */ -#define OMPI_MPI_OFFSET_TYPE long long - -/* Enable flow control for Portals4 MTL */ -#define OMPI_MTL_PORTALS4_FLOW_CONTROL 1 - -/* MPI datatype corresponding to MPI_Offset */ -#define OMPI_OFFSET_DATATYPE MPI_LONG_LONG - -/* Whether we want to check MPI parameters never or possible (an integer - constant) */ -#define OMPI_PARAM_CHECK 1 - -/* Index into endpoint array for BML */ -#define OMPI_PROC_ENDPOINT_TAG_BML 0 - -/* Maximum number of endpoint entries to be attached to an ompi_proc_t */ -#define OMPI_PROC_ENDPOINT_TAG_MAX 1 - -/* Index into endpoint array for MTL */ -/* #undef OMPI_PROC_ENDPOINT_TAG_MTL */ - -/* Index into endpoint array for PML */ -/* #undef OMPI_PROC_ENDPOINT_TAG_PML */ - -/* Index into endpoint array for PORTALS4 */ -/* #undef OMPI_PROC_ENDPOINT_TAG_PORTALS4 */ - -/* Whether OMPI should provide MPI File 
interface */ -#define OMPI_PROVIDE_MPI_FILE_INTERFACE 1 - -/* Whether Fortran REAL*16 matches the bit format of the equivalent C type */ -#define OMPI_REAL16_MATCHES_C 0 - -/* Release date of Open MPI */ -#define OMPI_RELEASE_DATE "Unreleased developer copy" - -/* Release release number of Open MPI */ -#define OMPI_RELEASE_VERSION 0 - -/* The repository version Open MPI */ -#define OMPI_REPO_REV "dev-1510-g40fe521" - -/* Defined to 1 if the OMPI runtime component is ORTE */ -#define OMPI_RTE_ORTE 1 - -/* Size of Fortran CHARACTER */ -#define OMPI_SIZEOF_FORTRAN_CHARACTER 1 - -/* Size of Fortran COMPLEX */ -#define OMPI_SIZEOF_FORTRAN_COMPLEX 8 - -/* Size of Fortran COMPLEX*16 */ -#define OMPI_SIZEOF_FORTRAN_COMPLEX16 16 - -/* Size of Fortran COMPLEX*32 */ -#define OMPI_SIZEOF_FORTRAN_COMPLEX32 4 - -/* Size of Fortran COMPLEX*4 */ -#define OMPI_SIZEOF_FORTRAN_COMPLEX4 4 - -/* Size of Fortran COMPLEX*8 */ -#define OMPI_SIZEOF_FORTRAN_COMPLEX8 8 - -/* Size of Fortran DOUBLE COMPLEX */ -#define OMPI_SIZEOF_FORTRAN_DOUBLE_COMPLEX 16 - -/* Size of Fortran DOUBLE PRECISION */ -#define OMPI_SIZEOF_FORTRAN_DOUBLE_PRECISION 8 - -/* Size of Fortran INTEGER */ -#define OMPI_SIZEOF_FORTRAN_INTEGER 4 - -/* Size of Fortran INTEGER*1 */ -#define OMPI_SIZEOF_FORTRAN_INTEGER1 1 - -/* Size of Fortran INTEGER*16 */ -#define OMPI_SIZEOF_FORTRAN_INTEGER16 16 - -/* Size of Fortran INTEGER*2 */ -#define OMPI_SIZEOF_FORTRAN_INTEGER2 2 - -/* Size of Fortran INTEGER*4 */ -#define OMPI_SIZEOF_FORTRAN_INTEGER4 4 - -/* Size of Fortran INTEGER*8 */ -#define OMPI_SIZEOF_FORTRAN_INTEGER8 8 - -/* Size of Fortran LOGICAL */ -#define OMPI_SIZEOF_FORTRAN_LOGICAL 4 - -/* Size of Fortran LOGICAL*1 */ -#define OMPI_SIZEOF_FORTRAN_LOGICAL1 1 - -/* Size of Fortran LOGICAL*2 */ -#define OMPI_SIZEOF_FORTRAN_LOGICAL2 2 - -/* Size of Fortran LOGICAL*4 */ -#define OMPI_SIZEOF_FORTRAN_LOGICAL4 4 - -/* Size of Fortran LOGICAL*8 */ -#define OMPI_SIZEOF_FORTRAN_LOGICAL8 8 - -/* Size of Fortran REAL */ -#define 
OMPI_SIZEOF_FORTRAN_REAL 4 - -/* Size of Fortran REAL*16 */ -#define OMPI_SIZEOF_FORTRAN_REAL16 4 - -/* Size of Fortran REAL*2 */ -#define OMPI_SIZEOF_FORTRAN_REAL2 4 - -/* Size of Fortran REAL*4 */ -#define OMPI_SIZEOF_FORTRAN_REAL4 4 - -/* Size of Fortran REAL*8 */ -#define OMPI_SIZEOF_FORTRAN_REAL8 8 - -/* Tarball filename version string of Open MPI */ -#define OMPI_TARBALL_VERSION "gitclone" - -/* Complete release number of Open MPI */ -#define OMPI_VERSION "0" - -/* do we want java mpi bindings */ -#define OMPI_WANT_JAVA_BINDINGS 0 - -/* do we want to try to work around C++ bindings SEEK_* issue? */ -#define OMPI_WANT_MPI_CXX_SEEK 1 - -/* Enable warnings when using deprecated MPI functions */ -#define OMPI_WANT_MPI_INTERFACE_WARNING 1 - -/* if the peruse interface should be enabled */ -#define OMPI_WANT_PERUSE 0 - -/* Alignment of type _Bool */ -#define OPAL_ALIGNMENT_BOOL 1 - -/* Alignment of type char */ -#define OPAL_ALIGNMENT_CHAR 1 - -/* Alignment of type bool */ -#define OPAL_ALIGNMENT_CXX_BOOL 1 - -/* Alignment of type double */ -#define OPAL_ALIGNMENT_DOUBLE 8 - -/* Alignment of type double _Complex */ -#define OPAL_ALIGNMENT_DOUBLE_COMPLEX 8 - -/* Alignment of type float */ -#define OPAL_ALIGNMENT_FLOAT 4 - -/* Alignment of type float _Complex */ -#define OPAL_ALIGNMENT_FLOAT_COMPLEX 4 - -/* Alignment of type int */ -#define OPAL_ALIGNMENT_INT 4 - -/* Alignment of type int128_t */ -/* #undef OPAL_ALIGNMENT_INT128 */ - -/* Alignment of type int16_t */ -#define OPAL_ALIGNMENT_INT16 2 - -/* Alignment of type int32_t */ -#define OPAL_ALIGNMENT_INT32 4 - -/* Alignment of type int64_t */ -#define OPAL_ALIGNMENT_INT64 8 - -/* Alignment of type int8_t */ -#define OPAL_ALIGNMENT_INT8 1 - -/* Alignment of type long */ -#define OPAL_ALIGNMENT_LONG 8 - -/* Alignment of type long double */ -#define OPAL_ALIGNMENT_LONG_DOUBLE 16 - -/* Alignment of type long double _Complex */ -#define OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX 16 - -/* Alignment of type long long */ 
-#define OPAL_ALIGNMENT_LONG_LONG 8 - -/* Alignment of type short */ -#define OPAL_ALIGNMENT_SHORT 2 - -/* Alignment of type size_t */ -#define OPAL_ALIGNMENT_SIZE_T 8 - -/* Alignment of type void * */ -#define OPAL_ALIGNMENT_VOID_P 8 - -/* Alignment of type wchar_t */ -#define OPAL_ALIGNMENT_WCHAR 4 - -/* Alignment of type __float128 */ -#define OPAL_ALIGNMENT___FLOAT128 16 - -/* set to 1 if word-size integers must be aligned to word-size padding to - prevent bus errors */ -#define OPAL_ALIGN_WORD_SIZE_INTEGERS 0 - -/* OMPI architecture string */ -#define OPAL_ARCH "x86_64-unknown-linux-gnu" - -/* Assembly align directive expects logarithmic value */ -#define OPAL_ASM_ALIGN_LOG - -/* What ARM assembly version to use */ -/* #undef OPAL_ASM_ARM_VERSION */ - -/* Assembly directive for exporting symbols */ -#define OPAL_ASM_GLOBAL ".globl" - -/* Assembly prefix for gsym labels */ -#define OPAL_ASM_GSYM "" - -/* Assembly suffix for labels */ -#define OPAL_ASM_LABEL_SUFFIX ":" - -/* Assembly prefix for lsym labels */ -#define OPAL_ASM_LSYM ".L" - -/* Do we need to give a .size directive */ -#define OPAL_ASM_SIZE "1" - -/* Whether we can do 64bit assembly operations or not. 
Should not be used - outside of the assembly header files */ -#define OPAL_ASM_SUPPORT_64BIT 1 - -/* Assembly directive for setting text section */ -#define OPAL_ASM_TEXT ".text" - -/* How to set function type in .type directive */ -#define OPAL_ASM_TYPE "@" - -/* Architecture type of assembly to use for atomic operations and CMA */ -#define OPAL_ASSEMBLY_ARCH OPAL_AMD64 - -/* Whether to use builtin atomics */ -#define OPAL_ASSEMBLY_BUILTIN OPAL_BUILTIN_NO - -/* Format of assembly file */ -#define OPAL_ASSEMBLY_FORMAT "default-.text-.globl-:--.L-@-1-0-1-1-1" - -/* Whether we have support for RDTSCP instruction */ -#define OPAL_ASSEMBLY_SUPPORTS_RDTSCP 0 - -/* Enable flow control for Portals4 BTL */ -#define OPAL_BTL_PORTALS4_FLOW_CONTROL 0 - -/* If CMA support can be enabled */ -#define OPAL_BTL_SM_HAVE_CMA 0 - -/* If knem support can be enabled */ -#define OPAL_BTL_SM_HAVE_KNEM 0 - -/* Path by which to include fi_ext_usnic.h */ -/* #undef OPAL_BTL_USNIC_FI_EXT_USNIC_H */ - -/* define to 1 if usnic BTL unit tests are enabled, 0 otherwise */ -#define OPAL_BTL_USNIC_UNIT_TESTS 0 - -/* If CMA support can be enabled within vader */ -#define OPAL_BTL_VADER_HAVE_CMA 0 - -/* If KNEM support can be enabled within vader */ -#define OPAL_BTL_VADER_HAVE_KNEM 0 - -/* If XPMEM support can be enabled within vader */ -#define OPAL_BTL_VADER_HAVE_XPMEM 0 - -/* The compiler $lower which OMPI was built with */ -#define OPAL_BUILD_PLATFORM_COMPILER_FAMILYID 1 - -/* The compiler $lower which OMPI was built with */ -#define OPAL_BUILD_PLATFORM_COMPILER_FAMILYNAME GNU - -/* The compiler $lower which OMPI was built with */ -#define OPAL_BUILD_PLATFORM_COMPILER_VERSION 263175 - -/* The compiler $lower which OMPI was built with */ -#define OPAL_BUILD_PLATFORM_COMPILER_VERSION_STR 4.4.7 - -/* OMPI underlying C compiler */ -#define OPAL_CC "gcc" - -/* Use static const char[] strings for C files */ -#define OPAL_CC_USE_CONST_CHAR_IDENT 0 - -/* Use #ident strings for C files */ -#define 
OPAL_CC_USE_IDENT 1 - -/* Use #pragma comment for C files */ -#define OPAL_CC_USE_PRAGMA_COMMENT - -/* Use #pragma ident strings for C files */ -#define OPAL_CC_USE_PRAGMA_IDENT 0 - -/* Need CMA syscalls defined */ -/* #undef OPAL_CMA_NEED_SYSCALL_DEFS */ - -/* Whether we have CUDA GDR support available */ -#define OPAL_CUDA_GDR_SUPPORT 1 - -/* Whether we have CUDA cuPointerGetAttributes function available */ -#define OPAL_CUDA_GET_ATTRIBUTES 1 - -/* Whether we want cuda device pointer support */ -#define OPAL_CUDA_SUPPORT 1 - -/* Whether we have CUDA 4.1 support available */ -#define OPAL_CUDA_SUPPORT_41 1 - -/* Whether we have CUDA CU_POINTER_ATTRIBUTE_SYNC_MEMOPS support available */ -#define OPAL_CUDA_SYNC_MEMOPS 1 - -/* OPAL underlying C++ compiler */ -#define OPAL_CXX "g++" - -/* Use static const char[] strings for C++ files */ -/* #undef OPAL_CXX_USE_CONST_CHAR_IDENT */ - -/* Use #ident strings for C++ files */ -/* #undef OPAL_CXX_USE_IDENT */ - -/* Use #pragma comment for C++ files */ -/* #undef OPAL_CXX_USE_PRAGMA_COMMENT */ - -/* Use #pragma ident strings for C++ files */ -/* #undef OPAL_CXX_USE_PRAGMA_IDENT */ - -/* Whether C compiler supports DEC style inline assembly */ -#define OPAL_C_DEC_INLINE_ASSEMBLY 0 - -/* Whether C compiler supports GCC style inline assembly */ -#define OPAL_C_GCC_INLINE_ASSEMBLY 1 - -/* Whether C compiler supports __builtin_clz */ -#define OPAL_C_HAVE_BUILTIN_CLZ 1 - -/* Whether C compiler supports __builtin_expect */ -#define OPAL_C_HAVE_BUILTIN_EXPECT 1 - -/* Whether C compiler supports __builtin_prefetch */ -#define OPAL_C_HAVE_BUILTIN_PREFETCH 1 - -/* Whether C compiler supports symbol visibility or not */ -#define OPAL_C_HAVE_VISIBILITY 1 - -/* Whether C compiler supports XLC style inline assembly */ -#define OPAL_C_XLC_INLINE_ASSEMBLY 0 - -/* Whether we have lt_dladvise or not */ -#define OPAL_DL_LIBLTDL_HAVE_LT_DLADVISE 0 - -/* Whether we want checkpoint/restart enabled debugging functionality or not - */ -#define 
OPAL_ENABLE_CRDEBUG 0 - -/* Whether we want developer-level debugging code or not */ -#define OPAL_ENABLE_DEBUG 1 - -/* Enable features required for dynamic SL support */ -#define OPAL_ENABLE_DYNAMIC_SL 0 - -/* Enable fault tolerance general components and logic */ -#define OPAL_ENABLE_FT 0 - -/* Enable fault tolerance checkpoint/restart components and logic */ -#define OPAL_ENABLE_FT_CR 0 - -/* Enable fault tolerance thread in Open PAL */ -#define OPAL_ENABLE_FT_THREAD 0 - -/* Disable getpwuid support (default: enabled) */ -#define OPAL_ENABLE_GETPWUID 1 - -/* Enable features required for heterogeneous support */ -#define OPAL_ENABLE_HETEROGENEOUS_SUPPORT 0 - -/* Enable IPv6 support, but only if the underlying system supports it */ -#define OPAL_ENABLE_IPV6 0 - -/* Whether we want the memory profiling or not */ -#define OPAL_ENABLE_MEM_DEBUG 1 - -/* Whether we want the memory profiling or not */ -#define OPAL_ENABLE_MEM_PROFILE 1 - -/* Whether we should enable thread support within the OPAL code base */ -#define OPAL_ENABLE_MULTI_THREADS 1 - -/* Whether we want BTL progress threads enabled */ -#define OPAL_ENABLE_PROGRESS_THREADS 0 - -/* Whether user wants PTY support or not */ -#define OPAL_ENABLE_PTY_SUPPORT 1 - -/* Whether we want developer-level timing framework or not */ -#define OPAL_ENABLE_TIMING 0 - -/* Greek - alpha, beta, etc - release number of Open Portable Access Layer */ -#define OPAL_GREEK_VERSION "a1" - -/* Whether there is an atomic assembly file available */ -#define OPAL_HAVE_ASM_FILE 1 - -/* Whether your compiler has __attribute__ or not */ -#define OPAL_HAVE_ATTRIBUTE 1 - -/* Whether your compiler has __attribute__ aligned or not */ -#define OPAL_HAVE_ATTRIBUTE_ALIGNED 1 - -/* Whether your compiler has __attribute__ always_inline or not */ -#define OPAL_HAVE_ATTRIBUTE_ALWAYS_INLINE 1 - -/* Whether your compiler has __attribute__ cold or not */ -#define OPAL_HAVE_ATTRIBUTE_COLD 1 - -/* Whether your compiler has __attribute__ const or not */ 
-#define OPAL_HAVE_ATTRIBUTE_CONST 1 - -/* Whether your compiler has __attribute__ deprecated or not */ -#define OPAL_HAVE_ATTRIBUTE_DEPRECATED 1 - -/* Whether your compiler has __attribute__ deprecated with optional argument - */ -#define OPAL_HAVE_ATTRIBUTE_DEPRECATED_ARGUMENT 0 - -/* Whether your compiler has __attribute__ destructor or not */ -#define OPAL_HAVE_ATTRIBUTE_DESTRUCTOR 1 - -/* Whether your compiler has __attribute__ format or not */ -#define OPAL_HAVE_ATTRIBUTE_FORMAT 1 - -/* Whether your compiler has __attribute__ format and it works on function - pointers */ -#define OPAL_HAVE_ATTRIBUTE_FORMAT_FUNCPTR 1 - -/* Whether your compiler has __attribute__ hot or not */ -#define OPAL_HAVE_ATTRIBUTE_HOT 1 - -/* Whether your compiler has __attribute__ malloc or not */ -#define OPAL_HAVE_ATTRIBUTE_MALLOC 1 - -/* Whether your compiler has __attribute__ may_alias or not */ -#define OPAL_HAVE_ATTRIBUTE_MAY_ALIAS 1 - -/* Whether your compiler has __attribute__ noinline or not */ -#define OPAL_HAVE_ATTRIBUTE_NOINLINE 1 - -/* Whether your compiler has __attribute__ nonnull or not */ -#define OPAL_HAVE_ATTRIBUTE_NONNULL 1 - -/* Whether your compiler has __attribute__ noreturn or not */ -#define OPAL_HAVE_ATTRIBUTE_NORETURN 1 - -/* Whether your compiler has __attribute__ noreturn and it works on function - pointers */ -#define OPAL_HAVE_ATTRIBUTE_NORETURN_FUNCPTR 1 - -/* Whether your compiler has __attribute__ no_instrument_function or not */ -#define OPAL_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION 1 - -/* Whether your compiler has __attribute__ packed or not */ -#define OPAL_HAVE_ATTRIBUTE_PACKED 1 - -/* Whether your compiler has __attribute__ pure or not */ -#define OPAL_HAVE_ATTRIBUTE_PURE 1 - -/* Whether your compiler has __attribute__ sentinel or not */ -#define OPAL_HAVE_ATTRIBUTE_SENTINEL 1 - -/* Whether your compiler has __attribute__ unused or not */ -#define OPAL_HAVE_ATTRIBUTE_UNUSED 1 - -/* Whether your compiler has __attribute__ visibility or not */ 
-#define OPAL_HAVE_ATTRIBUTE_VISIBILITY 1 - -/* Whether your compiler has __attribute__ warn unused result or not */ -#define OPAL_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 1 - -/* Whether your compiler has __attribute__ weak alias or not */ -#define OPAL_HAVE_ATTRIBUTE_WEAK_ALIAS 1 - -/* whether backtrace_execinfo is found and available */ -#define OPAL_HAVE_BACKTRACE_EXECINFO 1 - -/* whether qsort is broken or not */ -#define OPAL_HAVE_BROKEN_QSORT 0 - -/* whether ceil is found and available */ -#define OPAL_HAVE_CEIL 1 - -/* whether clock_gettime is found and available */ -#define OPAL_HAVE_CLOCK_GETTIME 1 - -/* Whether the processor supports the cmpxchg16b instruction */ -#define OPAL_HAVE_CMPXCHG16B 1 - -/* Enable features required for ConnectX XRC support */ -#define OPAL_HAVE_CONNECTX_XRC 0 - -/* Enable features required for XRC domains support */ -#define OPAL_HAVE_CONNECTX_XRC_DOMAINS 0 - -/* whether crs_blcr is found and available */ -/* #undef OPAL_HAVE_CRS_BLCR */ - -/* whether dirname is found and available */ -#define OPAL_HAVE_DIRNAME 1 - -/* Whether the OPAL DL framework is functional or not */ -#define OPAL_HAVE_DL_SUPPORT 1 - -/* whether fbtl_posix is found and available */ -#define OPAL_HAVE_FBTL_POSIX 1 - -/* whether gethostbyname is found and available */ -#define OPAL_HAVE_GETHOSTBYNAME 1 - -/* Whether we have hwloc support or not */ -#define OPAL_HAVE_HWLOC 1 - -/* do we have Java support */ -#define OPAL_HAVE_JAVA_SUPPORT 1 - -/* Do not use outside of mpi.h. Define to 1 if the system has the type `long - long'. 
*/ -#define OPAL_HAVE_LONG_LONG 1 - -/* whether openpty is found and available */ -#define OPAL_HAVE_OPENPTY 1 - -/* If PTHREADS implementation supports PTHREAD_MUTEX_ERRORCHECK */ -#define OPAL_HAVE_PTHREAD_MUTEX_ERRORCHECK 1 - -/* If PTHREADS implementation supports PTHREAD_MUTEX_ERRORCHECK_NP */ -#define OPAL_HAVE_PTHREAD_MUTEX_ERRORCHECK_NP 1 - -/* Whether RDMA CM is available or not */ -/* #undef OPAL_HAVE_RDMACM */ - -/* Enable RDMAoE support */ -/* #undef OPAL_HAVE_RDMAOE */ - -/* Whether we have SA_RESTART in or not */ -#define OPAL_HAVE_SA_RESTART 1 - -/* whether sched_yield is found and available */ -#define OPAL_HAVE_SCHED_YIELD 1 - -/* whether shmem_posix is found and available */ -#define OPAL_HAVE_SHMEM_POSIX 1 - -/* whether socket is found and available */ -#define OPAL_HAVE_SOCKET 1 - -/* Whether or not we have solaris */ -#define OPAL_HAVE_SOLARIS 0 - -/* Whether the __sync builtin atomic compare and swap supports 128-bit values - */ -/* #undef OPAL_HAVE_SYNC_BUILTIN_CSWAP_INT128 */ - -/* Do not use outside of mpi.h. Define to 1 if you have the - header file. */ -/* #undef OPAL_HAVE_SYS_SYNCH_H */ - -/* Do not use outside of mpi.h. Define to 1 if you have the - header file. 
*/ -#define OPAL_HAVE_SYS_TIME_H 1 - -/* Whether UD CM is available or not */ -/* #undef OPAL_HAVE_UDCM */ - -/* Whether we have __va_copy or not */ -#define OPAL_HAVE_UNDERSCORE_VA_COPY 1 - -/* Whether we have va_copy or not */ -#define OPAL_HAVE_VA_COPY 1 - -/* Whether we have weak symbols or not */ -#define OPAL_HAVE_WEAK_SYMBOLS 1 - -/* Whether our event component has working event operations or not (if not, - then assumedly it only has working timers and signals) */ -#define OPAL_HAVE_WORKING_EVENTOPS 1 - -/* whether yp_all_nsl is found and available */ -#define OPAL_HAVE_YP_ALL_NSL 1 - -/* Define to 1 ifyou have the declaration of _SC_NPROCESSORS_ONLN, and to 0 - otherwise */ -#define OPAL_HAVE__SC_NPROCESSORS_ONLN 1 - -/* Number of arguments to ibv_create_cq */ -/* #undef OPAL_IBV_CREATE_CQ_ARGS */ - -/* ident string for Open MPI */ -#define OPAL_IDENT_STRING "1.9.0a1" - -/* Major release number of Open Portable Access Layer */ -#define OPAL_MAJOR_VERSION 1 - -/* Maximum length of datarep strings (default is 128) */ -#define OPAL_MAX_DATAREP_STRING 128 - -/* Maximum length of error strings (default is 256) */ -#define OPAL_MAX_ERROR_STRING 256 - -/* Maximum length of info keys (default is 36) */ -#define OPAL_MAX_INFO_KEY 36 - -/* Maximum length of info vals (default is 256) */ -#define OPAL_MAX_INFO_VAL 256 - -/* Maximum length of object names (default is 64) */ -#define OPAL_MAX_OBJECT_NAME 64 - -/* Maximum length of port names (default is 1024) */ -#define OPAL_MAX_PORT_NAME 1024 - -/* Maximum length of processor names (default is 256) */ -#define OPAL_MAX_PROCESSOR_NAME 256 - -/* MCA cmd line identifier */ -#define OPAL_MCA_CMD_LINE_ID "mca" - -/* MCA prefix string for envars */ -#define OPAL_MCA_PREFIX "OMPI_MCA_" - -/* Whether any opal memory mca components were found */ -#define OPAL_MEMORY_HAVE_COMPONENT 1 - -/* Minor release number of Open Portable Access Layer */ -#define OPAL_MINOR_VERSION 9 - -/* Whether the C compiler supports "bool" without any 
other help (such as - ) */ -#define OPAL_NEED_C_BOOL 1 - -/* Add padding bytes to the openib BTL control header */ -#define OPAL_OPENIB_PAD_HDR 0 - -/* package/branding string for Open MPI */ -#define OPAL_PACKAGE_STRING "Open MPI wwu12@bunsen.icl.utk.edu Distribution" - -/* Log base 2 of the maximum size in bytes of a memory descriptor. Set to 0 if - MD can bind all of memory. */ -#define OPAL_PORTALS4_MAX_MD_SIZE 0 - -/* Log base 2 of the maximum size in bytes of the user virtual address space. - Set to 0 if MD can bind all of memory. */ -#define OPAL_PORTALS4_MAX_VA_SIZE 0 - -/* Whether r notation is used for ppc registers */ -/* #undef OPAL_POWERPC_R_REGISTERS */ - -/* type to use for ptrdiff_t */ -#define OPAL_PTRDIFF_TYPE ptrdiff_t - -/* Release date of Open Portable Access Layer */ -#define OPAL_RELEASE_DATE "Unreleased developer copy" - -/* Release release number of Open Portable Access Layer */ -#define OPAL_RELEASE_VERSION 0 - -/* The repository version Open Portable Access Layer */ -#define OPAL_REPO_REV "dev-1510-g40fe521" - -/* Whether we have shared memory support for mmap or not */ -#define OPAL_SHMEM_MMAP 1 - -/* Whether we have shared memory support for POSIX or not */ -#define OPAL_SHMEM_POSIX 1 - -/* Whether we have shared memory support for SYSV or not */ -#define OPAL_SHMEM_SYSV 1 - -/* Do not use outside of mpi.h. Define to 1 if you have the ANSI C header - files. 
*/ -#define OPAL_STDC_HEADERS 1 - -/* Tarball filename version string of Open Portable Access Layer */ -#define OPAL_TARBALL_VERSION "gitclone" - -/* Whether to use or not */ -#define OPAL_USE_STDBOOL_H 1 - -/* Complete release number of Open Portable Access Layer */ -#define OPAL_VERSION "0" - -/* Enable per-user config files */ -#define OPAL_WANT_HOME_CONFIG_FILES 1 - -/* if the memory and buffer checking should be enabled */ -#define OPAL_WANT_MEMCHECKER 0 - -/* if want pretty-print stack trace feature */ -#define OPAL_WANT_PRETTY_PRINT_STACKTRACE 1 - -/* whether we want to have smp locks in atomic ops or not */ -#define OPAL_WANT_SMP_LOCKS 1 - -/* Specific ps command to use in orte-clean */ -#define ORTE_CLEAN_PS_CMD "ps -A -o fname,pid,user" - -/* Whether we want static ports enabled */ -#define ORTE_ENABLE_STATIC_PORTS 1 - -/* Greek - alpha, beta, etc - release number of Open MPI Run-Time Environment - */ -#define ORTE_GREEK_VERSION "a1" - -/* Major release number of Open MPI Run-Time Environment */ -#define ORTE_MAJOR_VERSION 1 - -/* Minor release number of Open MPI Run-Time Environment */ -#define ORTE_MINOR_VERSION 9 - -/* Release date of Open MPI Run-Time Environment */ -#define ORTE_RELEASE_DATE "Unreleased developer copy" - -/* Release release number of Open MPI Run-Time Environment */ -#define ORTE_RELEASE_VERSION 0 - -/* The repository version Open MPI Run-Time Environment */ -#define ORTE_REPO_REV "dev-1510-g40fe521" - -/* Tarball filename version string of Open MPI Run-Time Environment */ -#define ORTE_TARBALL_VERSION "gitclone" - -/* Complete release number of Open MPI Run-Time Environment */ -#define ORTE_VERSION "0" - -/* Whether we want orterun to effect "--prefix $prefix" by default */ -#define ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT 0 - -/* Greek - alpha, beta, etc - release number of Open SHMEM */ -#define OSHMEM_GREEK_VERSION "a1" - -/* mxm support is available */ -/* #undef OSHMEM_HAS_ATOMIC_MXM */ - -/* Major release number of Open SHMEM */ 
-#define OSHMEM_MAJOR_VERSION 1 - -/* Minor release number of Open SHMEM */ -#define OSHMEM_MINOR_VERSION 9 - -/* Whether we want to check OSHMEM parameters always or never */ -#define OSHMEM_PARAM_CHECK 1 - -/* Release date of Open SHMEM */ -#define OSHMEM_RELEASE_DATE "Unreleased developer copy" - -/* Release release number of Open SHMEM */ -#define OSHMEM_RELEASE_VERSION 0 - -/* The repository version Open SHMEM */ -#define OSHMEM_REPO_REV "dev-1510-g40fe521" - -/* Whether user wants OSHMEM in compatibility mode or not */ -#define OSHMEM_SPEC_COMPAT 1 - -/* Whether we have shared memory support for mmap or not */ -#define OSHMEM_SSHMEM_MMAP 1 - -/* Whether we have shared memory support for SYSV or not */ -#define OSHMEM_SSHMEM_SYSV 1 - -/* Whether we have shared memory support for verbs or not */ -#define OSHMEM_SSHMEM_VERBS 0 - -/* Tarball filename version string of Open SHMEM */ -#define OSHMEM_TARBALL_VERSION "gitclone" - -/* Complete release number of Open SHMEM */ -#define OSHMEM_VERSION "0" - -/* do we want java oshmem bindings */ -#define OSHMEM_WANT_JAVA_BINDINGS 0 - -/* Define to the address where bug reports for this package should be sent. */ -#define PACKAGE_BUGREPORT "http://www.open-mpi.org/community/help/" - -/* Define to the full name of this package. */ -#define PACKAGE_NAME "Open MPI" - -/* Define to the full name and version of this package. */ -#define PACKAGE_STRING "Open MPI gitclone" - -/* Define to the one symbol short name of this package. */ -#define PACKAGE_TARNAME "openmpi" - -/* Define to the home page for this package. */ -#define PACKAGE_URL "" - -/* Define to the version of this package. */ -#define PACKAGE_VERSION "gitclone" - -/* Define PT_LOCK_SPIN to 1 if available. */ -/* #undef PT_LOCK_SPIN */ - -/* The size of `bool', as computed by sizeof. */ -#define SIZEOF_BOOL 1 - -/* The size of `char', as computed by sizeof. */ -#define SIZEOF_CHAR 1 - -/* The size of `double', as computed by sizeof. 
*/ -#define SIZEOF_DOUBLE 8 - -/* The size of `double _Complex', as computed by sizeof. */ -#define SIZEOF_DOUBLE__COMPLEX 16 - -/* The size of `float', as computed by sizeof. */ -#define SIZEOF_FLOAT 4 - -/* The size of `float _Complex', as computed by sizeof. */ -#define SIZEOF_FLOAT__COMPLEX 8 - -/* The size of `int', as computed by sizeof. */ -#define SIZEOF_INT 4 - -/* The size of `long', as computed by sizeof. */ -#define SIZEOF_LONG 8 - -/* The size of `long double', as computed by sizeof. */ -#define SIZEOF_LONG_DOUBLE 16 - -/* The size of `long double _Complex', as computed by sizeof. */ -#define SIZEOF_LONG_DOUBLE__COMPLEX 32 - -/* The size of `long long', as computed by sizeof. */ -#define SIZEOF_LONG_LONG 8 - -/* The size of `pid_t', as computed by sizeof. */ -#define SIZEOF_PID_T 4 - -/* The size of `ptrdiff_t', as computed by sizeof. */ -#define SIZEOF_PTRDIFF_T 8 - -/* The size of `short', as computed by sizeof. */ -#define SIZEOF_SHORT 2 - -/* The size of `size_t', as computed by sizeof. */ -#define SIZEOF_SIZE_T 8 - -/* The size of `ssize_t', as computed by sizeof. */ -#define SIZEOF_SSIZE_T 8 - -/* The size of `unsigned int', as computed by sizeof. */ -#define SIZEOF_UNSIGNED_INT 4 - -/* The size of `unsigned long', as computed by sizeof. */ -#define SIZEOF_UNSIGNED_LONG 8 - -/* The size of `void *', as computed by sizeof. */ -#define SIZEOF_VOID_P 8 - -/* The size of `wchar_t', as computed by sizeof. */ -#define SIZEOF_WCHAR_T 4 - -/* The size of `_Bool', as computed by sizeof. */ -#define SIZEOF__BOOL 1 - -/* The size of `__float128', as computed by sizeof. */ -#define SIZEOF___FLOAT128 16 - -/* Define to 1 if you have the ANSI C header files. */ -#define STDC_HEADERS 1 - -/* Enable extensions on HP-UX. */ -#ifndef _HPUX_SOURCE -# define _HPUX_SOURCE 1 -#endif - - -/* Whether to use the legacy Solaris munmap prototype or not */ -/* #undef USE_SOLARIS_LEGACY_MUNMAP_PROTOTYPE */ - -/* Enable extensions on AIX 3, Interix. 
*/ -#ifndef _ALL_SOURCE -# define _ALL_SOURCE 1 -#endif -/* Enable GNU extensions on systems that have them. */ -#ifndef _GNU_SOURCE -# define _GNU_SOURCE 1 -#endif -/* Enable threading extensions on Solaris. */ -#ifndef _POSIX_PTHREAD_SEMANTICS -# define _POSIX_PTHREAD_SEMANTICS 1 -#endif -/* Enable extensions on HP NonStop. */ -#ifndef _TANDEM_SOURCE -# define _TANDEM_SOURCE 1 -#endif -/* Enable general extensions on Solaris. */ -#ifndef __EXTENSIONS__ -# define __EXTENSIONS__ 1 -#endif - - -/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most - significant byte first (like Motorola and SPARC, unlike Intel). */ -#if defined AC_APPLE_UNIVERSAL_BUILD -# if defined __BIG_ENDIAN__ -# define WORDS_BIGENDIAN 1 -# endif -#else -# ifndef WORDS_BIGENDIAN -/* # undef WORDS_BIGENDIAN */ -# endif -#endif - -/* Additional CFLAGS to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_CFLAGS "-pthread " - -/* Additional CFLAGS_PREFIX to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_CFLAGS_PREFIX "" - -/* Additional CXXFLAGS to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_CXXFLAGS "-pthread " - -/* Additional CXXFLAGS_PREFIX to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_CXXFLAGS_PREFIX "" - -/* Additional FCFLAGS to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_FCFLAGS "-pthread -I${libdir}" - -/* Additional FCFLAGS to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_FCFLAGS_PREFIX "" - -/* Additional LDFLAGS to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_LDFLAGS " -Wl,-rpath -Wl,@{libdir} -Wl,--enable-new-dtags" - -/* Additional LIBS to pass through the wrapper compilers */ -#define WRAPPER_EXTRA_LIBS "-lm -ldl -lutil -lrt " - -/* Whether the wrapper compilers add rpath flags by default */ -#define WRAPPER_RPATH_SUPPORT "runpath" - -/* Define to 1 if the X Window System is missing or not being used. 
*/ -/* #undef X_DISPLAY_MISSING */ - -/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a - `char[]'. */ -#define YYTEXT_POINTER 1 - -/* Enable GNU extensions on systems that have them. */ -#ifndef _GNU_SOURCE -# define _GNU_SOURCE 1 -#endif - -/* Are we building for HP-UX? */ -#define _HPUX_SOURCE 1 - -/* Define to 1 if on MINIX. */ -/* #undef _MINIX */ - -/* Define to 2 if the system does not provide POSIX.1 features except with - this defined. */ -/* #undef _POSIX_1_SOURCE */ - -/* Define to 1 if you need to in order for `stat' and other things to work. */ -/* #undef _POSIX_SOURCE */ - -/* Define this to the process ID type */ -#define hwloc_pid_t pid_t - -/* Define this to the thread ID type */ -#define hwloc_thread_t pthread_t - -/* Define to `__inline__' or `__inline' if that's what the C compiler - calls it, or to nothing if 'inline' is not supported under any name. */ -#ifndef __cplusplus -#define inline __inline__ -#endif - -/* A bogus type that allows us to have sentinel type values that are still - valid */ -#define ompi_fortran_bogus_type_t int - -/* C type corresponding to Fortran CHARACTER */ -#define ompi_fortran_character_t char - -/* C type corresponding to Fortran COMPLEX*16 */ -/* #undef ompi_fortran_complex16_t */ - -/* C type corresponding to Fortran COMPLEX*32 */ -/* #undef ompi_fortran_complex32_t */ - -/* C type corresponding to Fortran COMPLEX*4 */ -/* #undef ompi_fortran_complex4_t */ - -/* C type corresponding to Fortran COMPLEX*8 */ -/* #undef ompi_fortran_complex8_t */ - -/* C type corresponding to Fortran COMPLEX */ -/* #undef ompi_fortran_complex_t */ - -/* C type corresponding to Fortran DOUBLE COMPLEX */ -/* #undef ompi_fortran_double_complex_t */ - -/* C type corresponding to Fortran DOUBLE PRECISION */ -#define ompi_fortran_double_precision_t double - -/* C type corresponding to Fortran INTEGER*16 */ -#define ompi_fortran_integer16_t - -/* C type corresponding to Fortran INTEGER*1 */ -#define 
ompi_fortran_integer1_t char - -/* C type corresponding to Fortran INTEGER*2 */ -#define ompi_fortran_integer2_t short - -/* C type corresponding to Fortran INTEGER*4 */ -#define ompi_fortran_integer4_t int - -/* C type corresponding to Fortran INTEGER*8 */ -#define ompi_fortran_integer8_t long long - -/* C type corresponding to Fortran INTEGER */ -#define ompi_fortran_integer_t int - -/* C type corresponding to Fortran LOGICAL*1 */ -#define ompi_fortran_logical1_t char - -/* C type corresponding to Fortran LOGICAL*2 */ -#define ompi_fortran_logical2_t short - -/* C type corresponding to Fortran LOGICAL*4 */ -#define ompi_fortran_logical4_t int - -/* C type corresponding to Fortran LOGICAL*8 */ -#define ompi_fortran_logical8_t long long - -/* C type corresponding to Fortran LOGICAL */ -#define ompi_fortran_logical_t int - -/* C type corresponding to Fortran REAL*16 */ -#define ompi_fortran_real16_t ompi_fortran_bogus_type_t - -/* C type corresponding to Fortran REAL*2 */ -#define ompi_fortran_real2_t ompi_fortran_bogus_type_t - -/* C type corresponding to Fortran REAL*4 */ -#define ompi_fortran_real4_t float - -/* C type corresponding to Fortran REAL*8 */ -#define ompi_fortran_real8_t double - -/* C type corresponding to Fortran REAL */ -#define ompi_fortran_real_t float - -/* Define to the equivalent of the C99 'restrict' keyword, or to - nothing if this is not supported. Do not define if restrict is - supported directly. */ -#define restrict __restrict -/* Work around a bug in Sun C++: it does not support _Restrict or - __restrict__, even though the corresponding Sun C compiler ends up with - "#define restrict _Restrict" or "#define restrict __restrict__" in the - previous line. Perhaps some future version of Sun C++ will work with - restrict; if so, hopefully it defines __RESTRICT like Sun C does. 
*/ -#if defined __SUNPRO_CC && !defined __RESTRICT -# define _Restrict -# define __restrict__ -#endif - - -//#include "opal_config_bottom.h" -#endif /* OPAL_CONFIG_H */ - From d9ca4ae78f0a9bda0945f6672f1f5b74f06aa4d7 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 18 Jun 2015 19:48:40 -0400 Subject: [PATCH 104/190] Add the capability to install the generated library and other minor cleanups. --- opal/datatype/cuda/Makefile.in | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/opal/datatype/cuda/Makefile.in b/opal/datatype/cuda/Makefile.in index 519de6100ae..f00ca4e030c 100644 --- a/opal/datatype/cuda/Makefile.in +++ b/opal/datatype/cuda/Makefile.in @@ -3,19 +3,20 @@ AM_CPPFLAGS = @common_cuda_CPPFLAGS@ srcdir = @srcdir@ top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ VPATH = @srcdir@ -NVCC = nvcc -ARCH = ar -ARCHFLAGS = cr -STLIB ?= opal_datatype_cuda.a -DYLIB ?= opal_datatype_cuda.so -EXTLIB = -L$(top_builddir)/opal/datatype/.libs -ldatatype -L$(top_builddir)/opal/.libs -lopen-pal -L/usr/local/cuda/lib -lcuda -subdir = opal/datatype/cuda +NVCC = nvcc +ARCH = @AR@ +ARCHFLAGS = cr +STLIB ?= opal_datatype_cuda.a +DYLIB ?= opal_datatype_cuda.so +EXTLIB = -L$(top_builddir)/opal/datatype/.libs -ldatatype -L$(top_builddir)/opal/.libs -lopen-pal -L/usr/local/cuda/lib -lcuda +subdir = opal/datatype/cuda CC = nvcc -CFLAGS = -gencode arch=compute_35,code=sm_35 --compiler-options '-fPIC @CFLAGS@' -LDFLAGS += -shared --compiler-options '-fPIC @LDFLAGS@' +CFLAGS = -I$(top_builddir)/opal/include -I$(top_srcdir)/opal/include -gencode arch=compute_35,code=sm_35 --compiler-options '-fPIC @CFLAGS@' +LDFLAGS = -shared --compiler-options '-fPIC @LDFLAGS@' SRC := \ opal_datatype_cuda.cu \ @@ -42,16 +43,18 @@ Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status $(STLIB): $(OBJ) $(ARCH) $(ARCHFLAGS) $@ $(OBJ) @RANLIB@ $@ - + $(DYLIB): $(OBJ) $(NVCC) $(LDFLAGS) $(EXTLIB) -o $(DYLIB) $(OBJ) - + %.o: %.cu $(NVCC) 
$(CFLAGS) $(EXTLIB) $(INC) -c $< -o $@ +install: $(DYLIB) + cp -f $(DYLIB) @OMPI_WRAPPER_LIBDIR@/ + clean: - rm -f *.o + rm -f $(OBJ) cleanall: clean - rm -f $(STLIB) - rm -f $(DYLIB) + rm -f $(STLIB) $(DYLIB) From 805938de536616b89f80aee007c843e2bc00843c Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Thu, 18 Jun 2015 19:49:11 -0400 Subject: [PATCH 105/190] Open the datatype CUDA library from a default install location. Various other minor cleanups. --- opal/datatype/opal_datatype_gpu.c | 190 ++++++++++-------------------- 1 file changed, 61 insertions(+), 129 deletions(-) diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c index c136a55ea71..ef7a8f41d27 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -24,6 +24,7 @@ #include #include +#include "opal/mca/installdirs/installdirs.h" #include "opal/datatype/opal_convertor_internal.h" #include "opal/datatype/opal_datatype_internal.h" @@ -37,54 +38,55 @@ #include "opal/datatype/opal_datatype_gpu.h" -static void *opal_datatype_cuda_handle = NULL; +static void *opal_datatype_cuda_handle = NULL; +static char *opal_datatype_cuda_lib = NULL; void (*opal_datatype_cuda_init_p)(void) = NULL; void (*opal_datatype_cuda_fini_p)(void) = NULL; int32_t (*opal_generic_simple_pack_function_cuda_p)( opal_convertor_t* pConvertor, - struct iovec* iov, + struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; int32_t (*opal_generic_simple_unpack_function_cuda_p)( opal_convertor_t* pConvertor, - struct iovec* iov, + struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; - + int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, - struct iovec* iov, + struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; - + int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, - struct iovec* iov, + struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; - + int32_t (*opal_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, - struct iovec* iov, + struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; int32_t (*opal_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, - struct iovec* iov, + struct iovec* iov, uint32_t* out_size, size_t* max_data ) = NULL; - + void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, size_t* SPACE ) = NULL; - + void (*unpack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, size_t* SPACE ) = NULL; - + void (*pack_predefined_data_cuda_p)( 
dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, @@ -99,126 +101,50 @@ void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id) = NULL; void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id) = NULL; +#define OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN(handle, fname) \ + do { \ + char* _error; \ + *(void **)(&(fname ## _p)) = dlsym((handle), # fname); \ + if(NULL != (_error = dlerror()) ) { \ + opal_output(0, "Finding %s error: %s\n", # fname, _error); \ + fname ## _p = NULL; \ + return OPAL_ERROR; \ + } \ + } while (0) + int32_t opal_datatype_gpu_init(void) { - char *error; - char *lib = "/home/wwu12/ompi/ompi-gpu/opal/datatype/cuda/opal_datatype_cuda.so"; - if (opal_datatype_cuda_handle == NULL) { - opal_datatype_cuda_handle = dlopen(lib, RTLD_LAZY); + + /* If the library name was initialized but the load failed, we have another chance to change it */ + if( NULL != opal_datatype_cuda_lib ) + free(opal_datatype_cuda_lib); + asprintf(&opal_datatype_cuda_lib, "%s/%s", opal_install_dirs.libdir, "opal_datatype_cuda.so"); + + opal_datatype_cuda_handle = dlopen(opal_datatype_cuda_lib , RTLD_LAZY); if (!opal_datatype_cuda_handle) { - fprintf(stderr, "%s\n", dlerror()); + opal_output( 0, "Failed to load %s library: error %s\n", opal_datatype_cuda_lib, dlerror()); opal_datatype_cuda_handle = NULL; return OPAL_ERROR; } - - *(void **)(&opal_datatype_cuda_init_p) = dlsym(opal_datatype_cuda_handle, "opal_datatype_cuda_init"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_datatype_cuda_init error: %s\n", error); - opal_datatype_cuda_init_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_datatype_cuda_fini_p) = dlsym(opal_datatype_cuda_handle, "opal_datatype_cuda_fini"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_datatype_cuda_fini error: %s\n", error); - opal_datatype_cuda_fini_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_generic_simple_pack_function_cuda_p) = dlsym(opal_datatype_cuda_handle, 
"opal_generic_simple_pack_function_cuda"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_generic_simple_pack_function_cuda error: %s\n", error); - opal_generic_simple_pack_function_cuda_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_generic_simple_unpack_function_cuda_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_unpack_function_cuda"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_generic_simple_unpack_function_cuda error: %s\n", error); - opal_generic_simple_unpack_function_cuda_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_generic_simple_pack_function_cuda_iov_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_pack_function_cuda_iov"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_generic_simple_pack_function_cuda_iov error: %s\n", error); - opal_generic_simple_pack_function_cuda_iov_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_generic_simple_unpack_function_cuda_iov_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_unpack_function_cuda_iov"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_generic_simple_unpack_function_cuda_iov error: %s\n", error); - opal_generic_simple_unpack_function_cuda_iov_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_generic_simple_pack_function_cuda_vector_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_pack_function_cuda_vector"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_generic_simple_pack_function_cuda_vector error: %s\n", error); - opal_generic_simple_pack_function_cuda_vector_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_generic_simple_unpack_function_cuda_vector_p) = dlsym(opal_datatype_cuda_handle, "opal_generic_simple_unpack_function_cuda_vector"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_generic_simple_unpack_function_cuda_vector error: %s\n", error); - opal_generic_simple_unpack_function_cuda_vector_p = NULL; - return OPAL_ERROR; - } - - 
*(void **)(&pack_contiguous_loop_cuda_p) = dlsym(opal_datatype_cuda_handle, "pack_contiguous_loop_cuda"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "pack_contiguous_loop_cuda error: %s\n", error); - pack_contiguous_loop_cuda_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&unpack_contiguous_loop_cuda_p) = dlsym(opal_datatype_cuda_handle, "unpack_contiguous_loop_cuda"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "unpack_contiguous_loop_cuda error: %s\n", error); - unpack_contiguous_loop_cuda_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&pack_predefined_data_cuda_p) = dlsym(opal_datatype_cuda_handle, "pack_predefined_data_cuda"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "pack_predefined_data_cuda error: %s\n", error); - pack_predefined_data_cuda_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_cuda_sync_device_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_sync_device"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_cuda_sync_device error: %s\n", error); - opal_cuda_sync_device_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_cuda_get_gpu_pack_buffer_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_get_gpu_pack_buffer"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_cuda_get_gpu_pack_buffer error: %s\n", error); - opal_cuda_get_gpu_pack_buffer_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_cuda_free_gpu_buffer_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_free_gpu_buffer"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_cuda_free_gpu_buffer error: %s\n", error); - opal_cuda_free_gpu_buffer_p = NULL; - return OPAL_ERROR; - } - - *(void **)(&opal_cuda_malloc_gpu_buffer_p) = dlsym(opal_datatype_cuda_handle, "opal_cuda_malloc_gpu_buffer"); - if ((error = dlerror()) != NULL) { - fprintf(stderr, "opal_cuda_malloc_gpu_buffer error: %s\n", error); - opal_cuda_malloc_gpu_buffer_p = NULL; - return OPAL_ERROR; - } - + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( 
opal_datatype_cuda_handle, opal_datatype_cuda_init ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_datatype_cuda_fini ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_unpack_function_cuda ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda_iov ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_unpack_function_cuda_iov ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda_vector ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_unpack_function_cuda_vector ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, pack_contiguous_loop_cuda ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, unpack_contiguous_loop_cuda ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, pack_predefined_data_cuda ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_sync_device ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_get_gpu_pack_buffer ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_free_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_malloc_gpu_buffer ); + (*opal_datatype_cuda_init_p)(); - printf("cuda init done\n"); + printf("cuda init done\n"); } return OPAL_SUCCESS; } @@ -227,8 +153,7 @@ int32_t opal_datatype_gpu_fini(void) { if (opal_datatype_cuda_handle != NULL) { (*opal_datatype_cuda_fini_p)(); - dlclose(opal_datatype_cuda_handle); - opal_datatype_cuda_handle = NULL; + /* Reset all functions to NULL */ opal_datatype_cuda_init_p = NULL; 
opal_datatype_cuda_fini_p = NULL; opal_generic_simple_pack_function_cuda_p = NULL; @@ -244,6 +169,13 @@ int32_t opal_datatype_gpu_fini(void) opal_cuda_get_gpu_pack_buffer_p = NULL; opal_cuda_free_gpu_buffer_p = NULL; opal_cuda_malloc_gpu_buffer_p = NULL; + + dlclose(opal_datatype_cuda_handle); + opal_datatype_cuda_handle = NULL; + + if( NULL != opal_datatype_cuda_lib ) + free(opal_datatype_cuda_lib); + opal_datatype_cuda_lib = NULL; printf("cuda fini done\n"); } return OPAL_SUCCESS; @@ -261,4 +193,4 @@ unsigned char* opal_datatype_get_gpu_buffer(void) return NULL; #endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ -} \ No newline at end of file +} From 0b4c5dfc3107cb5e5029c56ca774f98f4e4ef696 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Tue, 30 Jun 2015 17:28:34 -0400 Subject: [PATCH 106/190] Add a patch from Rolf fixing 2 issues: 1. free code did not work right because we were computing the amount we freed after merging the list 2. we need to store original malloc GPU buffer in extra place because the one in the convertor gets changed over time Conflicts: opal/datatype/cuda/opal_datatype_cuda.cu opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu --- opal/datatype/cuda/Makefile.in | 2 +- opal/datatype/cuda/opal_datatype_cuda.cu | 2 ++ opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu | 2 +- opal/mca/btl/smcuda/btl_smcuda.c | 2 ++ opal/mca/btl/smcuda/btl_smcuda.h | 1 + opal/mca/btl/smcuda/btl_smcuda_component.c | 2 +- opal/mca/common/cuda/common_cuda.c | 1 + 7 files changed, 9 insertions(+), 3 deletions(-) diff --git a/opal/datatype/cuda/Makefile.in b/opal/datatype/cuda/Makefile.in index f00ca4e030c..ded04f1ed3c 100644 --- a/opal/datatype/cuda/Makefile.in +++ b/opal/datatype/cuda/Makefile.in @@ -15,7 +15,7 @@ EXTLIB = -L$(top_builddir)/opal/datatype/.libs -ldatatype -L$(top_builddir)/ subdir = opal/datatype/cuda CC = nvcc -CFLAGS = -I$(top_builddir)/opal/include -I$(top_srcdir)/opal/include -gencode arch=compute_35,code=sm_35 --compiler-options '-fPIC 
@CFLAGS@' +CFLAGS = -I$(top_builddir)/opal/include -I$(top_srcdir)/opal/include -I$(top_builddir) -I$(top_srcdir) -gencode arch=compute_35,code=sm_35 --compiler-options '-fPIC @CFLAGS@' LDFLAGS = -shared --compiler-options '-fPIC @LDFLAGS@' SRC := \ diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 8451b143487..b81e5196a8f 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -434,6 +434,8 @@ void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) if (ptr == NULL) { DT_CUDA_DEBUG( opal_cuda_output( 0, "addr %p is not managed.\n", addr); ); } + cuda_list_item_merge_by_addr(&device->buffer_free, ptr); + device->buffer_free_size += ptr->size; DT_CUDA_DEBUG( opal_cuda_output( 0, "Free GPU buffer %p.\n", addr); ); } diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index b55c59a5c1e..87184277d9a 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -531,7 +531,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(2, "description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + DT_CUDA_DEBUG ( opal_cuda_output(-1, "description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h_current[current_block].src[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; cuda_iov_dist_h_current[current_block].dst[task_iteration] = 
destination; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 4814b6c996a..6041a8b64e8 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1338,6 +1338,7 @@ void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, int lindex) { endpoint->smcuda_dt_pack_clone[lindex].convertor = convertor; + endpoint->smcuda_dt_pack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; endpoint->smcuda_dt_pack_clone[lindex].endpoint = endpoint; endpoint->smcuda_dt_pack_clone[lindex].local_address = local_address; endpoint->smcuda_dt_pack_clone[lindex].local_handle = local_handle; @@ -1359,6 +1360,7 @@ void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, int lindex) { endpoint->smcuda_dt_unpack_clone[lindex].convertor = convertor; + endpoint->smcuda_dt_unpack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; endpoint->smcuda_dt_unpack_clone[lindex].endpoint = endpoint; endpoint->smcuda_dt_unpack_clone[lindex].local_address = local_address; endpoint->smcuda_dt_unpack_clone[lindex].local_handle = local_handle; diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 00765f0a276..c43fbe0b190 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -517,6 +517,7 @@ typedef struct { /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; + void *gpu_ptr; struct mca_btl_base_endpoint_t *endpoint; void *local_address; struct mca_btl_base_registration_handle_t *local_handle; diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index f035578bd5d..4633134bac5 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -907,7 +907,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, if (seq == -1) { mca_btl_smcuda_send_cuda_unpack_sig(btl, 
my_cuda_dt_clone->endpoint, lindex, -2); - opal_cuda_free_gpu_buffer_p(my_cuda_dt_clone->convertor->gpu_buffer_ptr, 0); + opal_cuda_free_gpu_buffer_p(my_cuda_dt_clone->gpu_ptr, 0); mca_btl_smcuda_free_cuda_dt_pack_clone(my_cuda_dt_clone->endpoint, lindex); } } diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 990dc3fc119..5ce92cab8cd 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -33,6 +33,7 @@ #include "opal/align.h" #include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_datatype_cuda.h" +#include "opal/datatype/opal_datatype_gpu.h" #include "opal/util/output.h" #include "opal/util/show_help.h" #include "opal/util/proc.h" From b74997e148f88b7d6017522f4653bf4be1216df4 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 19 Aug 2015 17:20:39 -0400 Subject: [PATCH 107/190] clean up code in pack and unpack Conflicts: ompi/mca/pml/ob1/pml_ob1_cuda.c opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 128 ++++++++++++------ opal/datatype/cuda/opal_datatype_cuda.cu | 10 +- .../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 5 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 38 ++---- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 3 + .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 29 ++-- opal/mca/btl/smcuda/btl_smcuda.c | 42 +++++- opal/mca/btl/smcuda/btl_smcuda.h | 5 + opal/mca/btl/smcuda/btl_smcuda_component.c | 9 +- opal/mca/common/cuda/common_cuda.h | 1 + test/datatype/Makefile.am | 2 +- test/datatype/ddt_test.c | 2 +- 13 files changed, 192 insertions(+), 86 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index f9f3a2cbe02..826c1e9c6b3 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -105,53 +105,103 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, 
sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; mca_bml_base_btl_t* bml_endpoint_btl = mca_bml_base_btl_array_get_index(&(sendreq->req_endpoint->btl_send), 0); if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && CUDA_DDT_WITH_RDMA) { - printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + + int seq = 0; + int rc_dt = 0; + int rc_sig = 0; unsigned char *base; + struct iovec iov; + size_t pipeline_size = 0; + uint32_t iov_count = 1; + size_t max_data = 0; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); - base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); - convertor->gpu_buffer_ptr = base; - sendreq->req_send.req_bytes_packed = convertor->local_size; - printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); - if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( - sendreq->req_endpoint, - base, - sendreq->req_send.req_bytes_packed, - sendreq->req_rdma))) { + int lindex = mca_btl_smcuda_check_cuda_dt_pack_clone_exist(bml_btl->btl_endpoint, convertor); + if (lindex == -1) { + /* this is the first time for this convertor */ + printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); + base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); + convertor->gpu_buffer_ptr = base; + sendreq->req_send.req_bytes_packed = convertor->local_size; + printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); + if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( + sendreq->req_endpoint, + base, + sendreq->req_send.req_bytes_packed, + sendreq->req_rdma))) { - size_t pipeline_size = convertor->local_size; - struct iovec iov; - int rc_dt = 0; - uint32_t iov_count = 1; - iov.iov_base = base; - iov.iov_len = pipeline_size; - size_t max_data = 0; - int seq = 0; - /* the first pack here is used to get the correct size of pipeline_size */ - 
/* because pack may not use the whole pipeline size */ - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - pipeline_size = max_data; - int lindex = mca_btl_smcuda_alloc_cuda_dt_pack_clone(bml_btl->btl_endpoint); - assert(lindex >= 0); - mca_pml_ob1_rdma_cuda_btl_register_events(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, pipeline_size, lindex); - mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, pipeline_size, lindex); + pipeline_size = 1024*1024; + iov.iov_base = base; + iov.iov_len = pipeline_size; + max_data = 0; + /* the first pack here is used to get the correct size of pipeline_size */ + /* because pack may not use the whole pipeline size */ + rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + pipeline_size = max_data; + lindex = mca_btl_smcuda_alloc_cuda_dt_pack_clone(bml_btl->btl_endpoint); + assert(lindex >= 0); + mca_pml_ob1_rdma_cuda_btl_register_events(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, pipeline_size, lindex); + mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, pipeline_size, lindex); - rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, - sendreq->req_send.req_bytes_packed); + rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, + sendreq->req_send.req_bytes_packed); - mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); - while (rc_dt != 1) { - iov.iov_base += pipeline_size; - seq ++; - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); + rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); + if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { + mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); + return rc_sig; + } + while (rc_dt != 1) { + iov.iov_base += pipeline_size; + seq ++; + 
rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); + if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { + mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); + return rc_sig; + } + } + rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, -1); + if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { + mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, -1); + return rc_sig; + } + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_pml_ob1_free_rdma_resources(sendreq); + } + } else { + rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); } - mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, -1); - if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - mca_pml_ob1_free_rdma_resources(sendreq); + } else { /* RMDA has been started before, but no resource (frag) last time, so back to re-schedule */ + seq = mca_btl_smcuda_get_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex); + pipeline_size = mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(bml_btl->btl_endpoint, lindex); + printf("*****************I resent seq %d, pipeline %lu\n", seq, pipeline_size); + rc_dt = 0; + rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); + if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { + mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); + return rc_sig; + } + if (seq != -1) { + + while (rc_dt != 1) { + seq ++; + iov.iov_base = convertor->gpu_buffer_ptr + pipeline_size * seq; + iov.iov_len = pipeline_size; + rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &pipeline_size ); + rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); + if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { + mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); + return rc_sig; + } + } + rc_sig = 
mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, -1); + if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { + mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, -1); + return rc_sig; + } } - } else { - rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); } + } else { rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); } diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index b81e5196a8f..b6ed096b7d9 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -1,3 +1,6 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + #include "opal_datatype_cuda_internal.cuh" #include "opal_datatype_cuda.cuh" #include @@ -10,6 +13,7 @@ * NOTE: The order of this array *MUST* match what is listed in datatype.h * (use of designated initializers should relax this restrictions some) */ +/* OPAL_DECLSPEC const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PREDEFINED] = { OPAL_DATATYPE_LOOP_SIZE, OPAL_DATATYPE_END_LOOP_SIZE, @@ -19,12 +23,12 @@ OPAL_DECLSPEC const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PR OPAL_DATATYPE_INT2_SIZE, OPAL_DATATYPE_INT4_SIZE, OPAL_DATATYPE_INT8_SIZE, - OPAL_DATATYPE_INT16_SIZE, /* Yes, double-machine word integers are available */ + OPAL_DATATYPE_INT16_SIZE, OPAL_DATATYPE_UINT1_SIZE, OPAL_DATATYPE_UINT2_SIZE, OPAL_DATATYPE_UINT4_SIZE, OPAL_DATATYPE_UINT8_SIZE, - OPAL_DATATYPE_UINT16_SIZE, /* Yes, double-machine word integers are available */ + OPAL_DATATYPE_UINT16_SIZE, OPAL_DATATYPE_FLOAT2_SIZE, OPAL_DATATYPE_FLOAT4_SIZE, OPAL_DATATYPE_FLOAT8_SIZE, @@ -37,7 +41,7 @@ OPAL_DECLSPEC const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PR OPAL_DATATYPE_WCHAR_SIZE, OPAL_DATATYPE_UNAVAILABLE_SIZE, }; - +*/ /***** my variables ********/ diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh 
b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index e9359209c01..50e7cb18a68 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -5,7 +5,7 @@ #include #include -#include "opal_datatype_orig_internal.h" +//#include "opal_datatype_orig_internal.h" /* OPAL_CUDA */ @@ -13,7 +13,7 @@ #define OPAL_DATATYPE_CUDA_DEBUG //#define OPAL_DATATYPE_CUDA_KERNEL_TIME #define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 0 -#define OPAL_DATATYPE_CUDA_TIMING +//#define OPAL_DATATYPE_CUDA_TIMING #define IOV_ARRAY_SIZE 1 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 96bdc12d961..bb2cb63048e 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -1,4 +1,7 @@ - #include "opal_datatype_cuda_internal.cuh" +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + +#include "opal_datatype_cuda_internal.cuh" #include #include diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 87184277d9a..6c10f17d398 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -1,3 +1,6 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + #include "opal_datatype_cuda_internal.cuh" #include "opal_datatype_cuda.cuh" @@ -412,7 +415,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor GET_TIME(start_total); #endif - DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype packing using iovec\n"); ); + DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype PACKING using iovec\n"); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -422,11 +425,11 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor description 
= pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); - printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypesSize[pElem->elem.common.type]); +// printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); -// assert(opal_datatype_basicDatatypesSize[pElem->elem.common.type] != 0); +// assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); - printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); if ((iov[0].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[0].iov_base)) { if (iov[0].iov_len == 0) { buffer_size = DT_CUDA_BUFFER_SIZE; @@ -468,7 +471,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "PACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -481,17 +484,6 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor nb_blocks = 256; while (cuda_iov_count > 0) { - // void* temp_addr; - // size_t temp_size; - // for (i = 1; i < cuda_iov_count/2; i+=2) { - // temp_addr = cuda_iov[i].iov_base; - // temp_size = cuda_iov[i].iov_len; - // cuda_iov[i].iov_base = cuda_iov[cuda_iov_count-i].iov_base; - // cuda_iov[i].iov_len = cuda_iov[cuda_iov_count-i].iov_len; - // cuda_iov[cuda_iov_count-i].iov_base = 
temp_addr; - // cuda_iov[cuda_iov_count-i].iov_len = temp_size; - // // printf("swap %d, %d, len %d %d\n", i, cuda_iov_count-i, cuda_iov[i].iov_len, cuda_iov[cuda_iov_count-i].iov_len); - // } current_block = 0; task_iteration = 0; @@ -510,7 +502,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { - orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; length_per_iovec = buffer_size / orig_alignment * orig_alignment; buffer_isfull = 1; } @@ -531,7 +523,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(-1, "description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + DT_CUDA_DEBUG ( opal_cuda_output(10, "PACKING description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h_current[current_block].src[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; cuda_iov_dist_h_current[current_block].dst[task_iteration] = destination; @@ -543,7 +535,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = count_desc - j*thread_per_block; //(thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); } destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; - DT_CUDA_DEBUG( 
opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; if (current_block >= nb_blocks) { current_block = 0; @@ -554,14 +546,14 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor /* handle residue */ if (residue_desc != 0) { - orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; cuda_iov_dist_h_current[current_block].src[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; cuda_iov_dist_h_current[current_block].dst[task_iteration] = destination; cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], 
cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; if (current_block >= nb_blocks) { current_block = 0; @@ -598,7 +590,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #endif convertor_flags = pConvertor->flags; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "PACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -630,7 +622,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor iov[0].iov_len = total_packed; *max_data = total_packed; *out_size = 1; - DT_CUDA_DEBUG ( opal_cuda_output(0, "total packed %d\n", total_packed); ); + DT_CUDA_DEBUG ( opal_cuda_output(0, "PACKING total packed %d\n", total_packed); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 35a4ff73078..bbc18989e6e 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu 
@@ -1,3 +1,6 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + #include "opal_datatype_cuda_internal.cuh" #include #include diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index fd4fec00a73..13531b93d3e 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -1,3 +1,6 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + #include "opal_datatype_cuda_internal.cuh" #include "opal_datatype_cuda.cuh" @@ -298,8 +301,6 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); - printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypesSize[pElem->elem.common.type]); - DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype UNpacking using iovec\n"); ); // double *vtmp = (double *)iov[0].iov_base; @@ -347,8 +348,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); - + DT_CUDA_DEBUG ( opal_cuda_output(2, "UNPACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -377,7 +378,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* 
pConvert if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { - orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; length_per_iovec = buffer_size / orig_alignment * orig_alignment; buffer_isfull = 1; } @@ -398,7 +399,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(2, "description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + DT_CUDA_DEBUG ( opal_cuda_output(10, "UNPACKING description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h_current[current_block].dst[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; cuda_iov_dist_h_current[current_block].src[task_iteration] = source; @@ -410,7 +411,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); } source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; - DT_CUDA_DEBUG( opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "UNPACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", 
current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; if (current_block >= nb_blocks) { current_block = 0; @@ -421,14 +422,14 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert /* handle residue */ if (residue_desc != 0) { - orig_alignment = opal_datatype_basicDatatypesSize[pElem->elem.common.type]; + orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; cuda_iov_dist_h_current[current_block].dst[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; cuda_iov_dist_h_current[current_block].src[task_iteration] = source; cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(3, "\tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "UNPACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], 
cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; if (current_block >= nb_blocks) { current_block = 0; @@ -465,8 +466,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif convertor_flags = pConvertor->flags; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(1, "complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); -#if defined(OPAL_DATATYPE_CUDA_TIMING) + DT_CUDA_DEBUG ( opal_cuda_output(8, "UNPACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); @@ -478,9 +479,9 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert iov[0].iov_len = total_unpacked; *max_data = total_unpacked; *out_size = 1; - DT_CUDA_DEBUG ( opal_cuda_output(0, "total unpacked %d\n", total_unpacked); ); - -#if defined(OPAL_DATATYPE_CUDA_TIMING) + DT_CUDA_DEBUG ( opal_cuda_output(0, "UNPACKING total unpacked %d\n", total_unpacked); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); total_time = ELAPSED_TIME( start_total, end_total ); printf( "[Timing]: total unpacking in %ld microsec\n", total_time ); diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 6041a8b64e8..0c80a1d8b5b 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1114,7 +1114,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, offset = (size_t) ((intptr_t) remote_address - (intptr_t) reg_ptr->base.base); remote_memory_address = (unsigned char *)reg_ptr->base.alloc_base + offset; if (0 != 
offset) { - printf("!!!!!!offset %d, ra %p, base %p\n", offset, (void*)remote_address, (void*)reg_ptr->base.base); + printf("!!!!!!offset %lu, ra %p, base %p\n", offset, (void*)remote_address, (void*)reg_ptr->base.base); opal_output(-1, "OFFSET=%d", (int)offset); } @@ -1144,6 +1144,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, convertor->gpu_buffer_ptr = remote_memory_address; mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex); done = 0; + mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); } else { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, @@ -1259,6 +1260,7 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, /* allocate a fragment, giving up if we can't get one */ MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); if( OPAL_UNLIKELY(NULL == frag) ) { + printf("!!!!!!!!!! 
no frag \n"); return OPAL_ERR_OUT_OF_RESOURCE;; } @@ -1269,6 +1271,7 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); + printf("######## rank %d, send seq %d, endpoint %p\n", endpoint->my_smp_rank, seq, endpoint); return rc; } @@ -1295,6 +1298,41 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, return rc; } +int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor) +{ + int i; + for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { + if (endpoint->smcuda_dt_pack_clone[i].convertor == convertor) { + return i; + } + } + return -1; +} + +int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq) +{ + endpoint->smcuda_dt_pack_clone[lindex].seq = seq; + return 0; +} + +int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex) +{ + if (lindex >= SMCUDA_DT_CLONE_SIZE) { + return -9; + } else { + return endpoint->smcuda_dt_pack_clone[lindex].seq; + } +} + +int mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(struct mca_btl_base_endpoint_t *endpoint, int lindex) +{ + if (lindex >= SMCUDA_DT_CLONE_SIZE) { + return -9; + } else { + return endpoint->smcuda_dt_pack_clone[lindex].pipeline_size; + } +} + int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint) { int i; @@ -1347,6 +1385,7 @@ void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, endpoint->smcuda_dt_pack_clone[lindex].cbdata = cbdata; endpoint->smcuda_dt_pack_clone[lindex].pipeline_size = pipeline_size; endpoint->smcuda_dt_pack_clone[lindex].lindex = lindex; + endpoint->smcuda_dt_pack_clone[lindex].seq = -9; } void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, @@ 
-1369,6 +1408,7 @@ void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, endpoint->smcuda_dt_unpack_clone[lindex].cbdata = cbdata; endpoint->smcuda_dt_unpack_clone[lindex].pipeline_size = pipeline_size; endpoint->smcuda_dt_unpack_clone[lindex].lindex = lindex; + endpoint->smcuda_dt_unpack_clone[lindex].seq = -9; } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index c43fbe0b190..a1173502449 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -526,6 +526,7 @@ typedef struct { void *cbdata; size_t pipeline_size; int lindex; + int seq; } cuda_dt_clone_t; #define SMCUDA_DT_CLONE_SIZE 20 @@ -533,6 +534,10 @@ extern cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); +int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor); +int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq); +int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex); +int mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(struct mca_btl_base_endpoint_t *endpoint, int lindex); int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint); int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint); void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 4633134bac5..8a113ab5a01 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ 
b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -883,7 +883,7 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, iov.iov_len = my_cuda_dt_clone->pipeline_size; opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); } - + // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, @@ -910,6 +910,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, opal_cuda_free_gpu_buffer_p(my_cuda_dt_clone->gpu_ptr, 0); mca_btl_smcuda_free_cuda_dt_pack_clone(my_cuda_dt_clone->endpoint, lindex); } + // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } #endif /* OPAL_CUDA_SUPPORT */ @@ -1187,6 +1188,12 @@ int mca_btl_smcuda_component_progress(void) &frag->base, status?OPAL_ERROR:OPAL_SUCCESS); } if( btl_ownership ) { + if (frag->hdr->tag == MCA_BTL_TAG_SMCUDA_DATATYPE_PACK) { + printf("&&&&&&&&&&&&&&&&&&got PACK TAG\n"); + } + if (frag->hdr->tag == MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK) { + printf("&&&&&&&&&&&&&&&&&&got UNPACK TAG\n"); + } MCA_BTL_SMCUDA_FRAG_RETURN(frag); } OPAL_THREAD_ADD32(&mca_btl_smcuda_component.num_outstanding_frags, -1); diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index 0b5a724d9dc..20290dff7d8 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -41,6 +41,7 @@ struct mca_mpool_common_cuda_reg_data_t { // uint64_t pipeline_evtHandle[MAX_IPC_EVENT_HANDLE*EVTHANDLE_SIZE]; size_t pipeline_size; uint32_t lindex; + uint8_t pack_required; }; typedef struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index 97db4bda506..4085be3936f 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -29,7 +29,7 @@ unpack_ooo_LDADD = \ ddt_test_SOURCES = ddt_test.c ddt_lib.c ddt_lib.h ddt_test_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) -ddt_test_CFLAGS = -I/mnt/sw/cuda/include -g +ddt_test_CFLAGS = -I/mnt/sw/cuda/include -g -O0 
ddt_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/mnt/sw/cuda/lib64 -lcudart #ddt_test_old_SOURCES = ddt_test_old.c ddt_lib.c ddt_lib.h diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index 98aa6f1347a..459566eaa09 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -830,7 +830,7 @@ int main( int argc, char* argv[] ) if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { // local_copy_ddt_count(pdt, 1); - // local_copy_with_convertor(pdt, 1, 1024*1024*10, 4000); + local_copy_with_convertor(pdt, 1, 1024*1024*10, 4000); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From c182b30192c61446801ea607381bd63364f9e440 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 21 Aug 2015 22:20:54 -0400 Subject: [PATCH 108/190] big changes, now pack is driven by receiver by active message --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 126 +++++---------------- opal/mca/btl/smcuda/btl_smcuda.c | 12 +- opal/mca/btl/smcuda/btl_smcuda.h | 5 +- opal/mca/btl/smcuda/btl_smcuda_component.c | 33 +++++- 4 files changed, 69 insertions(+), 107 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 826c1e9c6b3..1e98a7757db 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -49,10 +49,10 @@ size_t mca_pml_ob1_rdma_cuda_btls( size_t size, mca_pml_ob1_com_btl_t* rdma_btls); -int mca_pml_ob1_rdma_cuda_btl_register_events( +int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - struct opal_convertor_t* convertor, size_t pipeline_size, int lindex); + size_t pipeline_size, int lindex, uint8_t pack_required); int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -105,102 +105,34 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; mca_bml_base_btl_t* 
bml_endpoint_btl = mca_bml_base_btl_array_get_index(&(sendreq->req_endpoint->btl_send), 0); if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && CUDA_DDT_WITH_RDMA) { - - int seq = 0; - int rc_dt = 0; - int rc_sig = 0; + printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); unsigned char *base; - struct iovec iov; - size_t pipeline_size = 0; - uint32_t iov_count = 1; - size_t max_data = 0; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); - int lindex = mca_btl_smcuda_check_cuda_dt_pack_clone_exist(bml_btl->btl_endpoint, convertor); - if (lindex == -1) { - /* this is the first time for this convertor */ - printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); - base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); - convertor->gpu_buffer_ptr = base; - sendreq->req_send.req_bytes_packed = convertor->local_size; - printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); - if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( - sendreq->req_endpoint, - base, - sendreq->req_send.req_bytes_packed, - sendreq->req_rdma))) { - - pipeline_size = 1024*1024; - iov.iov_base = base; - iov.iov_len = pipeline_size; - max_data = 0; - /* the first pack here is used to get the correct size of pipeline_size */ - /* because pack may not use the whole pipeline size */ - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - pipeline_size = max_data; - lindex = mca_btl_smcuda_alloc_cuda_dt_pack_clone(bml_btl->btl_endpoint); - assert(lindex >= 0); - mca_pml_ob1_rdma_cuda_btl_register_events(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, pipeline_size, lindex); - mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, pipeline_size, lindex); - - rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, - sendreq->req_send.req_bytes_packed); - - rc_sig = 
mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); - if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { - mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); - return rc_sig; - } - while (rc_dt != 1) { - iov.iov_base += pipeline_size; - seq ++; - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); - if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { - mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); - return rc_sig; - } - } - rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, -1); - if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { - mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, -1); - return rc_sig; - } - if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { - mca_pml_ob1_free_rdma_resources(sendreq); - } - } else { - rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); - } - } else { /* RMDA has been started before, but no resource (frag) last time, so back to re-schedule */ - seq = mca_btl_smcuda_get_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex); - pipeline_size = mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(bml_btl->btl_endpoint, lindex); - printf("*****************I resent seq %d, pipeline %lu\n", seq, pipeline_size); - rc_dt = 0; - rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); - if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { - mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); - return rc_sig; - } - if (seq != -1) { - - while (rc_dt != 1) { - seq ++; - iov.iov_base = convertor->gpu_buffer_ptr + pipeline_size * seq; - iov.iov_len = pipeline_size; - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &pipeline_size ); - rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, seq); - if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { - 
mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, seq); - return rc_sig; - } - } - rc_sig = mca_btl_smcuda_send_cuda_unpack_sig(bml_btl->btl, bml_btl->btl_endpoint, lindex, -1); - if (rc_sig == OPAL_ERR_OUT_OF_RESOURCE) { - mca_btl_smcuda_set_cuda_dt_pack_seq(bml_btl->btl_endpoint, lindex, -1); - return rc_sig; - } + base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); + convertor->gpu_buffer_ptr = base; + sendreq->req_send.req_bytes_packed = convertor->local_size; + printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); + if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( + sendreq->req_endpoint, + base, + sendreq->req_send.req_bytes_packed, + sendreq->req_rdma))) { + + int lindex = mca_btl_smcuda_alloc_cuda_dt_pack_clone(bml_btl->btl_endpoint); + assert(lindex >= 0); + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, lindex, 1); + mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, 0, lindex); + + rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, + sendreq->req_send.req_bytes_packed); + + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_pml_ob1_free_rdma_resources(sendreq); } + } else { + rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); } + } else { rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); @@ -264,10 +196,10 @@ size_t mca_pml_ob1_rdma_cuda_btls( return num_btls_used; } -int mca_pml_ob1_rdma_cuda_btl_register_events( +int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - struct opal_convertor_t* convertor, size_t pipeline_size, int lindex) + size_t pipeline_size, int lindex, uint8_t pack_required) { uint32_t i, j; for (i = 0; i < num_btls_used; i++) { @@ -279,9 +211,9 @@ int mca_pml_ob1_rdma_cuda_btl_register_events( // mca_common_cuda_geteventhandle(&convertor->pipeline_event[j], j, 
(mca_mpool_base_registration_t *)cuda_reg); // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); // } - printf("i send pipeline %ld\n", pipeline_size); cuda_reg->data.pipeline_size = pipeline_size; cuda_reg->data.lindex = lindex; + cuda_reg->data.pack_required = pack_required; } return 0; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 0c80a1d8b5b..96ca945e0dc 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1140,9 +1140,11 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, struct opal_convertor_t *convertor = &(recvreq->req_recv.req_base.req_convertor); size_t pipeline_size = remote_handle->reg_data.pipeline_size; uint32_t lindex = remote_handle->reg_data.lindex; - printf("i receive pipeline %ld, lindex %d\n", pipeline_size, lindex); + uint8_t pack_required = remote_handle->reg_data.pack_required; + printf("i receive pipeline %ld, lindex %d, pack_required %d\n", pipeline_size, lindex, pack_required); convertor->gpu_buffer_ptr = remote_memory_address; mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex); + mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); done = 0; mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); } else { @@ -1251,7 +1253,8 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b } int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq) + struct mca_btl_base_endpoint_t* endpoint, + int lindex, int pipeline_size, int seq) { mca_btl_smcuda_frag_t* frag; int rc; @@ -1268,6 +1271,7 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; cuda_dt_hdr.seq = seq; cuda_dt_hdr.lindex = lindex; + cuda_dt_hdr.pipeline_size = pipeline_size; 
memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); @@ -1276,7 +1280,8 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, } int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, - struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq) + struct mca_btl_base_endpoint_t* endpoint, + int lindex, int pipeline_size, int seq) { mca_btl_smcuda_frag_t* frag; int rc; @@ -1292,6 +1297,7 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; cuda_dt_hdr.seq = seq; cuda_dt_hdr.lindex = lindex; + cuda_dt_hdr.pipeline_size = pipeline_size; memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_PACK); diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index a1173502449..a90ba5c0f19 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -512,6 +512,7 @@ enum ipcState { typedef struct { int seq; int lindex; + int pipeline_size; } cuda_dt_hdr_t; /* package save pack/unpack convertor and cbfunc */ @@ -532,8 +533,8 @@ typedef struct { #define SMCUDA_DT_CLONE_SIZE 20 extern cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; -int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); -int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int seq); +int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int pipeline_size, int seq); +int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, 
struct mca_btl_base_endpoint_t* endpoint, int lindex, int pipeline_size, int seq); int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor); int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq); int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 8a113ab5a01..5fd845edf24 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -857,6 +857,7 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); int seq = cuda_dt_hdr.seq; int lindex = cuda_dt_hdr.lindex; + int pipeline_size = cuda_dt_hdr.pipeline_size; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_dt_clone_t *my_cuda_dt_clone; @@ -872,15 +873,15 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, cbfunc(btl, endpoint, my_cuda_dt_clone->local_address, my_cuda_dt_clone->local_handle, my_cuda_dt_clone->cbcontext, my_cuda_dt_clone->cbdata, OPAL_SUCCESS); mca_btl_smcuda_free_cuda_dt_unpack_clone(endpoint, lindex); } else if (seq == -1) { - mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, lindex, -1); + mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, lindex, pipeline_size, -1); } else { struct iovec iov; uint32_t iov_count = 1; size_t max_data; struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; - iov.iov_base = convertor->gpu_buffer_ptr + seq * my_cuda_dt_clone->pipeline_size; - max_data = my_cuda_dt_clone->pipeline_size; - iov.iov_len = my_cuda_dt_clone->pipeline_size; + iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; + max_data = pipeline_size; + iov.iov_len = pipeline_size; opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); } // 
MCA_BTL_SMCUDA_FRAG_RETURN(frag); @@ -906,9 +907,31 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); if (seq == -1) { - mca_btl_smcuda_send_cuda_unpack_sig(btl, my_cuda_dt_clone->endpoint, lindex, -2); + mca_btl_smcuda_send_cuda_unpack_sig(btl, my_cuda_dt_clone->endpoint, lindex, 0, -2); opal_cuda_free_gpu_buffer_p(my_cuda_dt_clone->gpu_ptr, 0); mca_btl_smcuda_free_cuda_dt_pack_clone(my_cuda_dt_clone->endpoint, lindex); + } else { + struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; + struct iovec iov; + int rc_dt = 0; + size_t pipeline_size = 1024*1024; + uint32_t iov_count = 1; + iov.iov_base = convertor->gpu_buffer_ptr; + iov.iov_len = pipeline_size; + size_t max_data = 0; + int seq = 0; + /* the first pack here is used to get the correct size of pipeline_size */ + /* because pack may not use the whole pipeline size */ + rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + pipeline_size = max_data; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, pipeline_size, seq); + while (rc_dt != 1) { + iov.iov_base += pipeline_size; + seq ++; + rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, pipeline_size, seq); + } + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, pipeline_size, -1); } // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } From d131f81cfa7efdf130278833cfa0d059c8d40f65 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 31 Aug 2015 01:03:21 -0400 Subject: [PATCH 109/190] intel test working Conflicts: opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu opal/mca/btl/smcuda/btl_smcuda.c --- .../cuda/opal_datatype_cuda_internal.cuh | 2 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 14 +++++----- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 9 ++++--- 
.../cuda/opal_datatype_unpack_cuda_kernel.cu | 14 +++++----- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 6 +++-- opal/mca/btl/smcuda/btl_smcuda.c | 27 ++++++++++++++++++- opal/mca/btl/smcuda/btl_smcuda_component.c | 2 +- test/datatype/ddt_test.c | 6 ++--- 8 files changed, 55 insertions(+), 25 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 50e7cb18a68..3d8640bcbc2 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -13,7 +13,7 @@ #define OPAL_DATATYPE_CUDA_DEBUG //#define OPAL_DATATYPE_CUDA_KERNEL_TIME #define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 0 -//#define OPAL_DATATYPE_CUDA_TIMING +#define OPAL_DATATYPE_CUDA_TIMING #define IOV_ARRAY_SIZE 1 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index bb2cb63048e..42962316da3 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -536,15 +536,15 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, { uint32_t _i, tid, num_threads; uint32_t gap, nb_elements; - double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; + char *_source_tmp, *_destination_tmp, *_src_disp_tmp;; tid = threadIdx.x + blockIdx.x * blockDim.x; num_threads = gridDim.x * blockDim.x; - gap = (extent - size) / 8; - nb_elements = size / 8; - _src_disp_tmp = (double*)source; - _destination_tmp = (double*)destination; + gap = (extent - size) / 1; + nb_elements = size / 1; + _src_disp_tmp = (char*)source; + _destination_tmp = (char*)destination; _destination_tmp += tid; for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { @@ -623,9 +623,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c _destination_tmp = dst + threadIdx.x * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if 
(alignment == ALIGNMENT_DOUBLE) { - *((double *)_destination_tmp) = *((double *)_source_tmp); + *((long *)_destination_tmp) = *((long *)_source_tmp); } else if (alignment == ALIGNMENT_FLOAT) { - *((float *)_destination_tmp) = *((float *)_source_tmp); + *((int *)_destination_tmp) = *((int *)_source_tmp); } else { * _destination_tmp = *_source_tmp; } diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 6c10f17d398..608e56dcd67 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -465,7 +465,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor DT_CUDA_DEBUG ( opal_cuda_output(0, "Pack GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - cuda_iov_count = CUDA_NB_IOV; + cuda_iov_count = 1000;//CUDA_NB_IOV; total_packed = 0; total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; @@ -498,7 +498,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } for (i = 0; i < cuda_iov_count; i++) { - // pElem = &(description[pStack->index+i]); + // pElem = &(description[pStack->index+i]); if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { @@ -518,7 +518,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor alignment = ALIGNMENT_CHAR; } - alignment = ALIGNMENT_DOUBLE; + // alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; @@ -552,6 +552,9 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / 
alignment * alignment) / orig_alignment; + if (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] == 0) { + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = 1; + } destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index bbc18989e6e..6ff69eaba12 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -277,9 +277,9 @@ __global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* _destination_tmp = dst + threadIdx.x * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { - *((double *)_destination_tmp) = *((double *)_source_tmp); + *((long *)_destination_tmp) = *((long *)_source_tmp); } else if (alignment == ALIGNMENT_FLOAT) { - *((float *)_destination_tmp) = *((float *)_source_tmp); + *((int *)_destination_tmp) = *((int *)_source_tmp); } else { * _destination_tmp = *_source_tmp; } @@ -296,15 +296,15 @@ __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, { uint32_t _i, tid, num_threads; uint32_t gap, nb_elements; - double *_source_tmp, *_destination_tmp, *_dst_disp_tmp;; + char *_source_tmp, *_destination_tmp, *_dst_disp_tmp;; tid = threadIdx.x + blockIdx.x * blockDim.x; num_threads = gridDim.x * blockDim.x; - gap = (extent - size) / 8; - nb_elements = size / 8; - _dst_disp_tmp = (double*)destination; - 
_source_tmp = (double*)source; + gap = (extent - size) / 1; + nb_elements = size / 1; + _dst_disp_tmp = (char*)destination; + _source_tmp = (char*)source; _destination_tmp = _dst_disp_tmp + tid; _source_tmp += tid; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 13531b93d3e..24a0bfc034f 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -375,6 +375,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert } for (i = 0; i < cuda_iov_count; i++) { + // pElem = &(description[pStack->index+i]); if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { @@ -393,8 +394,6 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert } else { alignment = ALIGNMENT_CHAR; } - - alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; @@ -428,6 +427,9 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + if (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] == 0) { + cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = 1; + } source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "UNPACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], 
cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 96ca945e0dc..221150d5ccc 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -400,6 +400,7 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, /* allocation will be for the fragment descriptor and payload buffer */ length = sizeof(mca_btl_smcuda_frag1_t); + printf("free list %d\n", mca_btl_smcuda_component.sm_free_list_num); length_payload = sizeof(mca_btl_smcuda_hdr_t) + mca_btl_smcuda_component.eager_limit; i = opal_free_list_init (&mca_btl_smcuda_component.sm_frags_eager, length, @@ -1044,6 +1045,28 @@ static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl, return OPAL_SUCCESS; } +int mca_btl_smcuda_notify_packing_done(void* send_value, int my_rank, int peer_rank) +{ + sm_fifo_t* fifo_send = &(mca_btl_smcuda_component.fifo[peer_rank][FIFO_MAP(my_rank)]); + if (fifo_send == NULL) { + return OPAL_ERROR; + } else { + // return sm_fifo_write(send_value, fifo_send); + int tail = fifo_send->tail; + int head = fifo_send->head; + if ((head + 1) & fifo_send->mask == tail) { + printf("fifo is full\n"); + return OPAL_ERR_OUT_OF_RESOURCE; + } else { + volatile void **q = (volatile void **) RELATIVE2VIRTUAL(fifo_send->queue); + tail = (tail - 1) & fifo_send->mask; + q[tail] = send_value; + printf("write to place %d tail %d head %d\n", tail, fifo_send->tail, fifo_send->head); + return OPAL_SUCCESS; + } + } +} + int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, @@ -1144,7 +1167,9 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, printf("i receive pipeline %ld, lindex %d, pack_required %d\n", 
pipeline_size, lindex, pack_required); convertor->gpu_buffer_ptr = remote_memory_address; mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex); - mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); + if (pack_required) { + mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); + } done = 0; mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); } else { diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 5fd845edf24..3962e12af7f 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -914,7 +914,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; struct iovec iov; int rc_dt = 0; - size_t pipeline_size = 1024*1024; + size_t pipeline_size = 1024*1024*200; uint32_t iov_count = 1; iov.iov_base = convertor->gpu_buffer_ptr; iov.iov_len = pipeline_size; diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index 459566eaa09..bb69643ee17 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -828,9 +828,9 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); pdt = upper_matrix(4000); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 1; i <= 1; i++) { + for (i = 1; i <= 4; i++) { // local_copy_ddt_count(pdt, 1); - local_copy_with_convertor(pdt, 1, 1024*1024*10, 4000); + local_copy_with_convertor(pdt, 1, 1024*1024*200, 4000); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -990,7 +990,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*2000, 4000, 256, 384 ); + // 
local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*2000, 4000, 256, 384 ); } } printf( ">>--------------------------------------------<<\n" ); From c5fb93969f0067bff08d5c16d88a308082baaa4f Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 31 Aug 2015 19:17:04 -0400 Subject: [PATCH 110/190] fix a bug when buffer is not big enough for whole ddt Conflicts: opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu --- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 18 ++++++++++----- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 23 ++++++++++++------- opal/mca/btl/smcuda/btl_smcuda_component.c | 2 +- test/datatype/ddt_test.c | 2 ++ 4 files changed, 30 insertions(+), 15 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 608e56dcd67..b5443ffb3b9 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -402,6 +402,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor dt_elem_desc_t* pElem; dt_stack_t* pStack; uint8_t alignment, orig_alignment; + int32_t orig_stack_index; ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; @@ -470,6 +471,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; + orig_stack_index = pStack->index; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); DT_CUDA_DEBUG ( opal_cuda_output(2, "PACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); @@ -498,7 +500,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } for (i = 0; i < cuda_iov_count; i++) { - // pElem = &(description[pStack->index+i]); + 
pElem = &(description[orig_stack_index+i]); if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { @@ -510,9 +512,9 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor total_packed += length_per_iovec; /* check alignment */ - if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)destination % ALIGNMENT_DOUBLE == 0) { + if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)destination % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_FLOAT == 0 && (uintptr_t)destination % ALIGNMENT_FLOAT == 0) { + } else if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_FLOAT == 0 && (uintptr_t)destination % ALIGNMENT_FLOAT == 0 && length_per_iovec >= ALIGNMENT_FLOAT) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -534,6 +536,9 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } else { cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = count_desc - j*thread_per_block; //(thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; @@ -552,9 +557,9 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* 
pConvertor cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; - if (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] == 0) { - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = 1; - } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; @@ -592,6 +597,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor GET_TIME(start); #endif convertor_flags = pConvertor->flags; + orig_stack_index = pStack->index; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); DT_CUDA_DEBUG ( opal_cuda_output(2, "PACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 24a0bfc034f..555d41f9517 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -285,7 +285,8 @@ int32_t 
opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert dt_elem_desc_t* pElem; dt_stack_t* pStack; uint8_t alignment, orig_alignment; - + int32_t orig_stack_index; + ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; @@ -347,6 +348,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; + orig_stack_index = pStack->index; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); DT_CUDA_DEBUG ( opal_cuda_output(2, "UNPACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); @@ -375,7 +377,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert } for (i = 0; i < cuda_iov_count; i++) { - // pElem = &(description[pStack->index+i]); + pElem = &(description[orig_stack_index+i]); if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { @@ -387,9 +389,9 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert total_unpacked += length_per_iovec; /* check alignment */ - if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)source % ALIGNMENT_DOUBLE == 0) { + if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)source % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_FLOAT == 0 && (uintptr_t)source % ALIGNMENT_FLOAT == 0) { + } else if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_FLOAT == 0 && (uintptr_t)source % ALIGNMENT_FLOAT == 0 && length_per_iovec >= ALIGNMENT_FLOAT) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -409,6 +411,9 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( 
opal_convertor_t* pConvert } else { cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "UNPACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; @@ -427,9 +432,9 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; - if (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] == 0) { - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = 1; - } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "UNPACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], 
cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; @@ -465,8 +470,10 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); -#endif convertor_flags = pConvertor->flags; +#endif + convertor_flags = pConvertor->flags; + orig_stack_index = pStack->index; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); DT_CUDA_DEBUG ( opal_cuda_output(8, "UNPACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 3962e12af7f..cd6c7ce071b 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -914,7 +914,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; struct iovec iov; int rc_dt = 0; - size_t pipeline_size = 1024*1024*200; + size_t pipeline_size = 1024*1024*20; uint32_t iov_count = 1; iov.iov_base = convertor->gpu_buffer_ptr; iov.iov_len = pipeline_size; diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index bb69643ee17..ae72785b86c 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -644,6 +644,8 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk dt_length = compute_buffer_length(pdt, count); printf("length %lu\n", dt_length); + cudaSetDevice(1); + #if defined (DDT_TEST_CUDA) cudaError_t error = cudaMalloc((void **)&psrc, dt_length); if ( error != cudaSuccess) { From bcb1e05f9a710503dbef44246c6de4f61ccb16aa Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 2 Sep 2015 17:33:48 -0400 Subject: [PATCH 111/190] if data in different gpu, instead of copy direct from one to 
the other, we do a D2D copy Conflicts: opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu test/datatype/Makefile.am --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 15 ++++++-- opal/datatype/cuda/opal_datatype_cuda.cu | 8 +++- .../cuda/opal_datatype_pack_cuda_kernel.cu | 10 ++--- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 38 ++++++++++++------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 10 ++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 38 ++++++++++++------- opal/mca/btl/smcuda/btl_smcuda.c | 34 +++++++++++++---- opal/mca/btl/smcuda/btl_smcuda.h | 12 ++++-- opal/mca/btl/smcuda/btl_smcuda_component.c | 27 ++++++++++--- opal/mca/common/cuda/common_cuda.c | 13 +++++++ opal/mca/common/cuda/common_cuda.h | 2 + test/datatype/Makefile.am | 7 +++- 12 files changed, 156 insertions(+), 58 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 1e98a7757db..0befb5078af 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -52,7 +52,7 @@ size_t mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - size_t pipeline_size, int lindex, uint8_t pack_required); + size_t pipeline_size, int lindex, uint8_t pack_required, uint8_t gpu_device); int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -108,6 +108,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); unsigned char *base; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); + int local_device = 0; base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); convertor->gpu_buffer_ptr = base; sendreq->req_send.req_bytes_packed = convertor->local_size; @@ -120,8 +121,13 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, int lindex = 
mca_btl_smcuda_alloc_cuda_dt_pack_clone(bml_btl->btl_endpoint); assert(lindex >= 0); - mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, lindex, 1); - mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, 0, lindex); + rc = mca_common_cuda_get_device(&local_device); + if (rc != 0) { + opal_output_verbose(0, "Failed to get the GPU device ID, rc=%d", rc); + return rc; + } + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, lindex, 1, local_device); + mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, NULL, 0, lindex, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); @@ -199,7 +205,7 @@ size_t mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - size_t pipeline_size, int lindex, uint8_t pack_required) + size_t pipeline_size, int lindex, uint8_t pack_required, uint8_t gpu_device) { uint32_t i, j; for (i = 0; i < num_btls_used; i++) { @@ -214,6 +220,7 @@ int mca_pml_ob1_rdma_cuda_btl_register_data( cuda_reg->data.pipeline_size = pipeline_size; cuda_reg->data.lindex = lindex; cuda_reg->data.pack_required = pack_required; + cuda_reg->data.gpu_device = gpu_device; } return 0; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index b6ed096b7d9..b94679358a0 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -213,6 +213,7 @@ void opal_datatype_cuda_init(void) opal_cuda_output(0, "Cannot retrieve the device being used. 
Drop CUDA support!\n"); return; } + printf("current device %d\n", device); cuda_free_list = init_cuda_free_list(); @@ -367,6 +368,9 @@ unsigned char* opal_cuda_get_gpu_pack_buffer() void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) { + int dev_id; + cudaGetDevice(&dev_id); + printf("malloc gpu buffer in dev %d\n", dev_id); ddt_cuda_device_t *device = &cuda_device[gpu_id]; if (device->buffer_free_size < size) { return NULL; @@ -402,7 +406,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) cuda_list_push_head(&device->buffer_used, p); device->buffer_used_size += size; device->buffer_free_size -= size; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc GPU buffer %p.\n", addr); ); + DT_CUDA_DEBUG( opal_cuda_output( 1, "Malloc GPU buffer %p.\n", addr); ); return addr; } } @@ -440,7 +444,7 @@ void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) } cuda_list_item_merge_by_addr(&device->buffer_free, ptr); device->buffer_free_size += ptr->size; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Free GPU buffer %p.\n", addr); ); + DT_CUDA_DEBUG( opal_cuda_output( 1, "Free GPU buffer %p.\n", addr); ); } void opal_dump_cuda_list(ddt_cuda_list_t *list) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 42962316da3..9bf130630f9 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -536,15 +536,15 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, { uint32_t _i, tid, num_threads; uint32_t gap, nb_elements; - char *_source_tmp, *_destination_tmp, *_src_disp_tmp;; + double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; tid = threadIdx.x + blockIdx.x * blockDim.x; num_threads = gridDim.x * blockDim.x; - gap = (extent - size) / 1; - nb_elements = size / 1; - _src_disp_tmp = (char*)source; - _destination_tmp = (char*)destination; + gap = (extent - size) / 8; + nb_elements = size / 8; + 
_src_disp_tmp = (double*)source; + _destination_tmp = (double*)destination; _destination_tmp += tid; for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index b5443ffb3b9..56a85e3709d 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -195,11 +195,10 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack( %p:%p, {%p, %lu}, %d )\n", + DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", (void*)pConvertor, (void*)pConvertor->pBaseBuf, - iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); + iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ); - printf("I am in simple pack vector, max_data %lu, iov_len %lu\n", *max_data, iov[0].iov_len); description = pConvertor->use_desc->desc; /* For the first step we have to add both displacement to the source. 
After in the @@ -214,7 +213,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert pConvertor->stack_pos--; pElem = &(description[pos_desc]); - DT_CUDA_DEBUG( opal_cuda_output( 0, "pack start pos_desc %d count_desc %d disp %ld\n" + DT_CUDA_DEBUG( opal_cuda_output( 1, "pack start pos_desc %d count_desc %d disp %ld\n" "stack_pos %d pos_desc %d count_desc %d disp %ld\n", pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); @@ -247,7 +246,6 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert free_required = 1; iov_ptr = pConvertor->gpu_buffer_ptr; } - printf("original local %d\n", iov_len_local); while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ @@ -260,7 +258,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - DT_CUDA_DEBUG( opal_cuda_output( 1, "pack end_loop count %d stack_pos %d" + DT_CUDA_DEBUG( opal_cuda_output( 2, "pack end_loop count %d stack_pos %d" " pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); @@ -286,7 +284,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - DT_CUDA_DEBUG( opal_cuda_output( 1, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); } @@ -314,7 +312,7 @@ int32_t 
opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert complete_loop: iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_packed += iov[iov_count].iov_len; - printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); + // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -324,7 +322,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: DtoH memcpy in %ld microsec\n", total_time ); + printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); #endif } *max_data = total_packed; @@ -332,7 +330,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert *out_size = iov_count; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - printf("total packed %lu\n", pConvertor->bConverted); + DT_CUDA_DEBUG( opal_cuda_output( 1, "total packed %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required) { opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; @@ -359,8 +357,13 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t _copy_loops = *(COUNT); uint32_t num_blocks, tasks_per_block; unsigned char* _destination = *(DESTINATION); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif - printf("I am in pack_contiguous_loop_cuda\n"); + DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); @@ -369,7 +372,10 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // _source = pBaseBuf_GPU; 
// _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; #endif - + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); @@ -382,6 +388,12 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif cudaDeviceSynchronize(); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: vector packing in %ld microsec\n", total_time ); +#endif } int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, @@ -619,7 +631,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: DtoH memcpy in %ld microsec\n", total_time ); + printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); #endif // float *vtmp = (float *)iov[0].iov_base; // DT_CUDA_DEBUG ( opal_cuda_output(0, "packed iov buffer, total packed %d\n", total_packed); ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 6ff69eaba12..3303e6fe9f5 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -296,15 +296,15 @@ __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, { uint32_t _i, tid, num_threads; uint32_t gap, nb_elements; - char *_source_tmp, *_destination_tmp, *_dst_disp_tmp;; + double *_source_tmp, *_destination_tmp, *_dst_disp_tmp;; tid = threadIdx.x + blockIdx.x * blockDim.x; num_threads = gridDim.x * blockDim.x; - gap = (extent - size) / 1; - nb_elements = size / 1; - 
_dst_disp_tmp = (char*)destination; - _source_tmp = (char*)source; + gap = (extent - size) / 8; + nb_elements = size / 8; + _dst_disp_tmp = (double*)destination; + _source_tmp = (double*)source; _destination_tmp = _dst_disp_tmp + tid; _source_tmp += tid; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 555d41f9517..36316ae877f 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -131,10 +131,9 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv TIMER_DATA_TYPE start, end, start_total, end_total; long total_time; #endif - - printf("i am in simple unpack vector, max_data %lu, iov len %lu\n", *max_data, iov[0].iov_len); - DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u )\n", - (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ) + + DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", + (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ) description = pConvertor->use_desc->desc; @@ -150,7 +149,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv pConvertor->stack_pos--; pElem = &(description[pos_desc]); - DT_CUDA_DEBUG( opal_cuda_output( 0, "unpack start pos_desc %d count_desc %d disp %ld\n" + DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack start pos_desc %d count_desc %d disp %ld\n" "stack_pos %d pos_desc %d count_desc %d disp %ld\n", pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); @@ -173,7 +172,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( 
"[Timing]: HtoD memcpy in %ld microsec\n", total_time ); + printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); #endif iov_len_local = iov[iov_count].iov_len; if( 0 != pConvertor->partial_length ) { @@ -191,7 +190,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - DT_CUDA_DEBUG( opal_cuda_output( 0, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); if( --(pStack->count) == 0 ) { /* end of loop */ @@ -216,7 +215,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv } conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - DT_CUDA_DEBUG( opal_cuda_output( 0, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); } @@ -251,7 +250,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv *out_size = iov_count; if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - printf("total unpacked %lu\n", pConvertor->bConverted); + DT_CUDA_DEBUG( opal_cuda_output( 1, "total packed %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required) { opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; @@ -261,7 +260,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( 
opal_convertor_t* pConv /* Save the global position for the next round */ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, conv_ptr - pConvertor->pBaseBuf ); - DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } @@ -335,7 +334,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: HtoD memcpy in %ld microsec\n", total_time ); + printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); #endif @@ -520,14 +519,22 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t num_blocks, tasks_per_block; unsigned char* _source = *(SOURCE); - printf("I am in unpack_contiguous_loop_cuda\n"); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in unpack_contiguous_loop_cuda\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); // _destination = pBaseBuf_GPU; // _source = (unsigned char*)cuda_desc_h->iov[0].iov_base; - + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); @@ -540,4 +547,9 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif cudaDeviceSynchronize(); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = 
ELAPSED_TIME( start, end ); + printf( "[Timing]: vector unpacking in %ld microsec\n", total_time ); +#endif } diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 221150d5ccc..cc10683752f 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -74,6 +74,7 @@ #include "ompi/mca/pml/ob1/pml_ob1_recvreq.h" #include "ompi/mca/pml/ob1/pml_ob1_rdmafrag.h" + #if OPAL_CUDA_SUPPORT static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base, @@ -1164,9 +1165,20 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, size_t pipeline_size = remote_handle->reg_data.pipeline_size; uint32_t lindex = remote_handle->reg_data.lindex; uint8_t pack_required = remote_handle->reg_data.pack_required; - printf("i receive pipeline %ld, lindex %d, pack_required %d\n", pipeline_size, lindex, pack_required); - convertor->gpu_buffer_ptr = remote_memory_address; - mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex); + uint8_t remote_device = remote_handle->reg_data.gpu_device; + uint8_t local_device = 0; + rc = mca_common_cuda_get_device(&local_device); + printf("i receive pipeline %ld, lindex %d, pack_required %d, remote_device %d, local_device %d\n", pipeline_size, lindex, pack_required, remote_device, local_device); + if (rc != 0) { + opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); + return rc; + } + if (remote_device != local_device && !OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { + convertor->gpu_buffer_ptr = NULL; + } else { + convertor->gpu_buffer_ptr = remote_memory_address; + } + mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, remote_memory_address, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex, remote_device, local_device); if 
(pack_required) { mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); } @@ -1400,46 +1412,54 @@ void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, void *local_address, struct mca_btl_base_registration_handle_t *local_handle, + void *remote_gpu_address, mca_btl_base_completion_fn_t cbfunc, void *cbcontext, void *cbdata, size_t pipeline_size, - int lindex) + int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_dt_pack_clone[lindex].convertor = convertor; - endpoint->smcuda_dt_pack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; + // endpoint->smcuda_dt_pack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; endpoint->smcuda_dt_pack_clone[lindex].endpoint = endpoint; endpoint->smcuda_dt_pack_clone[lindex].local_address = local_address; endpoint->smcuda_dt_pack_clone[lindex].local_handle = local_handle; + endpoint->smcuda_dt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; endpoint->smcuda_dt_pack_clone[lindex].cbfunc = cbfunc; endpoint->smcuda_dt_pack_clone[lindex].cbcontext = cbcontext; endpoint->smcuda_dt_pack_clone[lindex].cbdata = cbdata; endpoint->smcuda_dt_pack_clone[lindex].pipeline_size = pipeline_size; endpoint->smcuda_dt_pack_clone[lindex].lindex = lindex; endpoint->smcuda_dt_pack_clone[lindex].seq = -9; + endpoint->smcuda_dt_pack_clone[lindex].remote_device = remote_device; + endpoint->smcuda_dt_pack_clone[lindex].local_device = local_device; } void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, void *local_address, struct mca_btl_base_registration_handle_t *local_handle, + void *remote_gpu_address, mca_btl_base_completion_fn_t cbfunc, void *cbcontext, void *cbdata, size_t pipeline_size, - int lindex) + int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_dt_unpack_clone[lindex].convertor = convertor; - endpoint->smcuda_dt_unpack_clone[lindex].gpu_ptr = 
convertor->gpu_buffer_ptr; +// endpoint->smcuda_dt_unpack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; endpoint->smcuda_dt_unpack_clone[lindex].endpoint = endpoint; endpoint->smcuda_dt_unpack_clone[lindex].local_address = local_address; endpoint->smcuda_dt_unpack_clone[lindex].local_handle = local_handle; + endpoint->smcuda_dt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; endpoint->smcuda_dt_unpack_clone[lindex].cbfunc = cbfunc; endpoint->smcuda_dt_unpack_clone[lindex].cbcontext = cbcontext; endpoint->smcuda_dt_unpack_clone[lindex].cbdata = cbdata; endpoint->smcuda_dt_unpack_clone[lindex].pipeline_size = pipeline_size; endpoint->smcuda_dt_unpack_clone[lindex].lindex = lindex; endpoint->smcuda_dt_unpack_clone[lindex].seq = -9; + endpoint->smcuda_dt_unpack_clone[lindex].remote_device = remote_device; + endpoint->smcuda_dt_unpack_clone[lindex].local_device = local_device; } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index a90ba5c0f19..d562be32904 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -41,6 +41,8 @@ #include "opal/mca/btl/btl.h" #include "opal/mca/common/sm/common_sm.h" +#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 1 + BEGIN_C_DECLS /* @@ -518,16 +520,18 @@ typedef struct { /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; - void *gpu_ptr; struct mca_btl_base_endpoint_t *endpoint; void *local_address; struct mca_btl_base_registration_handle_t *local_handle; + void *remote_gpu_address; mca_btl_base_completion_fn_t cbfunc; void *cbcontext; void *cbdata; size_t pipeline_size; int lindex; int seq; + uint8_t remote_device; + uint8_t local_device; } cuda_dt_clone_t; #define SMCUDA_DT_CLONE_SIZE 20 @@ -547,20 +551,22 @@ void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, void *local_address, struct 
mca_btl_base_registration_handle_t *local_handle, + void *remote_gpu_address, mca_btl_base_completion_fn_t cbfunc, void *cbcontext, void *cbdata, size_t pipeline_size, - int lindex); + int lindex, uint8_t remote_device, uint8_t local_device); void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, void *local_address, struct mca_btl_base_registration_handle_t *local_handle, + void *remote_gpu_address, mca_btl_base_completion_fn_t cbfunc, void *cbcontext, void *cbdata, size_t pipeline_size, - int lindex); + int lindex, uint8_t remote_device, uint8_t local_device); #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index cd6c7ce071b..78568ab952e 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -878,11 +878,26 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, struct iovec iov; uint32_t iov_count = 1; size_t max_data; - struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; - iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; + struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; + if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { + convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(pipeline_size, 0); + mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, pipeline_size); + iov.iov_base = convertor->gpu_buffer_ptr; + printf("start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, pipeline_size); + + } else { + iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; + } max_data = pipeline_size; iov.iov_len = pipeline_size; opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + if 
(!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { + if (convertor->gpu_buffer_ptr != NULL) { + opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + convertor->gpu_buffer_ptr = NULL; + } + + } } // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } @@ -905,13 +920,15 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, my_cuda_dt_clone = &endpoint->smcuda_dt_pack_clone[lindex]; printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); - + struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; if (seq == -1) { mca_btl_smcuda_send_cuda_unpack_sig(btl, my_cuda_dt_clone->endpoint, lindex, 0, -2); - opal_cuda_free_gpu_buffer_p(my_cuda_dt_clone->gpu_ptr, 0); + if (convertor->gpu_buffer_ptr != NULL) { + opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + convertor->gpu_buffer_ptr = NULL; + } mca_btl_smcuda_free_cuda_dt_pack_clone(my_cuda_dt_clone->endpoint, lindex); } else { - struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; struct iovec iov; int rc_dt = 0; size_t pipeline_size = 1024*1024*20; diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 5ce92cab8cd..efcc380d3d2 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -2067,6 +2067,19 @@ int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base) return 0; } +int mca_common_cuda_memp2pcpy(void *dest, const void *src, size_t size) +{ + CUresult result; + + result = cuFunc.cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-mpi-common-cuda.txt", "cuMemcpy failed", + true, OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + #if OPAL_CUDA_GDR_SUPPORT /* Check to see if the memory was freed between the time it was stored in * the registration cache and now. 
Return true if the memory was previously diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index 20290dff7d8..d5220052d63 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -42,6 +42,7 @@ struct mca_mpool_common_cuda_reg_data_t { size_t pipeline_size; uint32_t lindex; uint8_t pack_required; + uint8_t gpu_device; }; typedef struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; @@ -99,6 +100,7 @@ OPAL_DECLSPEC int mca_common_cuda_create_event(uint64_t **event); OPAL_DECLSPEC int mca_common_cuda_record_event(uint64_t *event); OPAL_DECLSPEC int mca_common_cuda_query_event(uint64_t *event); OPAL_DECLSPEC int mca_common_cuda_openeventhandle(uint64_t **event, int n, mca_mpool_common_cuda_reg_data_t *handle); +OPAL_DECLSPEC int mca_common_cuda_memp2pcpy(void *dest, const void *src, size_t size); #if OPAL_CUDA_GDR_SUPPORT OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg); OPAL_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg); diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index 4085be3936f..8c240423139 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -14,7 +14,7 @@ # if PROJECT_OMPI - MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw unpack_ooo ddt_pack + MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw unpack_ooo ddt_pack ddt_benchmark MPI_CHECKS = to_self ddt_pack endif TESTS = opal_datatype_test $(MPI_TESTS) @@ -32,6 +32,11 @@ ddt_test_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) ddt_test_CFLAGS = -I/mnt/sw/cuda/include -g -O0 ddt_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/mnt/sw/cuda/lib64 -lcudart +ddt_benchmark_SOURCES = ddt_benchmark.c ddt_lib.c ddt_lib.h +ddt_benchmark_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) +ddt_benchmark_CFLAGS = -I/mnt/sw/cuda/include -g -O0 
+ddt_benchmark_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/mnt/sw/cuda/lib64 -lcudart + #ddt_test_old_SOURCES = ddt_test_old.c ddt_lib.c ddt_lib.h #ddt_test_old_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) #ddt_test_old_LDADD = $(top_builddir)/ompi/libmpi.la From 7a86b4bc76fc3a929ed410e95083e5c2a7f8d16c Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 8 Sep 2015 00:42:11 -0400 Subject: [PATCH 112/190] now we can use cudamemcpy2d Conflicts: opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu --- opal/datatype/cuda/opal_datatype_cuda.cuh | 12 +++ .../cuda/opal_datatype_cuda_internal.cuh | 1 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 73 +++++++++++++++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 75 ++++++++++++++++--- opal/mca/btl/smcuda/btl_smcuda.h | 2 +- opal/mca/btl/smcuda/btl_smcuda_component.c | 4 +- 6 files changed, 143 insertions(+), 24 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 04dd5f88a26..6e86640b5e6 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -44,11 +44,23 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, unsigned char** DESTINATION, size_t* SPACE ); +void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE, uint8_t* transfer_required ); + void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, size_t* SPACE ); + +void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE, uint8_t* free_required ); void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 
3d8640bcbc2..98d787ac650 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -14,6 +14,7 @@ //#define OPAL_DATATYPE_CUDA_KERNEL_TIME #define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 0 #define OPAL_DATATYPE_CUDA_TIMING +#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D 0 #define IOV_ARRAY_SIZE 1 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 56a85e3709d..9a589501ae4 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -230,13 +230,13 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert if (iov[iov_count].iov_base == NULL) { iov[iov_count].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); iov_ptr = (unsigned char *)iov[iov_count].iov_base; + pConvertor->gpu_buffer_ptr = iov_ptr; free_required = 1; } else { iov_ptr = (unsigned char *)iov[iov_count].iov_base; free_required = 0; } transfer_required = 0; - pConvertor->gpu_buffer_ptr = iov_ptr; } else { iov_len_local = iov[iov_count].iov_len; if (pConvertor->gpu_buffer_ptr == NULL) { @@ -291,7 +291,12 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + if (transfer_required && OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local, &transfer_required); + } else { + pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto 
update_loop_description; @@ -330,8 +335,8 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert *out_size = iov_count; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - DT_CUDA_DEBUG( opal_cuda_output( 1, "total packed %lu\n", pConvertor->bConverted); ); - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); + if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } @@ -376,9 +381,10 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; - num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); +// pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -396,6 +402,52 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif } +void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE, uint8_t* transfer_required ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _source = 
(*SOURCE) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _destination = *(DESTINATION); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda_memcpy2d\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + +// cudaDeviceSynchronize(); + *transfer_required = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: vector packing with memcpy2d in %ld microsec\n", total_time ); +#endif +} + int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -453,13 +505,13 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor if (iov[0].iov_base == NULL) { iov[0].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(buffer_size, 0); destination = (unsigned char *)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; free_required = 1; } else { destination = (unsigned char *)iov[0].iov_base; free_required = 0; } transfer_required = 0; - pConvertor->gpu_buffer_ptr = destination; } else { buffer_size = iov[0].iov_len; if (pConvertor->gpu_buffer_ptr == NULL) { @@ -620,7 +672,10 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } - 
cudaDeviceSynchronize(); + // cudaDeviceSynchronize(); + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 36316ae877f..484f22cf785 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -161,13 +161,16 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { iov_ptr = (unsigned char*)iov[iov_count].iov_base; free_required = 0; - } else { + } else if (!OPAL_DATATYPE_VECTOR_USE_MEMCPY2D){ if (pConvertor->gpu_buffer_ptr == NULL) { pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); } iov_ptr = pConvertor->gpu_buffer_ptr; cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); free_required = 1; + } else { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + free_required = 255; } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -222,7 +225,11 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + if (free_required == 255 && OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { + unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local, &free_required); + } else { + unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -250,8 
+257,8 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv *out_size = iov_count; if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - DT_CUDA_DEBUG( opal_cuda_output( 1, "total packed %lu\n", pConvertor->bConverted); ); - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + DT_CUDA_DEBUG( opal_cuda_output( 0, "Total unpacked %lu\n", pConvertor->bConverted); ); + if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } @@ -482,8 +489,10 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif } - cudaDeviceSynchronize(); - + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + iov[0].iov_len = total_unpacked; *max_data = total_unpacked; *out_size = 1; @@ -529,15 +538,13 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); - // _destination = pBaseBuf_GPU; - // _source = (unsigned char*)cuda_desc_h->iov[0].iov_base; - #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; - num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +// tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; +// num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; +// unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = 
_destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -553,3 +560,47 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, printf( "[Timing]: vector unpacking in %ld microsec\n", total_time ); #endif } + +void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE, uint8_t* free_required ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _destination = (*DESTINATION) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _source = *(SOURCE); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in unpack_contiguous_loop_cuda_memcpy2d\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(SOURCE) = *(SOURCE) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + +// *free_required = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: vector unpacking with memcpy2d in %ld microsec\n", total_time ); +#endif +} diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index d562be32904..20465decc10 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -41,7 +41,7 @@ #include 
"opal/mca/btl/btl.h" #include "opal/mca/common/sm/common_sm.h" -#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 1 +#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 0 BEGIN_C_DECLS diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 78568ab952e..2382fef5d94 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -142,7 +142,7 @@ static int mca_btl_smcuda_component_verify(void) { static int smcuda_register(void) { /* register SM component parameters */ - mca_btl_smcuda_param_register_int("free_list_num", 8, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_num); + mca_btl_smcuda_param_register_int("free_list_num", 16, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_num); mca_btl_smcuda_param_register_int("free_list_max", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_max); mca_btl_smcuda_param_register_int("free_list_inc", 64, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_inc); mca_btl_smcuda_param_register_int("max_procs", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_max_procs); @@ -931,7 +931,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } else { struct iovec iov; int rc_dt = 0; - size_t pipeline_size = 1024*1024*20; + size_t pipeline_size = 1024*1024*200; uint32_t iov_count = 1; iov.iov_base = convertor->gpu_buffer_ptr; iov.iov_len = pipeline_size; From 5f2aac5b07a0aaf2575d4cf36ac6d35e08323358 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 8 Sep 2015 18:34:57 -0400 Subject: [PATCH 113/190] enable zero copy + fix GPU buffer bug Conflicts: opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu --- opal/datatype/cuda/opal_datatype_cuda.cu | 10 +- opal/datatype/cuda/opal_datatype_cuda.cuh | 22 ++- .../cuda/opal_datatype_cuda_internal.cuh | 10 + .../cuda/opal_datatype_pack_cuda_kernel.cu | 14 ++ .../cuda/opal_datatype_pack_cuda_wrapper.cu | 172 ++++++++++++++++-- .../cuda/opal_datatype_unpack_cuda_wrapper.cu 
| 87 +++++++-- opal/mca/btl/smcuda/btl_smcuda_component.c | 2 +- 7 files changed, 280 insertions(+), 37 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index b94679358a0..9791e40fef1 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -370,9 +370,9 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) { int dev_id; cudaGetDevice(&dev_id); - printf("malloc gpu buffer in dev %d\n", dev_id); ddt_cuda_device_t *device = &cuda_device[gpu_id]; if (device->buffer_free_size < size) { + DT_CUDA_DEBUG( opal_cuda_output( 0, "No GPU buffer at dev_id %d.\n", dev_id); ); return NULL; } ddt_cuda_buffer_t *ptr = NULL; @@ -406,7 +406,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) cuda_list_push_head(&device->buffer_used, p); device->buffer_used_size += size; device->buffer_free_size -= size; - DT_CUDA_DEBUG( opal_cuda_output( 1, "Malloc GPU buffer %p.\n", addr); ); + DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc GPU buffer %p, dev_id %d.\n", addr, dev_id); ); return addr; } } @@ -442,9 +442,11 @@ void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) if (ptr == NULL) { DT_CUDA_DEBUG( opal_cuda_output( 0, "addr %p is not managed.\n", addr); ); } + size_t size = ptr->size; cuda_list_item_merge_by_addr(&device->buffer_free, ptr); - device->buffer_free_size += ptr->size; - DT_CUDA_DEBUG( opal_cuda_output( 1, "Free GPU buffer %p.\n", addr); ); + device->buffer_free_size += size; + device->buffer_used_size -= size; + DT_CUDA_DEBUG( opal_cuda_output( 0, "Free GPU buffer %p.\n", addr); ); } void opal_dump_cuda_list(ddt_cuda_list_t *list) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 6e86640b5e6..b770f136969 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -48,7 +48,19 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, 
uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, - size_t* SPACE, uint8_t* transfer_required ); + size_t* SPACE ); + +void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE, unsigned char* gpu_buffer ); void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -60,7 +72,13 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, - size_t* SPACE, uint8_t* free_required ); + size_t* SPACE ); + +void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE); void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 98d787ac650..c0cfda8ea90 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -15,6 +15,8 @@ #define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 0 #define OPAL_DATATYPE_CUDA_TIMING #define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D 0 +#define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 +#define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 #define IOV_ARRAY_SIZE 1 @@ -160,6 +162,14 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c __global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist); +__global__ void opal_empty_kernel(uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination); + +__global__ void opal_empty_kernel_noargs(); + void opal_cuda_output(int output_id, const char *format, ...); #if 
defined (OPAL_DATATYPE_CUDA_DEBUG) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 9bf130630f9..79281adf6cb 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -633,3 +633,17 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c } } } + +__global__ void opal_empty_kernel(uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination) +{ + +} + +__global__ void opal_empty_kernel_noargs() +{ + +} diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 9a589501ae4..01fc947043c 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -238,13 +238,29 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } transfer_required = 0; } else { - iov_len_local = iov[iov_count].iov_len; - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + iov_len_local = iov[iov_count].iov_len; + } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE){ + iov_len_local = iov[iov_count].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + } + transfer_required = 0; + free_required = 1; + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + } else { + iov_len_local = iov[iov_count].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned 
char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + } + transfer_required = 1; + free_required = 1; + iov_ptr = pConvertor->gpu_buffer_ptr; } - transfer_required = 1; - free_required = 1; - iov_ptr = pConvertor->gpu_buffer_ptr; } while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { @@ -291,9 +307,12 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - if (transfer_required && OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { - iov_ptr = (unsigned char*)iov[iov_count].iov_base; - pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local, &transfer_required); + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { + pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE) { + pack_contiguous_loop_cuda_pipeline(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local, pConvertor->gpu_buffer_ptr); } else { pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); } @@ -337,6 +356,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert pConvertor->flags |= CONVERTOR_COMPLETED; DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { + printf("free\n"); opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } @@ -383,8 +403,84 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - 
cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); -// pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + // cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); +// pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +// int i; +// for (i = 0; i < 4; i++) { +// opal_empty_kernel<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +// } + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaDeviceSynchronize(); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: vector packing in %ld microsec\n", total_time ); +#endif +} + +void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE, unsigned char* gpu_buffer ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _source = (*SOURCE) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _destination_host = *(DESTINATION); + unsigned char* _destination_dev = gpu_buffer; + int i, pipeline_blocks; + uint32_t _copy_loops_per_pipeline; + +#if 
defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda_pipeline\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + // _source = pBaseBuf_GPU; + // _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; +// cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); + pipeline_blocks = 4; + cuda_streams->current_stream_id = 0; + _copy_loops_per_pipeline = (_copy_loops + pipeline_blocks -1 )/ pipeline_blocks; + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops_per_pipeline, _end_loop->size, _loop->extent, _source, _destination_dev); + for (i = 1; i <= pipeline_blocks; i++) { + cudaMemcpyAsync(_destination_host, _destination_dev, _end_loop->size * _copy_loops_per_pipeline, cudaMemcpyDeviceToHost, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + _source += _loop->extent * _copy_loops_per_pipeline; + _destination_dev += _end_loop->size * _copy_loops_per_pipeline; + _destination_host += _end_loop->size * _copy_loops_per_pipeline; + if (i == pipeline_blocks) { + _copy_loops_per_pipeline = _copy_loops - _copy_loops_per_pipeline * (pipeline_blocks - 1); + } + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops_per_pipeline, 
_end_loop->size, _loop->extent, _source, _destination_dev); + } + cudaMemcpyAsync(_destination_host, _destination_dev, _end_loop->size * _copy_loops_per_pipeline, cudaMemcpyDeviceToHost, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -406,7 +502,7 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, - size_t* SPACE, uint8_t* transfer_required ) + size_t* SPACE ) { ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); @@ -439,7 +535,6 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, #endif // cudaDeviceSynchronize(); - *transfer_required = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -448,6 +543,57 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, #endif } +void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _source = (*SOURCE) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _destination = *(DESTINATION); + unsigned char* _destination_dev; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda_zerocopy\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + // 
num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + // cudaHostRegister(_destination, _copy_loops*_end_loop->size, cudaHostRegisterMapped); + cudaHostGetDevicePointer((void **)&_destination_dev, (void *) _destination, 0); + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaDeviceSynchronize(); + // cudaHostUnregister(_destination); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: vector packing in %ld microsec\n", total_time ); +#endif +} + int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 484f22cf785..e48c0340bd8 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -161,18 +161,21 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { iov_ptr = (unsigned char*)iov[iov_count].iov_base; free_required = 0; - } else if (!OPAL_DATATYPE_VECTOR_USE_MEMCPY2D){ - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); - } - iov_ptr = pConvertor->gpu_buffer_ptr; - cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); - free_required = 1; } else { - iov_ptr = (unsigned char*)iov[iov_count].iov_base; - free_required = 255; - } -#if 
defined(OPAL_DATATYPE_CUDA_TIMING) + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); + } + iov_ptr = pConvertor->gpu_buffer_ptr; + cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); @@ -225,8 +228,10 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - if (free_required == 255 && OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { - unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local, &free_required); + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { + unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + unpack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); } else { unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); } @@ -543,8 +548,8 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; -// unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); - cudaMemcpy2D(_destination, _loop->extent, _source, 
_end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +// cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -565,7 +570,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, - size_t* SPACE, uint8_t* free_required ) + size_t* SPACE ) { ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); @@ -596,7 +601,6 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif -// *free_required = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -604,3 +608,52 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, printf( "[Timing]: vector unpacking with memcpy2d in %ld microsec\n", total_time ); #endif } + +void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _destination = (*DESTINATION) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _source = *(SOURCE); + unsigned char* _source_dev; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in unpack_contiguous_loop_cuda_zerocopy\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = 
(uint32_t)(*(SPACE) / _end_loop->size); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif +// tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; +// num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; +// cudaHostRegister(_source, _copy_loops*_end_loop->size, cudaHostRegisterMapped); + cudaHostGetDevicePointer((void **)&_source_dev, (void *) _source, 0); + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(SOURCE) = *(SOURCE) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaDeviceSynchronize(); + // cudaHostUnregister(_source); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: vector unpacking in %ld microsec\n", total_time ); +#endif +} diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 2382fef5d94..a9b08f3efdc 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -931,7 +931,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } else { struct iovec iov; int rc_dt = 0; - size_t pipeline_size = 1024*1024*200; + size_t pipeline_size = 1024*1024*10; uint32_t iov_count = 1; iov.iov_base = convertor->gpu_buffer_ptr; iov.iov_len = pipeline_size; From eee322e3633b4cbe371a3cebf5581284151c20ef Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 14 Sep 2015 17:22:43 -0400 Subject: [PATCH 114/190] put pipeline size into mca --- .../cuda/opal_datatype_cuda_internal.cuh | 2 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 15 ++++++++----- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 2 +- opal/datatype/opal_datatype_pack.c | 3 ++- 
opal/datatype/opal_datatype_unpack.c | 3 ++- opal/mca/btl/smcuda/btl_smcuda.c | 22 ------------------- opal/mca/btl/smcuda/btl_smcuda.h | 1 + opal/mca/btl/smcuda/btl_smcuda_component.c | 4 +++- 8 files changed, 19 insertions(+), 33 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index c0cfda8ea90..938c1b5f8a1 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -30,7 +30,7 @@ #define CUDA_NB_IOV 4096 #define CUDA_IOV_LEN 1024*1204 #define CUDA_MAX_NB_BLOCKS 1024 -#define CUDA_IOV_MAX_TASK_PER_BLOCK 200 +#define CUDA_IOV_MAX_TASK_PER_BLOCK 10 #define ALIGNMENT_DOUBLE 8 #define ALIGNMENT_FLOAT 4 #define ALIGNMENT_CHAR 1 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 01fc947043c..e45a0b7df15 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -601,7 +601,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor { uint32_t i, j; uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, residue_desc; - uint32_t nb_blocks, thread_per_block; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec, dst_offset; unsigned char *destination, *destination_tmp; size_t total_packed, total_converted; @@ -692,8 +692,9 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #endif dst_offset = 0; - thread_per_block = CUDA_WARP_SIZE * 5; + thread_per_block = CUDA_WARP_SIZE * 4; nb_blocks = 256; + nb_blocks_used = 0; while (cuda_iov_count > 0) { @@ -752,6 +753,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; DT_CUDA_DEBUG( 
opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; + nb_blocks_used ++; if (current_block >= nb_blocks) { current_block = 0; task_iteration ++; @@ -773,6 +775,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); current_block += 1; + nb_blocks_used ++; if (current_block >= nb_blocks) { current_block = 0; task_iteration ++; @@ -788,7 +791,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d\n", destination_tmp, total_time, cuda_streams->current_stream_id); + printf( "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_tmp, total_time, cuda_streams->current_stream_id, nb_blocks_used); #endif cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); @@ -818,10 +821,10 @@ int32_t 
opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } - // cudaDeviceSynchronize(); - for (i = 0; i < NB_STREAMS; i++) { + cudaDeviceSynchronize(); + /* for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + }*/ #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index e48c0340bd8..2f281bdb494 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -370,7 +370,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif dst_offset = 0; - thread_per_block = CUDA_WARP_SIZE * 5; + thread_per_block = CUDA_WARP_SIZE * 4; nb_blocks = 256; while (cuda_iov_count > 0) { diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 7ddefdd1728..54a28b93c5b 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -424,7 +424,8 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { if (opal_generic_simple_pack_function_cuda_vector_p != NULL) { - return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + // return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); } } else { if (opal_generic_simple_pack_function_cuda_iov_p != NULL) { diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index ff8dae77971..5fe4003063d 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -611,7 +611,8 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == 
pElem->elem.common.type ) { if (opal_generic_simple_unpack_function_cuda_vector_p != NULL) { - return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + // return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); } } else { if (opal_generic_simple_unpack_function_cuda_iov_p != NULL) { diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index cc10683752f..da403ad937d 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1046,28 +1046,6 @@ static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl, return OPAL_SUCCESS; } -int mca_btl_smcuda_notify_packing_done(void* send_value, int my_rank, int peer_rank) -{ - sm_fifo_t* fifo_send = &(mca_btl_smcuda_component.fifo[peer_rank][FIFO_MAP(my_rank)]); - if (fifo_send == NULL) { - return OPAL_ERROR; - } else { - // return sm_fifo_write(send_value, fifo_send); - int tail = fifo_send->tail; - int head = fifo_send->head; - if ((head + 1) & fifo_send->mask == tail) { - printf("fifo is full\n"); - return OPAL_ERR_OUT_OF_RESOURCE; - } else { - volatile void **q = (volatile void **) RELATIVE2VIRTUAL(fifo_send->queue); - tail = (tail - 1) & fifo_send->mask; - q[tail] = send_value; - printf("write to place %d tail %d head %d\n", tail, fifo_send->tail, fifo_send->head); - return OPAL_SUCCESS; - } - } -} - int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *ep, void *local_address, uint64_t remote_address, struct mca_btl_base_registration_handle_t *local_handle, diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 20465decc10..478dd184d24 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -207,6 +207,7 @@ struct mca_btl_smcuda_component_t { int cuda_ipc_output; 
int use_cuda_ipc; int use_cuda_ipc_same_gpu; + int cuda_dt_pipeline_size; #endif /* OPAL_CUDA_SUPPORT */ }; typedef struct mca_btl_smcuda_component_t mca_btl_smcuda_component_t; diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index a9b08f3efdc..870301b5f9c 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -167,6 +167,7 @@ static int smcuda_register(void) mca_btl_smcuda_param_register_int("use_cuda_ipc", 1, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.use_cuda_ipc); mca_btl_smcuda_param_register_int("use_cuda_ipc_same_gpu", 1, OPAL_INFO_LVL_4,&mca_btl_smcuda_component.use_cuda_ipc_same_gpu); mca_btl_smcuda_param_register_int("cuda_ipc_verbose", 0, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ipc_verbose); + mca_btl_smcuda_param_register_int("cuda_dt_pipeline_size", 1024*1024*400, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_dt_pipeline_size); mca_btl_smcuda_component.cuda_ipc_output = opal_output_open(NULL); opal_output_set_verbosity(mca_btl_smcuda_component.cuda_ipc_output, mca_btl_smcuda_component.cuda_ipc_verbose); #else /* OPAL_CUDA_SUPPORT */ @@ -931,7 +932,8 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } else { struct iovec iov; int rc_dt = 0; - size_t pipeline_size = 1024*1024*10; + size_t pipeline_size = mca_btl_smcuda_component.cuda_dt_pipeline_size; + printf("Pipeline_size %ld\n", pipeline_size); uint32_t iov_count = 1; iov.iov_base = convertor->gpu_buffer_ptr; iov.iov_len = pipeline_size; From 630e831cf1ee7911ac768c2b11982c46652d6a29 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Tue, 15 Sep 2015 14:16:16 -0400 Subject: [PATCH 115/190] Upon datatype commit create a list of iovec representing a single iteration of the datatype based on a NULL pointer. This list will then contain the displacement and the length of each fragment of the datatype memory layout and can be used for any packing/unpacking purpose. 
--- opal/datatype/opal_convertor.h | 6 +++++- opal/datatype/opal_convertor_raw.c | 29 ++++++++++++++++++++++++++ opal/datatype/opal_datatype.h | 6 +++++- opal/datatype/opal_datatype_optimize.c | 6 ++++++ 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 1ee0c010e63..ace5cf4b1e4 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -283,7 +283,11 @@ opal_convertor_raw( opal_convertor_t* convertor, /* [IN/OUT] */ struct iovec* iov, /* [IN/OUT] */ uint32_t* iov_count, /* [IN/OUT] */ size_t* length ); /* [OUT] */ - +OPAL_DECLSPEC void +opal_convertor_to_iov(struct opal_convertor_t *convertor, + struct iovec **iov, + uint32_t *iov_count, + size_t *max_data); /* * Upper level does not need to call the _nocheck function directly. */ diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index b57d5aa1ded..441ee9ee0fc 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -211,3 +211,32 @@ opal_convertor_raw( opal_convertor_t* pConvertor, pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } + +#define IOVEC_INITIAL_SIZE 64 + +void +opal_convertor_to_iov(struct opal_convertor_t *convertor, + struct iovec **iov, + uint32_t *iov_count, + size_t *max_data) +{ + uint32_t temp_count = IOVEC_INITIAL_SIZE; + struct iovec *iovec; + size_t temp_data; + + *iov_count = 0; + *max_data = 0; + + *iov = iovec = (struct iovec*) malloc(temp_count * sizeof(struct iovec)); + while(1) { + int ret = opal_convertor_raw(convertor, iovec, &temp_count, &temp_data); + *iov_count += temp_count; + *max_data += temp_data; + if(ret) + break; + + *iov = (struct iovec*)realloc(*iov, (*iov_count + IOVEC_INITIAL_SIZE) * sizeof(struct iovec)); + temp_count = IOVEC_INITIAL_SIZE; + iovec = &((*iov)[*iov_count]); + } +} diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h 
index 25f014ead0d..c76df3bc373 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -128,7 +128,11 @@ struct opal_datatype_t { Reason being is that Fortran is not at the OPAL layer. */ /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */ - /* size: 352, cachelines: 6, members: 15 */ + struct iovec* iov; + int iov_count; + size_t max_data; + /* size: 372, cachelines: 6, members: 18 */ + /* last cacheline: 28-32 bytes */ }; diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index 5b66e4df595..611057afd9b 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -303,5 +303,11 @@ int32_t opal_datatype_commit( opal_datatype_t * pData ) pLast->first_elem_disp = first_elem_disp; pLast->size = pData->size; } + + /* save a compressed datatype description as a iovec list */ + opal_convertor_t* conv = opal_convertor_create( opal_local_arch, 0 /* unused */); + opal_convertor_prepare_for_send( conv, pData, 1, (void*)0 ); + opal_convertor_to_iov(conv, &pData->iov, &pData->iov_count, &pData->max_data); + OBJ_RELEASE(conv); return OPAL_SUCCESS; } From c3016bc0c0ee38876546ebb2ae8c9d5347937091 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 17 Sep 2015 01:52:23 -0400 Subject: [PATCH 116/190] contiguous vs non-contiguous is working Conflicts: opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu opal/datatype/opal_datatype_unpack.c --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 10 +++- opal/datatype/cuda/opal_datatype_cuda.cu | 2 + .../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 50 +++++++++------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 32 ++++++---- opal/datatype/opal_datatype_optimize.c | 8 +-- opal/datatype/opal_datatype_unpack.c | 4 +- opal/mca/btl/smcuda/btl_smcuda.c | 59 +++++++++++++------ opal/mca/btl/smcuda/btl_smcuda_component.c | 42 ++++++------- 9 files changed, 130 insertions(+), 81 
deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 0befb5078af..e371e7347ec 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -67,6 +67,8 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_btl, size_t size) { int rc; + int local_device = 0; +#if OPAL_CUDA_SUPPORT_41 #if OPAL_CUDA_GDR_SUPPORT /* With some BTLs, switch to RNDV from RGET at large messages */ if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) && @@ -86,6 +88,13 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, base, sendreq->req_send.req_bytes_packed, sendreq->req_rdma))) { + + rc = mca_common_cuda_get_device(&local_device); + if (rc != 0) { + opal_output_verbose(0, "Failed to get the GPU device ID, rc=%d", rc); + return rc; + } + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, -1, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { @@ -108,7 +117,6 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); unsigned char *base; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); - int local_device = 0; base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); convertor->gpu_buffer_ptr = base; sendreq->req_send.req_bytes_packed = convertor->local_size; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 9791e40fef1..29ade337b69 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -293,6 +293,8 @@ void opal_datatype_cuda_init(void) // ALIGNMENT_DOUBLE = sizeof(double); // ALIGNMENT_FLOAT = sizeof(float); // ALIGNMENT_CHAR = sizeof(char); + + 
cudaDeviceSynchronize(); } void opal_datatype_cuda_fini(void) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 938c1b5f8a1..2102edb6a9c 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -26,8 +26,8 @@ #define THREAD_PER_BLOCK 32 #define CUDA_WARP_SIZE 32 #define TASK_PER_THREAD 2 -#define NB_STREAMS 4 -#define CUDA_NB_IOV 4096 +#define NB_STREAMS 8 +#define CUDA_NB_IOV 1024*20 #define CUDA_IOV_LEN 1024*1204 #define CUDA_MAX_NB_BLOCKS 1024 #define CUDA_IOV_MAX_TASK_PER_BLOCK 10 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index e45a0b7df15..250e3e253e3 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -619,19 +619,10 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time; -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); + long total_time, move_time; #endif DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype PACKING using iovec\n"); ); - - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); -#endif description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; @@ -659,17 +650,24 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } transfer_required = 0; } else { - buffer_size = iov[0].iov_len; - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(buffer_size, 0); - } - transfer_required = 1; - free_required = 1; + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + cudaHostGetDevicePointer((void 
**)&destination, (void *)iov[0].iov_base, 0); + } else { + buffer_size = iov[0].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(buffer_size, 0); + } + transfer_required = 1; + free_required = 1; #if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - destination = (unsigned char*)iov[0].iov_base; + destination = (unsigned char*)iov[0].iov_base; #else - destination = pConvertor->gpu_buffer_ptr; + destination = pConvertor->gpu_buffer_ptr; #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ + } } destination_tmp = destination; @@ -682,6 +680,14 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; orig_stack_index = pStack->index; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); DT_CUDA_DEBUG ( opal_cuda_output(2, "PACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); @@ -692,7 +698,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #endif dst_offset = 0; - thread_per_block = CUDA_WARP_SIZE * 4; + thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; nb_blocks_used = 0; @@ -834,8 +840,8 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); + move_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); #endif // float *vtmp = (float *)iov[0].iov_base; // DT_CUDA_DEBUG ( 
opal_cuda_output(0, "packed iov buffer, total packed %d\n", total_packed); ); @@ -852,7 +858,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); total_time = ELAPSED_TIME( start_total, end_total ); - printf( "[Timing]: total packing in %ld microsec\n", total_time ); + printf( "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); #endif if( pConvertor->bConverted == pConvertor->local_size ) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 2f281bdb494..893f280c68f 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -303,7 +303,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time; + long total_time, move_time; #endif #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -327,17 +327,23 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert if (opal_cuda_is_gpu_buffer(iov[0].iov_base)) { source = (unsigned char*)iov[0].iov_base; free_required = 0; - } else { + } else { + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { #if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - source = (unsigned char*)iov[0].iov_base; + source = (unsigned char*)iov[0].iov_base; #else - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + } + source = pConvertor->gpu_buffer_ptr; +#endif /* 
OPAL_DATATYPE_CUDA_DRY_RUN */ + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + free_required = 1; } - source = pConvertor->gpu_buffer_ptr; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); - free_required = 1; } source_tmp = source; @@ -345,8 +351,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert DT_CUDA_DEBUG ( opal_cuda_output(0, "UNpack GPU base %p, unpack from buffer %p, total size %ld\n", pConvertor->pBaseBuf, source, iov[0].iov_len); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); + move_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); #endif @@ -370,7 +376,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif dst_offset = 0; - thread_per_block = CUDA_WARP_SIZE * 4; + thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; while (cuda_iov_count > 0) { @@ -506,7 +512,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); total_time = ELAPSED_TIME( start_total, end_total ); - printf( "[Timing]: total unpacking in %ld microsec\n", total_time ); + printf( "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); #endif if( pConvertor->bConverted == pConvertor->local_size ) { diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index 611057afd9b..e8b8d9794bd 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -305,9 +305,9 @@ int32_t opal_datatype_commit( opal_datatype_t * pData ) } /* save a compressed datatype description as a 
iovec list */ - opal_convertor_t* conv = opal_convertor_create( opal_local_arch, 0 /* unused */); - opal_convertor_prepare_for_send( conv, pData, 1, (void*)0 ); - opal_convertor_to_iov(conv, &pData->iov, &pData->iov_count, &pData->max_data); - OBJ_RELEASE(conv); +// opal_convertor_t* conv = opal_convertor_create( opal_local_arch, 0 /* unused */); +// opal_convertor_prepare_for_send( conv, pData, 1, (void*)0 ); +// opal_convertor_to_iov(conv, &pData->iov, &pData->iov_count, &pData->max_data); +// OBJ_RELEASE(conv); return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index 5fe4003063d..fd269de6764 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -611,8 +611,8 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { if (opal_generic_simple_unpack_function_cuda_vector_p != NULL) { - // return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); - return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); + return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + // return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); } } else { if (opal_generic_simple_unpack_function_cuda_iov_p != NULL) { diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index da403ad937d..ca314d30ebf 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -55,6 +55,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" +#include "opal/datatype/opal_datatype_gpu.h" #endif /* OPAL_CUDA_SUPPORT */ #include "opal/mca/mpool/base/base.h" #include "opal/mca/mpool/sm/mpool_sm.h" @@ -1135,18 +1136,19 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, if 
((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) && (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) { recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; + uint8_t pack_required = remote_handle->reg_data.pack_required; + uint32_t lindex = remote_handle->reg_data.lindex; + uint8_t remote_device = remote_handle->reg_data.gpu_device; + uint8_t local_device = 0; if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; - printf("RECEIVE REGT!!!!!!!!!!!\n"); + printf("RECEIVE REGT UNPACK, size %ld!!!!!!!!!!!\n", size); struct opal_convertor_t *convertor = &(recvreq->req_recv.req_base.req_convertor); size_t pipeline_size = remote_handle->reg_data.pipeline_size; - uint32_t lindex = remote_handle->reg_data.lindex; - uint8_t pack_required = remote_handle->reg_data.pack_required; - uint8_t remote_device = remote_handle->reg_data.gpu_device; - uint8_t local_device = 0; - rc = mca_common_cuda_get_device(&local_device); printf("i receive pipeline %ld, lindex %d, pack_required %d, remote_device %d, local_device %d\n", pipeline_size, lindex, pack_required, remote_device, local_device); + + rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); return rc; @@ -1156,23 +1158,46 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, } else { convertor->gpu_buffer_ptr = remote_memory_address; } - mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, remote_memory_address, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex, remote_device, local_device); if (pack_required) { + mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, remote_memory_address, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex, remote_device, local_device); mca_btl_smcuda_send_cuda_pack_sig(btl, ep, 
lindex, 0, 0); + done = 0; + mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); + } else { + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data; + if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && remote_device != local_device) { + convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(size, 0); + mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, remote_memory_address, size); + iov.iov_base = convertor->gpu_buffer_ptr; + printf("start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, convertor->gpu_buffer_ptr, size); + } else { + iov.iov_base = convertor->gpu_buffer_ptr; + } + iov.iov_len = size; + max_data = size; + opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + done = 1; } - done = 0; - mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); } else { + printf("RECEIVE REGT CONTIGUOUS, size %ld !!!!!!!!!!!\n", size); recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; - rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, - "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag, - &done); - if (OPAL_SUCCESS != rc) { - /* Out of resources can be handled by upper layers. */ - if (OPAL_ERR_OUT_OF_RESOURCE != rc) { - opal_output(0, "Failed to cuMemcpy GPU memory, rc=%d", rc); + if (pack_required) { + mca_btl_smcuda_cuda_dt_unpack_clone(NULL, ep, local_address, local_handle, remote_memory_address, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, 0, lindex, 0, 0); + mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); + done = 0; + } else { + rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, + "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag, + &done); + if (OPAL_SUCCESS != rc) { + /* Out of resources can be handled by upper layers. 
*/ + if (OPAL_ERR_OUT_OF_RESOURCE != rc) { + opal_output(0, "Failed to cuMemcpy GPU memory, rc=%d", rc); + } + return rc; } - return rc; } } } diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 870301b5f9c..da2fc6bf6b3 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -879,25 +879,27 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, struct iovec iov; uint32_t iov_count = 1; size_t max_data; - struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; - if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { - convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(pipeline_size, 0); - mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, pipeline_size); - iov.iov_base = convertor->gpu_buffer_ptr; - printf("start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, pipeline_size); - - } else { - iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; - } - max_data = pipeline_size; - iov.iov_len = pipeline_size; - opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); - if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { - if (convertor->gpu_buffer_ptr != NULL) { - opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); - convertor->gpu_buffer_ptr = NULL; + struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; + if (convertor == NULL) { /* do not unpack */ + mca_common_cuda_memp2pcpy(my_cuda_dt_clone->local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, pipeline_size); + } else { /* unpack */ + if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { + 
convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(pipeline_size, 0); + mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, pipeline_size); + iov.iov_base = convertor->gpu_buffer_ptr; + printf("start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, pipeline_size); + } else { + iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; + } + max_data = pipeline_size; + iov.iov_len = pipeline_size; + opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { + if (convertor->gpu_buffer_ptr != NULL) { + opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + convertor->gpu_buffer_ptr = NULL; + } } - } } // MCA_BTL_SMCUDA_FRAG_RETURN(frag); @@ -923,12 +925,12 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; if (seq == -1) { - mca_btl_smcuda_send_cuda_unpack_sig(btl, my_cuda_dt_clone->endpoint, lindex, 0, -2); + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, 0, -2); if (convertor->gpu_buffer_ptr != NULL) { opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); convertor->gpu_buffer_ptr = NULL; } - mca_btl_smcuda_free_cuda_dt_pack_clone(my_cuda_dt_clone->endpoint, lindex); + mca_btl_smcuda_free_cuda_dt_pack_clone(endpoint, lindex); } else { struct iovec iov; int rc_dt = 0; From 39d548ad6b8b89f01dafe9b44ec9638eb8b688ed Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 17 Sep 2015 16:20:08 -0400 Subject: [PATCH 117/190] Fix pipeline bug --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 2 +- opal/mca/btl/smcuda/btl_smcuda.c | 43 +++++++-------------- opal/mca/btl/smcuda/btl_smcuda.h | 24 
+++--------- opal/mca/btl/smcuda/btl_smcuda_component.c | 44 +++++++++++++--------- 4 files changed, 48 insertions(+), 65 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index e371e7347ec..820c8e82d8e 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -135,7 +135,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, return rc; } mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, lindex, 1, local_device); - mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, NULL, NULL, NULL, NULL, 0, lindex, 0, local_device); + mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, 0, lindex, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index ca314d30ebf..da940fafcf2 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1145,8 +1145,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, printf("RECEIVE REGT UNPACK, size %ld!!!!!!!!!!!\n", size); struct opal_convertor_t *convertor = &(recvreq->req_recv.req_base.req_convertor); - size_t pipeline_size = remote_handle->reg_data.pipeline_size; - printf("i receive pipeline %ld, lindex %d, pack_required %d, remote_device %d, local_device %d\n", pipeline_size, lindex, pack_required, remote_device, local_device); + // size_t pipeline_size = remote_handle->reg_data.pipeline_size; + printf("i receive lindex %d, pack_required %d, remote_device %d, local_device %d\n", lindex, pack_required, remote_device, local_device); rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { @@ -1159,10 +1159,10 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, convertor->gpu_buffer_ptr = remote_memory_address; } if (pack_required) { - 
mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, local_address, local_handle, remote_memory_address, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, pipeline_size, lindex, remote_device, local_device); + mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + 0, lindex, remote_device, local_device); mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); done = 0; - mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); } else { struct iovec iov; uint32_t iov_count = 1; @@ -1184,7 +1184,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, printf("RECEIVE REGT CONTIGUOUS, size %ld !!!!!!!!!!!\n", size); recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; if (pack_required) { - mca_btl_smcuda_cuda_dt_unpack_clone(NULL, ep, local_address, local_handle, remote_memory_address, (mca_btl_base_completion_fn_t)cbfunc, cbcontext, cbdata, 0, lindex, 0, 0); + mca_btl_smcuda_cuda_dt_unpack_clone(NULL, ep, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + 0, lindex, 0, 0); mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); done = 0; } else { @@ -1294,7 +1295,7 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - int lindex, int pipeline_size, int seq) + int lindex, int packed_size, int seq) { mca_btl_smcuda_frag_t* frag; int rc; @@ -1311,7 +1312,7 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; cuda_dt_hdr.seq = seq; cuda_dt_hdr.lindex = lindex; - cuda_dt_hdr.pipeline_size = pipeline_size; + cuda_dt_hdr.packed_size = packed_size; memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); @@ -1321,7 
+1322,7 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - int lindex, int pipeline_size, int seq) + int lindex, int packed_size, int seq) { mca_btl_smcuda_frag_t* frag; int rc; @@ -1337,7 +1338,7 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; cuda_dt_hdr.seq = seq; cuda_dt_hdr.lindex = lindex; - cuda_dt_hdr.pipeline_size = pipeline_size; + cuda_dt_hdr.packed_size = packed_size; memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_PACK); @@ -1413,56 +1414,40 @@ void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *en void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, - struct mca_btl_base_registration_handle_t *local_handle, void *remote_gpu_address, - mca_btl_base_completion_fn_t cbfunc, - void *cbcontext, - void *cbdata, + mca_btl_base_descriptor_t *frag, size_t pipeline_size, int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_dt_pack_clone[lindex].convertor = convertor; // endpoint->smcuda_dt_pack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; endpoint->smcuda_dt_pack_clone[lindex].endpoint = endpoint; - endpoint->smcuda_dt_pack_clone[lindex].local_address = local_address; - endpoint->smcuda_dt_pack_clone[lindex].local_handle = local_handle; endpoint->smcuda_dt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_dt_pack_clone[lindex].cbfunc = cbfunc; - endpoint->smcuda_dt_pack_clone[lindex].cbcontext = cbcontext; - endpoint->smcuda_dt_pack_clone[lindex].cbdata = cbdata; endpoint->smcuda_dt_pack_clone[lindex].pipeline_size = pipeline_size; 
endpoint->smcuda_dt_pack_clone[lindex].lindex = lindex; endpoint->smcuda_dt_pack_clone[lindex].seq = -9; endpoint->smcuda_dt_pack_clone[lindex].remote_device = remote_device; endpoint->smcuda_dt_pack_clone[lindex].local_device = local_device; + endpoint->smcuda_dt_pack_clone[lindex].frag = frag; } void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, - struct mca_btl_base_registration_handle_t *local_handle, void *remote_gpu_address, - mca_btl_base_completion_fn_t cbfunc, - void *cbcontext, - void *cbdata, + mca_btl_base_descriptor_t *frag, size_t pipeline_size, int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_dt_unpack_clone[lindex].convertor = convertor; // endpoint->smcuda_dt_unpack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; endpoint->smcuda_dt_unpack_clone[lindex].endpoint = endpoint; - endpoint->smcuda_dt_unpack_clone[lindex].local_address = local_address; - endpoint->smcuda_dt_unpack_clone[lindex].local_handle = local_handle; endpoint->smcuda_dt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_dt_unpack_clone[lindex].cbfunc = cbfunc; - endpoint->smcuda_dt_unpack_clone[lindex].cbcontext = cbcontext; - endpoint->smcuda_dt_unpack_clone[lindex].cbdata = cbdata; endpoint->smcuda_dt_unpack_clone[lindex].pipeline_size = pipeline_size; endpoint->smcuda_dt_unpack_clone[lindex].lindex = lindex; endpoint->smcuda_dt_unpack_clone[lindex].seq = -9; endpoint->smcuda_dt_unpack_clone[lindex].remote_device = remote_device; endpoint->smcuda_dt_unpack_clone[lindex].local_device = local_device; + endpoint->smcuda_dt_unpack_clone[lindex].frag = frag; } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 478dd184d24..d8ef5ed29f6 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -515,31 +515,27 @@ enum ipcState { typedef struct 
{ int seq; int lindex; - int pipeline_size; + int packed_size; } cuda_dt_hdr_t; /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; struct mca_btl_base_endpoint_t *endpoint; - void *local_address; - struct mca_btl_base_registration_handle_t *local_handle; void *remote_gpu_address; - mca_btl_base_completion_fn_t cbfunc; - void *cbcontext; - void *cbdata; size_t pipeline_size; int lindex; int seq; uint8_t remote_device; uint8_t local_device; + mca_btl_base_descriptor_t *frag; } cuda_dt_clone_t; #define SMCUDA_DT_CLONE_SIZE 20 extern cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; -int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int pipeline_size, int seq); -int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int pipeline_size, int seq); +int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int packed_size, int seq); +int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int packed_size, int seq); int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor); int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq); int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex); @@ -550,22 +546,14 @@ void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endp void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, - struct mca_btl_base_registration_handle_t 
*local_handle, void *remote_gpu_address, - mca_btl_base_completion_fn_t cbfunc, - void *cbcontext, - void *cbdata, + mca_btl_base_descriptor_t *frag, size_t pipeline_size, int lindex, uint8_t remote_device, uint8_t local_device); void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, struct mca_btl_base_endpoint_t *endpoint, - void *local_address, - struct mca_btl_base_registration_handle_t *local_handle, void *remote_gpu_address, - mca_btl_base_completion_fn_t cbfunc, - void *cbcontext, - void *cbdata, + mca_btl_base_descriptor_t *frag, size_t pipeline_size, int lindex, uint8_t remote_device, uint8_t local_device); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index da2fc6bf6b3..e4e1c280857 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -858,7 +858,7 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); int seq = cuda_dt_hdr.seq; int lindex = cuda_dt_hdr.lindex; - int pipeline_size = cuda_dt_hdr.pipeline_size; + size_t packed_size = cuda_dt_hdr.packed_size; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_dt_clone_t *my_cuda_dt_clone; @@ -870,29 +870,38 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); if (seq == -2) { - mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t)my_cuda_dt_clone->cbfunc; - cbfunc(btl, endpoint, my_cuda_dt_clone->local_address, my_cuda_dt_clone->local_handle, my_cuda_dt_clone->cbcontext, my_cuda_dt_clone->cbdata, OPAL_SUCCESS); + mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; + mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t) frag_recv->base.des_cbfunc; + cbfunc 
(btl, endpoint, frag_recv->segment.seg_addr.pval, frag_recv->local_handle, frag_recv->base.des_context, frag_recv->base.des_cbdata, OPAL_SUCCESS); + mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag_recv); mca_btl_smcuda_free_cuda_dt_unpack_clone(endpoint, lindex); } else if (seq == -1) { - mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, lindex, pipeline_size, -1); + mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, lindex, 0, -1); } else { struct iovec iov; uint32_t iov_count = 1; size_t max_data; struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; + if (my_cuda_dt_clone->pipeline_size == 0) { + my_cuda_dt_clone->pipeline_size = packed_size; + } + size_t pipeline_size = my_cuda_dt_clone->pipeline_size; if (convertor == NULL) { /* do not unpack */ - mca_common_cuda_memp2pcpy(my_cuda_dt_clone->local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, pipeline_size); + mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; + unsigned char *local_address = (unsigned char*)frag_recv->segment.seg_addr.pval; + printf("D2D local %p, remote %p, size %ld\n", local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address+seq*pipeline_size, packed_size); + mca_common_cuda_memp2pcpy(local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); } else { /* unpack */ if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(pipeline_size, 0); - mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, pipeline_size); + mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); iov.iov_base = convertor->gpu_buffer_ptr; - printf("start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + 
seq*pipeline_size, convertor->gpu_buffer_ptr, pipeline_size); + printf("start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, packed_size); } else { iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; } - max_data = pipeline_size; - iov.iov_len = pipeline_size; + max_data = packed_size; + iov.iov_len = packed_size; opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { if (convertor->gpu_buffer_ptr != NULL) { @@ -934,25 +943,26 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } else { struct iovec iov; int rc_dt = 0; - size_t pipeline_size = mca_btl_smcuda_component.cuda_dt_pipeline_size; - printf("Pipeline_size %ld\n", pipeline_size); + size_t packed_size = mca_btl_smcuda_component.cuda_dt_pipeline_size; + printf("Pipeline_size %ld\n", packed_size); uint32_t iov_count = 1; iov.iov_base = convertor->gpu_buffer_ptr; - iov.iov_len = pipeline_size; + iov.iov_len = packed_size; size_t max_data = 0; int seq = 0; /* the first pack here is used to get the correct size of pipeline_size */ /* because pack may not use the whole pipeline size */ rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - pipeline_size = max_data; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, pipeline_size, seq); + packed_size = max_data; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, packed_size, seq); while (rc_dt != 1) { - iov.iov_base += pipeline_size; + iov.iov_base += packed_size; seq ++; rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, pipeline_size, seq); + packed_size = max_data; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, packed_size, seq); } - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, pipeline_size, -1); + 
mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, 0, -1); } // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } From 817a4fc8ef956bc81e06c225e0b69cdeb7c6693d Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 18 Sep 2015 00:39:46 -0400 Subject: [PATCH 118/190] now we are able to pack directly to remote buffer if receiver is contiguous --- opal/mca/btl/smcuda/btl_smcuda.c | 54 +++++++++++++------- opal/mca/btl/smcuda/btl_smcuda.h | 18 +++++-- opal/mca/btl/smcuda/btl_smcuda_component.c | 59 ++++++++++++++++++---- 3 files changed, 101 insertions(+), 30 deletions(-) diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index da940fafcf2..7dd56f6e612 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1140,6 +1140,11 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, uint32_t lindex = remote_handle->reg_data.lindex; uint8_t remote_device = remote_handle->reg_data.gpu_device; uint8_t local_device = 0; + rc = mca_common_cuda_get_device(&local_device); + if (rc != 0) { + opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); + return rc; + } if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; printf("RECEIVE REGT UNPACK, size %ld!!!!!!!!!!!\n", size); @@ -1148,11 +1153,6 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, // size_t pipeline_size = remote_handle->reg_data.pipeline_size; printf("i receive lindex %d, pack_required %d, remote_device %d, local_device %d\n", lindex, pack_required, remote_device, local_device); - rc = mca_common_cuda_get_device(&local_device); - if (rc != 0) { - opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); - return rc; - } if (remote_device != local_device && !OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { convertor->gpu_buffer_ptr = NULL; } else { @@ -1161,7 +1161,12 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, if 
(pack_required) { mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, remote_memory_address, (mca_btl_base_descriptor_t *)frag, 0, lindex, remote_device, local_device); - mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); + cuda_dt_hdr_t send_msg; + send_msg.lindex = lindex; + send_msg.packed_size = 0; + send_msg.seq = 0; + send_msg.msg_type = CUDA_PACK_TO_LOCAL; + mca_btl_smcuda_send_cuda_pack_sig(btl, ep, &send_msg); done = 0; } else { struct iovec iov; @@ -1184,9 +1189,28 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, printf("RECEIVE REGT CONTIGUOUS, size %ld !!!!!!!!!!!\n", size); recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; if (pack_required) { + cuda_dt_hdr_t send_msg; + send_msg.lindex = lindex; + send_msg.packed_size = 0; + if (remote_device == local_device && OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { + /* now we are able to let sender pack directly to my memory */ + mca_mpool_common_cuda_reg_t loc_reg; + mca_mpool_common_cuda_reg_t *loc_reg_ptr = &loc_reg; + cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL); + memcpy(send_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); + send_msg.seq = -9; + send_msg.msg_type = CUDA_PACK_TO_REMOTE; + send_msg.remote_address = local_address; + send_msg.remote_base = loc_reg.base.base; + mca_common_wait_stream_synchronize(&loc_reg); + printf("send r_addr %p, r_base %p\n", local_address, loc_reg.base.base); + } else { + send_msg.seq = 0; + send_msg.msg_type = CUDA_PACK_TO_LOCAL; + } mca_btl_smcuda_cuda_dt_unpack_clone(NULL, ep, remote_memory_address, (mca_btl_base_descriptor_t *)frag, 0, lindex, 0, 0); - mca_btl_smcuda_send_cuda_pack_sig(btl, ep, lindex, 0, 0); + mca_btl_smcuda_send_cuda_pack_sig(btl, ep, &send_msg); done = 0; } else { rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, @@ -1295,7 +1319,7 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b int 
mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - int lindex, int packed_size, int seq) + cuda_dt_hdr_t *send_msg) { mca_btl_smcuda_frag_t* frag; int rc; @@ -1310,19 +1334,16 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, /* Fill in fragment fields. */ frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; - cuda_dt_hdr.seq = seq; - cuda_dt_hdr.lindex = lindex; - cuda_dt_hdr.packed_size = packed_size; - memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); + memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_dt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); - printf("######## rank %d, send seq %d, endpoint %p\n", endpoint->my_smp_rank, seq, endpoint); + printf("######## rank %d, send seq %d, endpoint %p\n", endpoint->my_smp_rank, send_msg->seq, endpoint); return rc; } int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - int lindex, int packed_size, int seq) + cuda_dt_hdr_t *send_msg) { mca_btl_smcuda_frag_t* frag; int rc; @@ -1336,10 +1357,7 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, /* Fill in fragment fields. 
*/ frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; - cuda_dt_hdr.seq = seq; - cuda_dt_hdr.lindex = lindex; - cuda_dt_hdr.packed_size = packed_size; - memcpy(frag->segment.seg_addr.pval, &cuda_dt_hdr, sizeof(cuda_dt_hdr_t)); + memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_dt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_PACK); return rc; diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index d8ef5ed29f6..7616e16c720 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -41,7 +41,7 @@ #include "opal/mca/btl/btl.h" #include "opal/mca/common/sm/common_sm.h" -#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 0 +#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 1 BEGIN_C_DECLS @@ -514,10 +514,22 @@ enum ipcState { /* cuda datatype control message */ typedef struct { int seq; + int msg_type; int lindex; int packed_size; + void *remote_address; + void *remote_base; + uint64_t mem_handle[8]; } cuda_dt_hdr_t; +#define CUDA_UNPACK_FROM_REMOTE 0 +#define CUDA_PACK_COMPLETE 1 +#define CUDA_PACK_COMPLETE_ACK 2 +#define CUDA_PACK_CLEANUP 3 +#define CUDA_PACK_TO_LOCAL 4 +#define CUDA_PACK_TO_REMOTE 5 +#define CUDA_UNPACK_NO 6 + /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; @@ -534,8 +546,8 @@ typedef struct { #define SMCUDA_DT_CLONE_SIZE 20 extern cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; -int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int packed_size, int seq); -int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, int lindex, int packed_size, int seq); +int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_dt_hdr_t *send_msg); +int 
mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_dt_hdr_t *send_msg); int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor); int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq); int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index e4e1c280857..0243822d1d9 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -859,6 +859,7 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, int seq = cuda_dt_hdr.seq; int lindex = cuda_dt_hdr.lindex; size_t packed_size = cuda_dt_hdr.packed_size; + int msg_type = cuda_dt_hdr.msg_type; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_dt_clone_t *my_cuda_dt_clone; @@ -869,15 +870,20 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); - if (seq == -2) { + if (msg_type == CUDA_PACK_CLEANUP) { mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t) frag_recv->base.des_cbfunc; cbfunc (btl, endpoint, frag_recv->segment.seg_addr.pval, frag_recv->local_handle, frag_recv->base.des_context, frag_recv->base.des_cbdata, OPAL_SUCCESS); mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag_recv); mca_btl_smcuda_free_cuda_dt_unpack_clone(endpoint, lindex); - } else if (seq == -1) { - mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, lindex, 0, -1); - } else { + } else if (msg_type == CUDA_PACK_COMPLETE) { + cuda_dt_hdr_t send_msg; + send_msg.lindex = lindex; + 
send_msg.packed_size = 0; + send_msg.seq = -1; + send_msg.msg_type = CUDA_PACK_COMPLETE_ACK; + mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); + } else if (msg_type == CUDA_UNPACK_FROM_REMOTE){ struct iovec iov; uint32_t iov_count = 1; size_t max_data; @@ -924,8 +930,10 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); int seq = cuda_dt_hdr.seq; int lindex = cuda_dt_hdr.lindex; + int msg_type = cuda_dt_hdr.msg_type; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_dt_clone_t *my_cuda_dt_clone; + cuda_dt_hdr_t send_msg; /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; @@ -933,14 +941,35 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; - if (seq == -1) { - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, 0, -2); + send_msg.lindex = lindex; + if (msg_type == CUDA_PACK_COMPLETE_ACK) { + send_msg.packed_size = 0; + send_msg.seq = -2; + send_msg.msg_type = CUDA_PACK_CLEANUP; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); if (convertor->gpu_buffer_ptr != NULL) { opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); convertor->gpu_buffer_ptr = NULL; } mca_btl_smcuda_free_cuda_dt_pack_clone(endpoint, lindex); } else { + mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; + if (msg_type == CUDA_PACK_TO_REMOTE) { /* receiver is contiguous, and ask me to pack directly to his gpu memory */ + opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + mca_mpool_common_cuda_reg_t rget_reg; + rget_reg_ptr= &rget_reg; + memset(&rget_reg, 0, sizeof(rget_reg)); + memcpy(rget_reg.data.memHandle, cuda_dt_hdr.mem_handle, 
sizeof(cuda_dt_hdr.mem_handle)); + cuda_openmemhandle(NULL, 0, (mca_mpool_base_registration_t *)&rget_reg, NULL); + mca_common_wait_stream_synchronize(&rget_reg); + size_t offset = (size_t) ((intptr_t) cuda_dt_hdr.remote_address - (intptr_t) cuda_dt_hdr.remote_base); + unsigned char *remote_memory_address = (unsigned char *)rget_reg_ptr->base.alloc_base + offset; + convertor->gpu_buffer_ptr = remote_memory_address; + printf("remote_memory_address $$$$$$ %p, r_addr %p, r_base %p\n", remote_memory_address, cuda_dt_hdr.remote_address, cuda_dt_hdr.remote_base); + send_msg.msg_type = CUDA_UNPACK_NO; + } else { + send_msg.msg_type = CUDA_UNPACK_FROM_REMOTE; + } struct iovec iov; int rc_dt = 0; size_t packed_size = mca_btl_smcuda_component.cuda_dt_pipeline_size; @@ -954,15 +983,27 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, /* because pack may not use the whole pipeline size */ rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); packed_size = max_data; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, packed_size, seq); + send_msg.packed_size = packed_size; + send_msg.seq = seq; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); while (rc_dt != 1) { iov.iov_base += packed_size; seq ++; rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); packed_size = max_data; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, packed_size, seq); + send_msg.packed_size = packed_size; + send_msg.seq = seq; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + } + + send_msg.packed_size = 0; + send_msg.seq = -1; + send_msg.msg_type = CUDA_PACK_COMPLETE; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + + if (rget_reg_ptr != NULL) { /* close memhandle */ + cuda_closememhandle(NULL, (mca_mpool_base_registration_t *)rget_reg_ptr); } - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, lindex, 0, -1); } // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } From 
ea582c546eabc02cb0a4d58ad8dc16bca6dc22f7 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 29 Sep 2015 17:12:40 -0400 Subject: [PATCH 119/190] add ddt_benchmark --- test/datatype/ddt_benchmark.c | 1184 +++++++++++++++++++++++++++++++++ 1 file changed, 1184 insertions(+) create mode 100644 test/datatype/ddt_benchmark.c diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c new file mode 100644 index 00000000000..860e9b87c94 --- /dev/null +++ b/test/datatype/ddt_benchmark.c @@ -0,0 +1,1184 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2006 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sun Microsystems Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "ddt_lib.h" +#include "opal/runtime/opal.h" +#include "opal/datatype/opal_convertor.h" +#include +#include +#ifdef HAVE_SYS_TIME_H +#include +#endif +#include +#include + +#define DDT_TEST_CUDA +#define CUDA_MEMCPY_2D_D2H + + +#include +#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/runtime/opal_params.h" +#define CONVERTOR_CUDA 0x00400000 + + +/* Compile with: +mpicc -DHAVE_CONFIG_H -I. -I../../include -I../../../ompi-trunk/include -I../.. 
-I../../include -I../../../ompi-trunk/opal -I../../../ompi-trunk/orte -I../../../ompi-trunk/ompi -g ddt_test.c -o ddt_test +*/ + +#define TIMER_DATA_TYPE struct timeval +#define GET_TIME(TV) gettimeofday( &(TV), NULL ) +#define ELAPSED_TIME(TSTART, TEND) (((TEND).tv_sec - (TSTART).tv_sec) * 1000000 + ((TEND).tv_usec - (TSTART).tv_usec)) + +#define DUMP_DATA_AFTER_COMMIT 0x00000001 +#define CHECK_PACK_UNPACK 0x00000002 + +uint32_t remote_arch = 0xffffffff; + +static int test_upper( unsigned int length ) +{ + double *mat1, *mat2, *inbuf; + ompi_datatype_t *pdt; + opal_convertor_t * pConv; + char *ptr; + int rc; + unsigned int i, j, iov_count, split_chunk, total_length; + size_t max_data; + struct iovec a; + TIMER_DATA_TYPE start, end; + long total_time; + + printf( "test upper matrix\n" ); + pdt = upper_matrix( length ); + /*dt_dump( pdt );*/ + + mat1 = malloc( length * length * sizeof(double) ); + init_random_upper_matrix( length, mat1 ); + mat2 = calloc( length * length, sizeof(double) ); + + total_length = length * (length + 1) * ( sizeof(double) / 2); + inbuf = (double*)malloc( total_length ); + ptr = (char*)inbuf; + /* copy upper matrix in the array simulating the input buffer */ + for( i = 0; i < length; i++ ) { + uint32_t pos = i * length + i; + for( j = i; j < length; j++, pos++ ) { + *inbuf = mat1[pos]; + inbuf++; + } + } + inbuf = (double*)ptr; + pConv = opal_convertor_create( remote_arch, 0 ); + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( pConv, &(pdt->super), 1, mat2 ) ) { + printf( "Cannot attach the datatype to a convertor\n" ); + return OMPI_ERROR; + } + + cudaDeviceSynchronize(); + + GET_TIME( start ); + split_chunk = (length + 1) * sizeof(double); + /* split_chunk = (total_length + 1) * sizeof(double); */ + for( i = total_length; i > 0; ) { + if( i <= split_chunk ) { /* equal test just to be able to set a breakpoint */ + split_chunk = i; + } + a.iov_base = ptr; + a.iov_len = split_chunk; + iov_count = 1; + max_data = split_chunk; + 
opal_convertor_unpack( pConv, &a, &iov_count, &max_data ); + ptr += max_data; + i -= max_data; + if( mat2[0] != inbuf[0] ) assert(0); + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "complete unpacking in %ld microsec\n", total_time ); + free( inbuf ); + rc = check_diag_matrix( length, mat1, mat2 ); + free( mat1 ); + free( mat2 ); + + /* test the automatic destruction pf the data */ + ompi_datatype_destroy( &pdt ); assert( pdt == NULL ); + + OBJ_RELEASE( pConv ); + return rc; +} + +/** + * Computing the correct buffer length for moving a multiple of a datatype + * is not an easy task. Define a function to centralize the complexity in a + * single location. + */ +static size_t compute_buffer_length(ompi_datatype_t* pdt, int count) +{ + MPI_Aint extent, lb, true_extent, true_lb; + size_t length; + + ompi_datatype_get_extent(pdt, &lb, &extent); + ompi_datatype_get_true_extent(pdt, &true_lb, &true_extent); (void)true_lb; + length = true_lb + true_extent + (count - 1) * extent; + + return length; +} + +/** + * Conversion function. They deal with data-types in 3 ways, always making local copies. + * In order to allow performance testings, there are 3 functions: + * - one copying directly from one memory location to another one using the + * data-type copy function. + * - one which use a 2 convertors created with the same data-type + * - and one using 2 convertors created from different data-types. 
+ * + */ +static int local_copy_ddt_count( ompi_datatype_t* pdt, int count ) +{ + void *pdst, *psrc; + TIMER_DATA_TYPE start, end; + long total_time; + size_t length; + + length = compute_buffer_length(pdt, count); + + pdst = malloc(length); + psrc = malloc(length); + + for( size_t i = 0; i < length; i++ ) + ((char*)psrc)[i] = i % 128 + 32; + memset(pdst, 0, length); + + cache_trash(); /* make sure the cache is useless */ + + GET_TIME( start ); + if( OMPI_SUCCESS != ompi_datatype_copy_content_same_ddt( pdt, count, pdst, psrc ) ) { + printf( "Unable to copy the datatype in the function local_copy_ddt_count." + " Is the datatype committed ?\n" ); + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "direct local copy in %ld microsec\n", total_time ); + free(pdst); + free(psrc); + + return OMPI_SUCCESS; +} + +static void fill_vectors(double* vp, int itera, int contig, int gap) +{ + int i, j; + for (i = 0; i < itera-1; i++ ){ + for (j = i*gap; j < (i+1)*gap; j++) { + if (j >= i*gap && j < i*gap+contig) { + vp[j] = 1.1; + } else { + vp[j] = -1.0; + } + } + } + for (i = (itera-1)*gap; i < (itera-1)*gap+contig; i++) { + vp[i] = 1.1; + } + + // printf("vector generated:\n"); + // for (i = 0; i < (itera-1)*gap+contig; i++) { + // printf("%1.f ", vp[i]); + // } + // printf("\n"); +} + +static void verify_vectors(double *vp, int itera, int contig, int gap) +{ + int i, j; + int error = 0; + for (i = 0; i < itera-1; i++) { + for (j = i*gap; j < (i+1)*gap; j++) { + if (j >= i*gap && j < i*gap+contig) { + if (vp[j] != 1.1) { + error ++; + } + } + } + } + for (i = (itera-1)*gap; i < (itera-1)*gap+contig; i++) { + if (vp[i] != 1.1) { + error ++; + } + } + // printf("vector received:\n"); + // for (i = 0; i < (itera-1)*gap+contig; i++) { + // printf("%1.f ", vp[i]); + // } + if (error != 0) { + printf("%d error is found\n", error); + } else { + printf("no error is found\n"); + } +} + +static int +vector_ddt( ompi_datatype_t* send_type, int send_count, + 
ompi_datatype_t* recv_type, int recv_count, + int chunk, int itera, int contig, int gap ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *psrc_host = NULL, *pdst_host = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0, push_time = 0, pop_time = 0, pack_time = 0; + size_t slength, rlength; + + rlength = compute_buffer_length(recv_type, recv_count); + slength = compute_buffer_length(send_type, send_count); + + cudaSetDevice(0); + + cudaError_t error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + printf("cudamalloc pdst %p\n", pdst); + + // error = cudaHostAlloc((void **)&ptemp, chunk, cudaHostAllocMapped); + error = cudaMallocHost((void **)&ptemp, chunk); + //ptemp = malloc(chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + + error = cudaMallocHost((void **)&psrc_host, slength); + error = cudaMallocHost((void **)&pdst_host, rlength); + // psrc_host = malloc(slength); + // pdst_host = malloc(rlength); + printf("cudamallochost phost \n"); + + memset(psrc_host, 0, slength); + memset(pdst_host, 0, rlength); + if (itera > 0) { + fill_vectors((double *)psrc_host, itera, contig, gap); + } + cudaMemcpy(psrc, psrc_host, slength, cudaMemcpyHostToDevice); + + + send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= 
CONVERTOR_CUDA; + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } +#else + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc_host ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } +#endif + recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } +#else + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst_host ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } +#endif + + cache_trash(); /* make sure the cache is useless */ + cudaDeviceSynchronize(); + + GET_TIME( start ); +#if !defined (DDT_TEST_CUDA) + GET_TIME( unpack_start ); + cudaMemcpy(psrc_host, psrc, slength, cudaMemcpyDeviceToHost); + GET_TIME( unpack_end ); + push_time = ELAPSED_TIME( unpack_start, unpack_end ); +#endif + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor_2datatypes\n", + (done1 ? "finish" : "not finish"), + (done2 ? 
"finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + // done1 = 1; + } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } +#if !defined (DDT_TEST_CUDA) + GET_TIME( unpack_start ); + cudaMemcpy(pdst, pdst_host, rlength, cudaMemcpyHostToDevice); + GET_TIME( unpack_end ); + pop_time = ELAPSED_TIME( unpack_start, unpack_end ); +#endif + + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + pack_time = total_time - unpack_time - push_time - pop_time; + printf( "copying different data-types using convertors in %ld microsec, p&up in %ld \n", total_time, pack_time+unpack_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec], push in %ld microsec, pop in %ld microsec\n", unpack_time, + pack_time, push_time, pop_time); + + memset(pdst_host, 0, slength); + cudaMemcpy(pdst_host, pdst, rlength, cudaMemcpyDeviceToHost); + if (itera > 0) { + verify_vectors((double *)pdst_host, itera, contig, gap); + } + + clean_and_return: + if( send_convertor != NULL ) { + OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); + } + if( recv_convertor != NULL ) { + OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); + } + + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != psrc_host ) cudaFreeHost( psrc_host ); + if( NULL != pdst_host ) cudaFreeHost( pdst_host ); + + return OMPI_SUCCESS; +} + +static int +vector_ddt_2d( ompi_datatype_t* send_type, int send_count, + ompi_datatype_t* recv_type, int recv_count, + int chunk, int itera, int contig, int gap ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, 
*psrc_host = NULL, *pdst_host = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0, push_time = 0, pop_time = 0, pack_time = 0; + size_t slength, rlength; + + rlength = compute_buffer_length(recv_type, recv_count); + slength = compute_buffer_length(send_type, send_count); + + cudaSetDevice(2); + + cudaError_t error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + + error = cudaMallocHost((void **)&psrc_host, slength); + error = cudaMallocHost((void **)&pdst_host, rlength); + printf("cudamallochost phost \n"); + + memset(psrc_host, 0, slength); + memset(pdst_host, 0, rlength); + if (itera > 0) { + fill_vectors((double *)psrc_host, itera, contig, gap); + } + cudaMemcpy(psrc, psrc_host, slength, cudaMemcpyHostToDevice); + + + GET_TIME( start ); + //cudaMemcpy2D(pdst, contig*sizeof(double), psrc, gap*sizeof(double), contig*sizeof(double), itera, cudaMemcpyDeviceToDevice); + cudaMemcpy2D(psrc_host, contig*sizeof(double), psrc, gap*sizeof(double), contig*sizeof(double), itera, cudaMemcpyDeviceToHost); + GET_TIME( end ); + pop_time = ELAPSED_TIME( start, end ); + + GET_TIME( start ); + cudaMemcpy2D(pdst, gap*sizeof(double), psrc_host, 
contig*sizeof(double), contig*sizeof(double), itera, cudaMemcpyHostToDevice); + GET_TIME( end ); + push_time = ELAPSED_TIME( start, end ); + + printf( "MEMCPY2D D2H %ld microseconds, H2D %ld microseconds, size %ld\n", pop_time, push_time, contig*sizeof(double)*itera); + + memset(pdst_host, 0, slength); + cudaMemcpy(pdst_host, pdst, rlength, cudaMemcpyDeviceToHost); + if (itera > 0) { + verify_vectors((double *)pdst_host, itera, contig, gap); + } + /* D2D D2H */ + if (itera > 0) { + fill_vectors((double *)psrc_host, itera, contig, gap); + } + cudaMemcpy(psrc, psrc_host, slength, cudaMemcpyHostToDevice); + + + GET_TIME( start ); + cudaMemcpy2D(pdst, contig*sizeof(double), psrc, gap*sizeof(double), contig*sizeof(double), itera, cudaMemcpyDeviceToDevice); + GET_TIME( end ); + pack_time = ELAPSED_TIME( start, end ); + + GET_TIME( start ); + cudaMemcpy(psrc_host, pdst, contig*sizeof(double)*itera, cudaMemcpyDeviceToHost); + GET_TIME( end ); + pop_time = ELAPSED_TIME( start, end ); + + GET_TIME( start ); + cudaMemcpy(psrc, psrc_host, contig*sizeof(double)*itera, cudaMemcpyHostToDevice); + GET_TIME( end ); + push_time = ELAPSED_TIME( start, end ); + + GET_TIME( start ); + cudaMemcpy2D(pdst, gap*sizeof(double), psrc, contig*sizeof(double), contig*sizeof(double), itera, cudaMemcpyDeviceToDevice); + GET_TIME( end ); + unpack_time = ELAPSED_TIME( start, end ); + + printf( "MEMCPY2D D2H %ld microseconds, H2D %ld microseconds, pack in %ld, unpack in %ld, size %lu \n", pop_time, push_time, pack_time, unpack_time, contig*sizeof(double)*itera); + + memset(pdst_host, 0, slength); + cudaMemcpy(pdst_host, pdst, rlength, cudaMemcpyDeviceToHost); + if (itera > 0) { + verify_vectors((double *)pdst_host, itera, contig, gap); + } + + + clean_and_return: + if( send_convertor != NULL ) { + OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); + } + if( recv_convertor != NULL ) { + OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); + } + + if( NULL != pdst ) cudaFree( 
pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != psrc_host ) cudaFreeHost( psrc_host ); + if( NULL != pdst_host ) cudaFreeHost( pdst_host ); + + return OMPI_SUCCESS; +} + + +static int +local_copy_with_convertor_2datatypes_struct( ompi_datatype_t* send_type, int send_count, + ompi_datatype_t* recv_type, int recv_count, + int chunk, int count) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0; + size_t slength, rlength; + + rlength = compute_buffer_length(recv_type, recv_count); + slength = compute_buffer_length(send_type, send_count); + +#if defined (DDT_TEST_CUDA) + cudaSetDevice(0); +#endif + +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, slength); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc( rlength ); + psrc = malloc( slength ); + ptemp = malloc( chunk ); + + /* initialize 
the buffers to prevent valgrind from complaining */ + for( size_t i = 0; i < slength; i++ ) + ((char*)psrc)[i] = i % 128 + 32; + memset(pdst, 0, rlength); +#endif + +#if defined (DDT_TEST_CUDA) + + cudaMemcpy(psrc, phost, slength, cudaMemcpyHostToDevice); +#else + +#endif + + send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + cache_trash(); /* make sure the cache is useless */ + + GET_TIME( start ); + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor_2datatypes\n", + (done1 ? "finish" : "not finish"), + (done2 ? 
"finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "copying different data-types using convertors in %ld microsec\n", total_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, + total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, slength); + cudaMemcpy(phost, pdst, rlength, cudaMemcpyDeviceToHost); + +#else + +#endif + clean_and_return: + if( send_convertor != NULL ) { + OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); + } + if( recv_convertor != NULL ) { + OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); + } +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else + if( NULL != pdst ) free( pdst ); + if( NULL != psrc ) free( psrc ); + if( NULL != ptemp ) free( ptemp ); +#endif + return OMPI_SUCCESS; +} + + +static void fill_upper_matrix(void *matt, int msize) +{ + int i, j, start, end; + int *blklens, *displs; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + blklens = (int *)malloc(sizeof(int)*msize); + displs = (int *)malloc(sizeof(int)*msize); + for (i = 0; i < msize; i++) { + blklens[i] = msize - i; + displs[i] = i*msize + i; + } + for (i = 0; i < msize; i++) { + start = displs[i]; + 
end = start + blklens[i]; + for (j = start; j < end; j++) { +#if defined (TEST_CHAR) + mat[j] = 'a'; +#else + mat[j] = 0.0 + i; +#endif + } + } + free(blklens); + free(displs); + + // printf("matrix generate\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } +} + +static void verify_mat_result(void *matt, int msize) +{ + int *blklens, *displs; + int i, j, error = 0; + int start, end; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + blklens = (int *)malloc(sizeof(int)*msize); + displs = (int *)malloc(sizeof(int)*msize); + for (i = 0; i < msize; i++) { + blklens[i] = msize - i; + displs[i] = i*msize + i; + } + for (i = 0; i < msize; i++) { + start = displs[i]; + end = start + blklens[i]; + for (j = start; j < end; j++) { +#if defined (TEST_CHAR) + if (mat[j] != 'a') { +#else + if (mat[j] != (0.0+i)) { +#endif + error ++; + } + } + } + free(blklens); + free(displs); + + // printf("matrix received\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } + + if (error != 0) { + printf("error is found %d\n", error); + } else { + printf("no error is found\n"); + } +} + +static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk, int msize ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data, dt_length; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0; + + dt_length = compute_buffer_length(pdt, count); + printf("length %lu\n", dt_length); + +#if defined (DDT_TEST_CUDA) + 
cudaSetDevice(0); +#endif + +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, dt_length); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, dt_length); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, dt_length); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc(dt_length); + psrc = malloc(dt_length); + ptemp = malloc(chunk); + + for( int i = 0; i < length; ((char*)psrc)[i] = i % 128 + 32, i++ ); + memset( pdst, 0, length ); +#endif + +#if defined (DDT_TEST_CUDA) + if (msize > 0) { + fill_upper_matrix(phost, msize); + } + cudaMemcpy(psrc, phost, dt_length, cudaMemcpyHostToDevice); +#else + if (msize > 0) { + fill_upper_matrix(psrc, msize); + } +#endif + + send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(pdt->super), count, psrc ) ) { + printf( "Unable to create the send convertor. 
Is the datatype committed ?\n" ); + goto clean_and_return; + } + + recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(pdt->super), count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + cache_trash(); /* make sure the cache is useless */ + cudaDeviceSynchronize(); + + GET_TIME( start ); + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor\n", + (done1 ? "finish" : "not finish"), + (done2 ? "finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "copying same data-type using convertors in %ld microsec\n", total_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, + total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, dt_length); + cudaMemcpy(phost, pdst, dt_length, cudaMemcpyDeviceToHost); + if (msize > 0) { + verify_mat_result(phost, msize); + } +#else + if (msize > 0) { + verify_mat_result(pdst, msize); + } +#endif +clean_and_return: + if( NULL != send_convertor ) OBJ_RELEASE( send_convertor ); + if( NULL != recv_convertor ) OBJ_RELEASE( recv_convertor ); + +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) 
cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else + if( NULL != pdst ) free( pdst ); + if( NULL != psrc ) free( psrc ); + if( NULL != ptemp ) free( ptemp ); +#endif + return OMPI_SUCCESS; +} + +/** + * Main function. Call several tests and print-out the results. It try to stress the convertor + * using difficult data-type constructions as well as strange segment sizes for the conversion. + * Usually, it is able to detect most of the data-type and convertor problems. Any modifications + * on the data-type engine should first pass all the tests from this file, before going into other + * tests. + */ +int main( int argc, char* argv[] ) +{ + ompi_datatype_t *pdt, *pdt1, *pdt2, *pdt3; + int rc, length = 500, i; + +#if defined (DDT_TEST_CUDA) + opal_cuda_support = 1; +#endif + opal_init_util(&argc, &argv); +#if defined (DDT_TEST_CUDA) + // mca_common_cuda_stage_one_init(); +#endif + ompi_datatype_init(); + + /** + * By default simulate homogeneous architectures. 
+ */ + remote_arch = opal_local_arch; +/* printf( "\n\n#\n * TEST INVERSED VECTOR\n #\n\n" ); + pdt = create_inversed_vector( &ompi_mpi_int.dt, 10 ); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 100); + local_copy_with_convertor(pdt, 100, 956); + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( "\n\n#\n * TEST STRANGE DATATYPE\n #\n\n" ); + pdt = create_strange_dt(); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 1); + local_copy_with_convertor(pdt, 1, 956); + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); +*/ + printf("\n TEST STRUCT \n"); + pdt = create_struct_type(5); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 1; i <= 1; i++) { + // local_copy_with_convertor_2datatypes_struct(pdt, 1, pdt, 1, 1024*1024*100, 5); + } + } + + printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); + int mat_size = 500; + for (mat_size = 500; mat_size <= 6000; mat_size +=500) { + pdt = upper_matrix(mat_size); + printf("----matrix size %d-----\n", mat_size); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 1; i <= 1; i++) { + local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + + int packed_size = 256; + int blk_len = 4; + int blk_count; + + while (packed_size <= 8388608) { + blk_count = packed_size / blk_len / sizeof(double); + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type packed size %d, blk %d, count %d\n", packed_size, blk_len, blk_count ); + pdt = create_vector_type( MPI_DOUBLE, blk_count, blk_len, 128+blk_len ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*30, blk_count, blk_len, 128+blk_len ); + } + } + packed_size *= 2; + } + + packed_size = 256; + blk_len = 16; + while (packed_size <= 8388608) { + blk_count = packed_size / blk_len / sizeof(double); + printf( ">>--------------------------------------------<<\n" ); + 
printf( "Vector data-type packed size %d, blk %d, count %d\n", packed_size, blk_len, blk_count ); + pdt = create_vector_type( MPI_DOUBLE, blk_count, blk_len, 128+blk_len ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*30, blk_count, blk_len, 128+blk_len ); + } + } + packed_size *= 2; + } + + packed_size = 1024; + blk_len = 64; + while (packed_size <= 8388608) { + blk_count = packed_size / blk_len / sizeof(double); + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type packed size %d, blk %d, count %d\n", packed_size, blk_len, blk_count ); + pdt = create_vector_type( MPI_DOUBLE, blk_count, blk_len, 128+blk_len ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*30, blk_count, blk_len, 128+blk_len ); + // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*30, blk_count, blk_len, 128+blk_len ); + } + } + packed_size *= 2; + } + + + for (blk_len = 4; blk_len <= 64; blk_len += 2) { + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (1024 times %d double stride 512)\n", blk_len ); + pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len+128); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*20 , 1000, blk_len, blk_len+128); + // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + + + for (blk_len = 4; blk_len <= 64; blk_len += 2) { + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (60000 times %d double stride 512)\n", blk_len ); + pdt = create_vector_type( MPI_DOUBLE, 8000, blk_len, blk_len+128); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , 8000, blk_len, blk_len+128); + // vector_ddt_2d( pdt, 1, pdt, 1, 
1024*1024*100 , 8192, blk_len, blk_len+128); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + + /* + for (blk_len = 4; blk_len <= 32; blk_len += 1) { + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (4000 times %d double stride 512)\n", blk_len ); + pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len+64); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , 1000, blk_len, blk_len+64); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + */ + + printf( "Vector data-type (4000 times 256 double stride 384)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 4000, 256, 384 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*200, 4000, 256, 384 ); + } + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( "Vector data-type (4000 times 128 double stride 256)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 4000, 128, 256 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, 
pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*5 ); + } + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( "Vector data-type (2000 times 3 double stride 4)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 2000, 3, 4 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*4 ); + } + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + /* + printf( ">>--------------------------------------------<<\n" ); + pdt = test_struct_char_double(); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 4500); + local_copy_with_convertor( pdt, 4500, 12 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 12 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( ">>--------------------------------------------<<\n" ); + pdt = test_create_twice_two_doubles(); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 4500); + local_copy_with_convertor( pdt, 4500, 12 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 12 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( ">>--------------------------------------------<<\n" ); + pdt = 
test_create_blacs_type(); + if( outputFlags & CHECK_PACK_UNPACK ) { + ompi_datatype_dump( pdt ); + local_copy_ddt_count(pdt, 2); + local_copy_ddt_count(pdt, 4500); + local_copy_with_convertor( pdt, 4500, 956 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 956 ); + local_copy_with_convertor( pdt, 4500, 16*1024 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 16*1024 ); + local_copy_with_convertor( pdt, 4500, 64*1024 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 64*1024 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( ">>--------------------------------------------<<\n" ); + pdt1 = test_create_blacs_type1( &ompi_mpi_int.dt ); + pdt2 = test_create_blacs_type2( &ompi_mpi_int.dt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_with_convertor_2datatypes( pdt1, 1, pdt2, 1, 100 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt1 ); assert( pdt1 == NULL ); + OBJ_RELEASE( pdt2 ); assert( pdt2 == NULL ); +*/ + /* clean-ups all data allocations */ + ompi_datatype_finalize(); + + return OMPI_SUCCESS; +} From 0a0df96b627f12fdf97fe4453c09bf6e5243f928 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 1 Oct 2015 23:00:08 -0400 Subject: [PATCH 120/190] modify for matrix transpose --- opal/datatype/cuda/opal_datatype_cuda.cuh | 6 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 227 ++++++++++++++-- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 236 ++++++++++++++++- opal/datatype/opal_datatype_pack.c | 3 +- opal/datatype/opal_datatype_unpack.c | 3 +- opal/mca/btl/smcuda/btl_smcuda_component.c | 2 - test/datatype/ddt_benchmark.c | 244 +++++++++++++++++- 7 files changed, 689 insertions(+), 32 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index b770f136969..436eaa9aec3 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ 
b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -85,6 +85,12 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, unsigned char** SOURCE, unsigned char** DESTINATION, size_t* SPACE ); + +void unpack_predefined_data_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); void opal_cuda_sync_device(void); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 250e3e253e3..1268280fab6 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -266,11 +266,13 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ /* should not go into here */ - pStack--; - pConvertor->stack_pos--; - pos_desc --; - pElem = &(description[pos_desc]); - count_desc = count_desc_tmp; + pack_predefined_data_cuda( pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local ); + if( 0 == count_desc ) { /* completed */ + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ @@ -327,8 +329,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert pStack->disp + local_disp); pos_desc++; update_loop_description: /* update the current state */ - // conv_ptr = pConvertor->pBaseBuf + pStack->disp; - count_desc_tmp = count_desc; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } @@ -349,6 +350,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert printf( "[Timing]: DtoH memcpy in %ld microsec, 
transfer required %d\n", total_time, transfer_required ); #endif } + cudaDeviceSynchronize(); *max_data = total_packed; pConvertor->bConverted += total_packed; /* update the already converted bytes */ *out_size = iov_count; @@ -370,6 +372,205 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert return 0; } +// int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, +// struct iovec* iov, +// uint32_t* out_size, +// size_t* max_data ) +// { +// dt_stack_t* pStack; /* pointer to the position on the stack */ +// uint32_t pos_desc; /* actual position in the description of the derived datatype */ +// uint32_t count_desc; /* the number of items already done in the actual pos_desc */ +// size_t total_packed = 0; /* total amount packed this time */ +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// const opal_datatype_t *pData = pConvertor->pDesc; +// unsigned char *conv_ptr, *iov_ptr; +// size_t iov_len_local; +// uint32_t iov_count; +// uint8_t transfer_required; +// uint8_t free_required; +// uint32_t count_desc_tmp; +// +// #if defined(OPAL_DATATYPE_CUDA_TIMING) +// TIMER_DATA_TYPE start, end, start_total, end_total; +// long total_time; +// #endif +// +// DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", +// (void*)pConvertor, (void*)pConvertor->pBaseBuf, +// iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ); +// +// description = pConvertor->use_desc->desc; +// +// /* For the first step we have to add both displacement to the source. After in the +// * main while loop we will set back the conv_ptr to the correct value. 
This is +// * due to the fact that the convertor can stop in the middle of a data with a count +// */ +// pStack = pConvertor->pStack + pConvertor->stack_pos; +// pos_desc = pStack->index; +// conv_ptr = pConvertor->pBaseBuf + pStack->disp; +// count_desc = (uint32_t)pStack->count; +// pStack--; +// pConvertor->stack_pos--; +// pElem = &(description[pos_desc]); +// +// DT_CUDA_DEBUG( opal_cuda_output( 1, "pack start pos_desc %d count_desc %d disp %ld\n" +// "stack_pos %d pos_desc %d count_desc %d disp %ld\n", +// pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), +// pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); +// +// +// for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { +// if ((iov[iov_count].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { +// if (iov[iov_count].iov_len == 0) { +// iov_len_local = DT_CUDA_BUFFER_SIZE; +// } else { +// iov_len_local = iov[iov_count].iov_len; +// } +// +// if (iov[iov_count].iov_base == NULL) { +// iov[iov_count].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); +// iov_ptr = (unsigned char *)iov[iov_count].iov_base; +// pConvertor->gpu_buffer_ptr = iov_ptr; +// free_required = 1; +// } else { +// iov_ptr = (unsigned char *)iov[iov_count].iov_base; +// free_required = 0; +// } +// transfer_required = 0; +// } else { +// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { +// pConvertor->gpu_buffer_ptr = NULL; +// transfer_required = 0; +// free_required = 0; +// iov_ptr = (unsigned char*)iov[iov_count].iov_base; +// iov_len_local = iov[iov_count].iov_len; +// } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE){ +// iov_len_local = iov[iov_count].iov_len; +// if (pConvertor->gpu_buffer_ptr == NULL) { +// pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); +// } +// transfer_required = 0; +// free_required = 1; +// iov_ptr = (unsigned char*)iov[iov_count].iov_base; 
+// } else { +// iov_len_local = iov[iov_count].iov_len; +// if (pConvertor->gpu_buffer_ptr == NULL) { +// pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); +// } +// transfer_required = 1; +// free_required = 1; +// iov_ptr = pConvertor->gpu_buffer_ptr; +// } +// } +// while( 1 ) { +// while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { +// /* now here we have a basic datatype */ +// /* should not go into here */ +// pStack--; +// pConvertor->stack_pos--; +// pos_desc --; +// pElem = &(description[pos_desc]); +// count_desc = count_desc_tmp; +// goto complete_loop; +// } +// if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ +// DT_CUDA_DEBUG( opal_cuda_output( 2, "pack end_loop count %d stack_pos %d" +// " pos_desc %d disp %ld space %lu\n", +// (int)pStack->count, pConvertor->stack_pos, +// pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); +// if( --(pStack->count) == 0 ) { /* end of loop */ +// if( 0 == pConvertor->stack_pos ) { +// /* we lie about the size of the next element in order to +// * make sure we exit the main loop. 
+// */ +// *out_size = iov_count; +// goto complete_loop; /* completed */ +// } +// pConvertor->stack_pos--; +// pStack--; +// pos_desc++; +// } else { +// pos_desc = pStack->index + 1; +// if( pStack->index == -1 ) { +// pStack->disp += (pData->ub - pData->lb); +// } else { +// assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); +// pStack->disp += description[pStack->index].loop.extent; +// } +// } +// conv_ptr = pConvertor->pBaseBuf + pStack->disp; +// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +// DT_CUDA_DEBUG( opal_cuda_output( 2, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", +// (int)pStack->count, pConvertor->stack_pos, pos_desc, +// count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); +// } +// if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { +// OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; +// if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { +// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { +// pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); +// } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { +// pack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); +// } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE) { +// pack_contiguous_loop_cuda_pipeline(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local, pConvertor->gpu_buffer_ptr); +// } else { +// pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); +// } +// if( 0 == count_desc ) { /* completed */ +// pos_desc += pElem->loop.items + 1; +// goto update_loop_description; +// } +// /* Save the stack with the correct last_count value. 
*/ +// } +// local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; +// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, +// pStack->disp + local_disp); +// pos_desc++; +// update_loop_description: /* update the current state */ +// // conv_ptr = pConvertor->pBaseBuf + pStack->disp; +// count_desc_tmp = count_desc; +// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +// continue; +// } +// } +// complete_loop: +// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ +// total_packed += iov[iov_count].iov_len; +// // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); +// #if defined(OPAL_DATATYPE_CUDA_TIMING) +// GET_TIME(start); +// #endif +// if (transfer_required) { +// cudaMemcpy(iov[iov_count].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); +// } +// #if defined(OPAL_DATATYPE_CUDA_TIMING) +// GET_TIME( end ); +// total_time = ELAPSED_TIME( start, end ); +// printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); +// #endif +// } +// *max_data = total_packed; +// pConvertor->bConverted += total_packed; /* update the already converted bytes */ +// *out_size = iov_count; +// if( pConvertor->bConverted == pConvertor->local_size ) { +// pConvertor->flags |= CONVERTOR_COMPLETED; +// DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); +// if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { +// printf("free\n"); +// opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); +// pConvertor->gpu_buffer_ptr = NULL; +// } +// return 1; +// } +// /* Save the global position for the next round */ +// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, +// conv_ptr - pConvertor->pBaseBuf ); +// DT_CUDA_DEBUG( opal_cuda_output( 2, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", +// 
pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); +// return 0; +// } + void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, @@ -892,10 +1093,6 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, if( 0 == _copy_count ) return; /* nothing to do */ } -#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) - _source = pBaseBuf_GPU + _elem->disp; - _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; -#endif if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE) { thread_per_block = CUDA_WARP_SIZE; @@ -904,13 +1101,13 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, } else if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE * 3) { thread_per_block = CUDA_WARP_SIZE * 3; } else { - thread_per_block = CUDA_WARP_SIZE * 4; + thread_per_block = CUDA_WARP_SIZE * 5; } tasks_per_block = thread_per_block * TASK_PER_THREAD; nb_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - DBGPRINT("num_blocks %d, thread %d\n", nb_blocks, tasks_per_block); - DBGPRINT( "GPU pack 1. memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); + // DBGPRINT("num_blocks %d, thread %d\n", nb_blocks, tasks_per_block); + // DBGPRINT( "GPU pack 1. 
memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); pack_contiguous_loop_cuda_kernel_global<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); cuda_streams->current_stream_id ++; @@ -924,7 +1121,5 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_count; #endif - cuda_desc_h->iov[0].iov_base = (unsigned char*)cuda_desc_h->iov[0].iov_base + _copy_blength; - // cudaDeviceSynchronize(); } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 893f280c68f..8f8af75274e 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -188,11 +188,17 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ /* should not go to here */ - pStack--; - pConvertor->stack_pos--; - pos_desc --; - pElem = &(description[pos_desc]); - count_desc = count_desc_tmp; + unpack_predefined_data_cuda( pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local ); + if( 0 == count_desc ) { /* completed */ + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); + if( 0 != iov_len_local ) { + assert(0); + } goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ @@ -246,8 +252,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv pStack->disp + local_disp); pos_desc++; update_loop_description: /* update the current state */ - // conv_ptr = pConvertor->pBaseBuf + pStack->disp; - count_desc_tmp 
= count_desc; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } @@ -257,6 +262,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv total_unpacked += iov[iov_count].iov_len; } complete_conversion: + cudaDeviceSynchronize(); *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; @@ -277,6 +283,173 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv return 0; } +// int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, +// struct iovec* iov, uint32_t* out_size, +// size_t* max_data ) +// { +// dt_stack_t* pStack; /* pointer to the position on the stack */ +// uint32_t pos_desc; /* actual position in the description of the derived datatype */ +// uint32_t count_desc; /* the number of items already done in the actual pos_desc */ +// size_t total_unpacked = 0; /* total size unpacked this time */ +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// const opal_datatype_t *pData = pConvertor->pDesc; +// unsigned char *conv_ptr, *iov_ptr; +// size_t iov_len_local; +// uint32_t iov_count; +// uint8_t free_required; +// uint32_t count_desc_tmp; +// +// #if defined(OPAL_DATATYPE_CUDA_TIMING) +// TIMER_DATA_TYPE start, end; +// long total_time; +// #endif +// +// DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", +// (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ) +// +// description = pConvertor->use_desc->desc; +// +// /* For the first step we have to add both displacement to the source. After in the +// * main while loop we will set back the source_base to the correct value. 
This is +// * due to the fact that the convertor can stop in the middle of a data with a count +// */ +// pStack = pConvertor->pStack + pConvertor->stack_pos; +// pos_desc = pStack->index; +// conv_ptr = pConvertor->pBaseBuf + pStack->disp; +// count_desc = (uint32_t)pStack->count; +// pStack--; +// pConvertor->stack_pos--; +// pElem = &(description[pos_desc]); +// +// DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack start pos_desc %d count_desc %d disp %ld\n" +// "stack_pos %d pos_desc %d count_desc %d disp %ld\n", +// pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), +// pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); +// +// for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { +// #if defined(OPAL_DATATYPE_CUDA_TIMING) +// GET_TIME(start); +// #endif +// if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { +// iov_ptr = (unsigned char*)iov[iov_count].iov_base; +// free_required = 0; +// } else { +// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { +// iov_ptr = (unsigned char*)iov[iov_count].iov_base; +// pConvertor->gpu_buffer_ptr = NULL; +// free_required = 0; +// } else { +// if (pConvertor->gpu_buffer_ptr == NULL) { +// pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); +// } +// iov_ptr = pConvertor->gpu_buffer_ptr; +// cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); +// free_required = 1; +// } +// } +// #if defined(OPAL_DATATYPE_CUDA_TIMING) +// GET_TIME( end ); +// total_time = ELAPSED_TIME( start, end ); +// printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); +// #endif +// iov_len_local = iov[iov_count].iov_len; +// if( 0 != pConvertor->partial_length ) { +// /* not support yet */ +// } +// while( 1 ) { +// while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { +// /* now here we have a basic datatype */ +// /* should not 
go to here */ +// pStack--; +// pConvertor->stack_pos--; +// pos_desc --; +// pElem = &(description[pos_desc]); +// count_desc = count_desc_tmp; +// goto complete_loop; +// } +// if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ +// DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", +// (int)pStack->count, pConvertor->stack_pos, pos_desc, +// (long)pStack->disp, (unsigned long)iov_len_local ); ); +// if( --(pStack->count) == 0 ) { /* end of loop */ +// if( 0 == pConvertor->stack_pos ) { +// /* Do the same thing as when the loop is completed */ +// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ +// total_unpacked += iov[iov_count].iov_len; +// iov_count++; /* go to the next */ +// goto complete_conversion; +// } +// pConvertor->stack_pos--; +// pStack--; +// pos_desc++; +// } else { +// pos_desc = pStack->index + 1; +// if( pStack->index == -1 ) { +// pStack->disp += (pData->ub - pData->lb); +// } else { +// assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); +// pStack->disp += description[pStack->index].loop.extent; +// } +// } +// conv_ptr = pConvertor->pBaseBuf + pStack->disp; +// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +// DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", +// (int)pStack->count, pConvertor->stack_pos, pos_desc, +// (long)pStack->disp, (unsigned long)iov_len_local ); ); +// } +// if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { +// OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; +// if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { +// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { +// unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); +// } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { +// unpack_contiguous_loop_cuda_zerocopy(pElem, 
&count_desc, &iov_ptr, &conv_ptr, &iov_len_local); +// } else { +// unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); +// } +// if( 0 == count_desc ) { /* completed */ +// pos_desc += pElem->loop.items + 1; +// goto update_loop_description; +// } +// /* Save the stack with the correct last_count value. */ +// } +// local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; +// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, +// pStack->disp + local_disp); +// pos_desc++; +// update_loop_description: /* update the current state */ +// // conv_ptr = pConvertor->pBaseBuf + pStack->disp; +// count_desc_tmp = count_desc; +// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); +// continue; +// } +// } +// complete_loop: +// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ +// total_unpacked += iov[iov_count].iov_len; +// } +// complete_conversion: +// *max_data = total_unpacked; +// pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ +// *out_size = iov_count; +// if( pConvertor->bConverted == pConvertor->remote_size ) { +// pConvertor->flags |= CONVERTOR_COMPLETED; +// DT_CUDA_DEBUG( opal_cuda_output( 0, "Total unpacked %lu\n", pConvertor->bConverted); ); +// if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { +// opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); +// pConvertor->gpu_buffer_ptr = NULL; +// } +// return 1; +// } +// /* Save the global position for the next round */ +// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, +// conv_ptr - pConvertor->pBaseBuf ); +// DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", +// pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); +// return 0; +// } + int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* 
pConvertor, struct iovec* iov, uint32_t* out_size, @@ -663,3 +836,52 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, printf( "[Timing]: vector unpacking in %ld microsec\n", total_time ); #endif } + +void unpack_predefined_data_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + uint32_t _copy_count = *(COUNT); + size_t _copy_blength; + ddt_elem_desc_t* _elem = &((ELEM)->elem); + unsigned char* _source = (*SOURCE); + uint32_t nb_blocks, tasks_per_block, thread_per_block; + unsigned char* _destination = *(DESTINATION) + _elem->disp;; + + _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; + if( (_copy_count * _copy_blength) > *(SPACE) ) { + _copy_count = (uint32_t)(*(SPACE) / _copy_blength); + if( 0 == _copy_count ) return; /* nothing to do */ + } + + + if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE) { + thread_per_block = CUDA_WARP_SIZE; + } else if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE * 2) { + thread_per_block = CUDA_WARP_SIZE * 2; + } else if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE * 3) { + thread_per_block = CUDA_WARP_SIZE * 3; + } else { + thread_per_block = CUDA_WARP_SIZE * 5; + } + tasks_per_block = thread_per_block * TASK_PER_THREAD; + nb_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + + // DBGPRINT("num_blocks %d, thread %d\n", nb_blocks, tasks_per_block); + // DBGPRINT( "GPU pack 1. 
memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); + + unpack_contiguous_loop_cuda_kernel_global<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + _copy_blength *= _copy_count; + *(DESTINATION) = _destination + _elem->extent*_copy_count - _elem->disp; + *(SOURCE) += _copy_blength; + *(SPACE) -= _copy_blength; + *(COUNT) -= _copy_count; +#endif + +} diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 54a28b93c5b..372d5a1291a 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -421,7 +421,8 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, pStack = pConvertor->pStack + pConvertor->stack_pos; pos_desc = pStack->index; pElem = &(description[pos_desc]); - + + // return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { if (opal_generic_simple_pack_function_cuda_vector_p != NULL) { // return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index fd269de6764..d9d69683174 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -608,7 +608,8 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, pStack = pConvertor->pStack + pConvertor->stack_pos; pos_desc = pStack->index; pElem = &(description[pos_desc]); - + +// return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { if 
(opal_generic_simple_unpack_function_cuda_vector_p != NULL) { return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 0243822d1d9..3ffde4608fc 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -1284,10 +1284,8 @@ int mca_btl_smcuda_component_progress(void) } if( btl_ownership ) { if (frag->hdr->tag == MCA_BTL_TAG_SMCUDA_DATATYPE_PACK) { - printf("&&&&&&&&&&&&&&&&&&got PACK TAG\n"); } if (frag->hdr->tag == MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK) { - printf("&&&&&&&&&&&&&&&&&&got UNPACK TAG\n"); } MCA_BTL_SMCUDA_FRAG_RETURN(frag); } diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 860e9b87c94..228238002e4 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -925,6 +925,232 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk return OMPI_SUCCESS; } +static void fill_matrix(void *matt, int msize) +{ + int i, j; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + for (i = 0; i < msize*msize; i++) { + mat[i] = i; + } + + // printf("matrix generate\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } +} + +static void verify_mat(void *matt, int msize) +{ + int i, j, error = 0; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + for (i = 0; i < msize*msize; i++) { +#if defined (TEST_CHAR) + if (mat[i] != 'a') { +#else + if (mat[i] != (0.0+i)) { +#endif + error ++; + } + } + + // 
printf("matrix received\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } + + if (error != 0) { + printf("error is found %d\n", error); + } else { + printf("no error is found\n"); + } +} + +static int local_copy_with_convertor_mat( ompi_datatype_t* pdt, int count, int chunk, int msize ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data, dt_length; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0; + + dt_length = compute_buffer_length(pdt, count); + printf("length %lu\n", dt_length); + +#if defined (DDT_TEST_CUDA) + cudaSetDevice(0); +#endif + +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, dt_length); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, dt_length); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, dt_length); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc(dt_length); + psrc = malloc(dt_length); + ptemp = malloc(chunk); + + for( int i = 0; i < length; ((char*)psrc)[i] = i % 128 + 32, i++ ); 
+ memset( pdst, 0, length ); +#endif + +#if defined (DDT_TEST_CUDA) + if (msize > 0) { + fill_matrix(phost, msize); + } + cudaMemcpy(psrc, phost, dt_length, cudaMemcpyHostToDevice); +#else + if (msize > 0) { + // fill_upper_matrix(psrc, msize); + } +#endif + + send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(pdt->super), count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(pdt->super), count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + cache_trash(); /* make sure the cache is useless */ + cudaDeviceSynchronize(); + + GET_TIME( start ); + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor\n", + (done1 ? "finish" : "not finish"), + (done2 ? 
"finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } + + // int i,j = 0; + // printf("buffer received\n"); + // double *mat_temp = (double*)ptemp; + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat_temp[i*msize+j]); + // } + // printf("\n"); + // } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "copying same data-type using convertors in %ld microsec\n", total_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, + total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, dt_length); + cudaMemcpy(phost, pdst, dt_length, cudaMemcpyDeviceToHost); + if (msize > 0) { + verify_mat(phost, msize); + } +#else + if (msize > 0) { +// verify_mat_result(pdst, msize); + } +#endif +clean_and_return: + if( NULL != send_convertor ) OBJ_RELEASE( send_convertor ); + if( NULL != recv_convertor ) OBJ_RELEASE( recv_convertor ); + +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else + if( NULL != pdst ) free( pdst ); + if( NULL != psrc ) free( psrc ); + if( NULL != ptemp ) free( ptemp ); +#endif + return OMPI_SUCCESS; +} + /** * Main function. Call several tests and print-out the results. It try to stress the convertor * using difficult data-type constructions as well as strange segment sizes for the conversion. 
@@ -980,12 +1206,20 @@ int main( int argc, char* argv[] ) printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { - local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); + // local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); } + ompi_datatype_t *column, *matt; + mat_size = 500; + ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); + ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); + ompi_datatype_commit( &matt ); + // local_copy_with_convertor_mat(matt, 1, 1200000, mat_size); + + int packed_size = 256; int blk_len = 4; int blk_count; @@ -1035,13 +1269,13 @@ int main( int argc, char* argv[] ) } - for (blk_len = 4; blk_len <= 64; blk_len += 2) { + for (blk_len = 64; blk_len <= 64; blk_len += 2) { printf( ">>--------------------------------------------<<\n" ); printf( "Vector data-type (1024 times %d double stride 512)\n", blk_len ); pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len+128); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 4; i++) { - // vector_ddt( pdt, 1, pdt, 1, 1024*1024*20 , 1000, blk_len, blk_len+128); + vector_ddt( pdt, 1, pdt, 1, 1024*10 , 1000, blk_len, blk_len+128); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } @@ -1099,7 +1333,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, 4000, 128, 256 ); // ompi_datatype_dump( pdt ); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 0; i < 10; i++) { + for (i = 0; i < 1; i++) { // local_copy_ddt_count(pdt, 1); // local_copy_with_convertor( pdt, 1, 12 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); @@ -1108,7 +1342,7 @@ int main( int argc, char* argv[] ) // local_copy_with_convertor( pdt, 1, 6000 ); // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); // local_copy_with_convertor( pdt, 1, 36000 ); - // 
local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*5 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*5 ); } } printf( ">>--------------------------------------------<<\n" ); From 58371c8901df111af70980f813fa3b1f62b051d2 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 2 Oct 2015 16:32:16 -0400 Subject: [PATCH 121/190] enable vector --- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 398 +++++++++--------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 334 +++++++-------- test/datatype/ddt_benchmark.c | 8 +- 3 files changed, 370 insertions(+), 370 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 1268280fab6..c3b327c733e 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -171,7 +171,7 @@ int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, } -int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, +int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -372,204 +372,204 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert return 0; } -// int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, -// struct iovec* iov, -// uint32_t* out_size, -// size_t* max_data ) -// { -// dt_stack_t* pStack; /* pointer to the position on the stack */ -// uint32_t pos_desc; /* actual position in the description of the derived datatype */ -// uint32_t count_desc; /* the number of items already done in the actual pos_desc */ -// size_t total_packed = 0; /* total amount packed this time */ -// dt_elem_desc_t* description; -// dt_elem_desc_t* pElem; -// const opal_datatype_t *pData = pConvertor->pDesc; -// unsigned char *conv_ptr, *iov_ptr; -// size_t iov_len_local; -// uint32_t 
iov_count; -// uint8_t transfer_required; -// uint8_t free_required; -// uint32_t count_desc_tmp; -// -// #if defined(OPAL_DATATYPE_CUDA_TIMING) -// TIMER_DATA_TYPE start, end, start_total, end_total; -// long total_time; -// #endif -// -// DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", -// (void*)pConvertor, (void*)pConvertor->pBaseBuf, -// iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ); -// -// description = pConvertor->use_desc->desc; -// -// /* For the first step we have to add both displacement to the source. After in the -// * main while loop we will set back the conv_ptr to the correct value. This is -// * due to the fact that the convertor can stop in the middle of a data with a count -// */ -// pStack = pConvertor->pStack + pConvertor->stack_pos; -// pos_desc = pStack->index; -// conv_ptr = pConvertor->pBaseBuf + pStack->disp; -// count_desc = (uint32_t)pStack->count; -// pStack--; -// pConvertor->stack_pos--; -// pElem = &(description[pos_desc]); -// -// DT_CUDA_DEBUG( opal_cuda_output( 1, "pack start pos_desc %d count_desc %d disp %ld\n" -// "stack_pos %d pos_desc %d count_desc %d disp %ld\n", -// pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), -// pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); -// -// -// for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { -// if ((iov[iov_count].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { -// if (iov[iov_count].iov_len == 0) { -// iov_len_local = DT_CUDA_BUFFER_SIZE; -// } else { -// iov_len_local = iov[iov_count].iov_len; -// } -// -// if (iov[iov_count].iov_base == NULL) { -// iov[iov_count].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); -// iov_ptr = (unsigned char *)iov[iov_count].iov_base; -// pConvertor->gpu_buffer_ptr = iov_ptr; -// free_required = 1; -// } else { -// iov_ptr = (unsigned char 
*)iov[iov_count].iov_base; -// free_required = 0; -// } -// transfer_required = 0; -// } else { -// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { -// pConvertor->gpu_buffer_ptr = NULL; -// transfer_required = 0; -// free_required = 0; -// iov_ptr = (unsigned char*)iov[iov_count].iov_base; -// iov_len_local = iov[iov_count].iov_len; -// } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE){ -// iov_len_local = iov[iov_count].iov_len; -// if (pConvertor->gpu_buffer_ptr == NULL) { -// pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); -// } -// transfer_required = 0; -// free_required = 1; -// iov_ptr = (unsigned char*)iov[iov_count].iov_base; -// } else { -// iov_len_local = iov[iov_count].iov_len; -// if (pConvertor->gpu_buffer_ptr == NULL) { -// pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); -// } -// transfer_required = 1; -// free_required = 1; -// iov_ptr = pConvertor->gpu_buffer_ptr; -// } -// } -// while( 1 ) { -// while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { -// /* now here we have a basic datatype */ -// /* should not go into here */ -// pStack--; -// pConvertor->stack_pos--; -// pos_desc --; -// pElem = &(description[pos_desc]); -// count_desc = count_desc_tmp; -// goto complete_loop; -// } -// if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ -// DT_CUDA_DEBUG( opal_cuda_output( 2, "pack end_loop count %d stack_pos %d" -// " pos_desc %d disp %ld space %lu\n", -// (int)pStack->count, pConvertor->stack_pos, -// pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); -// if( --(pStack->count) == 0 ) { /* end of loop */ -// if( 0 == pConvertor->stack_pos ) { -// /* we lie about the size of the next element in order to -// * make sure we exit the main loop. 
-// */ -// *out_size = iov_count; -// goto complete_loop; /* completed */ -// } -// pConvertor->stack_pos--; -// pStack--; -// pos_desc++; -// } else { -// pos_desc = pStack->index + 1; -// if( pStack->index == -1 ) { -// pStack->disp += (pData->ub - pData->lb); -// } else { -// assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); -// pStack->disp += description[pStack->index].loop.extent; -// } -// } -// conv_ptr = pConvertor->pBaseBuf + pStack->disp; -// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -// DT_CUDA_DEBUG( opal_cuda_output( 2, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", -// (int)pStack->count, pConvertor->stack_pos, pos_desc, -// count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); -// } -// if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { -// OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; -// if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { -// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { -// pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); -// } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { -// pack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); -// } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE) { -// pack_contiguous_loop_cuda_pipeline(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local, pConvertor->gpu_buffer_ptr); -// } else { -// pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); -// } -// if( 0 == count_desc ) { /* completed */ -// pos_desc += pElem->loop.items + 1; -// goto update_loop_description; -// } -// /* Save the stack with the correct last_count value. 
*/ -// } -// local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; -// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, -// pStack->disp + local_disp); -// pos_desc++; -// update_loop_description: /* update the current state */ -// // conv_ptr = pConvertor->pBaseBuf + pStack->disp; -// count_desc_tmp = count_desc; -// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -// continue; -// } -// } -// complete_loop: -// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ -// total_packed += iov[iov_count].iov_len; -// // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); -// #if defined(OPAL_DATATYPE_CUDA_TIMING) -// GET_TIME(start); -// #endif -// if (transfer_required) { -// cudaMemcpy(iov[iov_count].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); -// } -// #if defined(OPAL_DATATYPE_CUDA_TIMING) -// GET_TIME( end ); -// total_time = ELAPSED_TIME( start, end ); -// printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); -// #endif -// } -// *max_data = total_packed; -// pConvertor->bConverted += total_packed; /* update the already converted bytes */ -// *out_size = iov_count; -// if( pConvertor->bConverted == pConvertor->local_size ) { -// pConvertor->flags |= CONVERTOR_COMPLETED; -// DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); -// if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { -// printf("free\n"); -// opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); -// pConvertor->gpu_buffer_ptr = NULL; -// } -// return 1; -// } -// /* Save the global position for the next round */ -// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, -// conv_ptr - pConvertor->pBaseBuf ); -// DT_CUDA_DEBUG( opal_cuda_output( 2, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", -// 
pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); -// return 0; -// } +int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_packed = 0; /* total amount packed this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + const opal_datatype_t *pData = pConvertor->pDesc; + unsigned char *conv_ptr, *iov_ptr; + size_t iov_len_local; + uint32_t iov_count; + uint8_t transfer_required; + uint8_t free_required; + uint32_t count_desc_tmp; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", + (void*)pConvertor, (void*)pConvertor->pBaseBuf, + iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ); + + description = pConvertor->use_desc->desc; + + /* For the first step we have to add both displacement to the source. After in the + * main while loop we will set back the conv_ptr to the correct value. 
This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + pStack--; + pConvertor->stack_pos--; + pElem = &(description[pos_desc]); + + DT_CUDA_DEBUG( opal_cuda_output( 1, "pack start pos_desc %d count_desc %d disp %ld\n" + "stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + + + for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { + if ((iov[iov_count].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + if (iov[iov_count].iov_len == 0) { + iov_len_local = DT_CUDA_BUFFER_SIZE; + } else { + iov_len_local = iov[iov_count].iov_len; + } + + if (iov[iov_count].iov_base == NULL) { + iov[iov_count].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + iov_ptr = (unsigned char *)iov[iov_count].iov_base; + pConvertor->gpu_buffer_ptr = iov_ptr; + free_required = 1; + } else { + iov_ptr = (unsigned char *)iov[iov_count].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + iov_len_local = iov[iov_count].iov_len; + } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE){ + iov_len_local = iov[iov_count].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + } + transfer_required = 0; + free_required = 1; + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + } else { + iov_len_local = iov[iov_count].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + 
pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + } + transfer_required = 1; + free_required = 1; + iov_ptr = pConvertor->gpu_buffer_ptr; + } + } + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + /* should not go into here */ + pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + DT_CUDA_DEBUG( opal_cuda_output( 2, "pack end_loop count %d stack_pos %d" + " pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, + pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + if( --(pStack->count) == 0 ) { /* end of loop */ + if( 0 == pConvertor->stack_pos ) { + /* we lie about the size of the next element in order to + * make sure we exit the main loop. + */ + *out_size = iov_count; + goto complete_loop; /* completed */ + } + pConvertor->stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + pStack->disp += (pData->ub - pData->lb); + } else { + assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { + 
pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE) { + pack_contiguous_loop_cuda_pipeline(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local, pConvertor->gpu_buffer_ptr); + } else { + pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. */ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + // conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc_tmp = count_desc; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + } + complete_loop: + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_packed += iov[iov_count].iov_len; + // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[iov_count].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); +#endif + } + *max_data = total_packed; + pConvertor->bConverted += total_packed; /* update the already converted bytes */ + *out_size = iov_count; + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + 
DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); + if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { + printf("free\n"); + opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + /* Save the global position for the next round */ + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, + conv_ptr - pConvertor->pBaseBuf ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + return 0; +} void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 8f8af75274e..5374e2d9fc8 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -110,7 +110,7 @@ int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ } -int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, +int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { @@ -283,172 +283,172 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv return 0; } -// int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, -// struct iovec* iov, uint32_t* out_size, -// size_t* max_data ) -// { -// dt_stack_t* pStack; /* pointer to the position on the stack */ -// uint32_t pos_desc; /* actual position in the description of the derived datatype */ -// uint32_t count_desc; /* the number of items already done in the actual pos_desc */ -// size_t total_unpacked = 0; /* total size unpacked this 
time */ -// dt_elem_desc_t* description; -// dt_elem_desc_t* pElem; -// const opal_datatype_t *pData = pConvertor->pDesc; -// unsigned char *conv_ptr, *iov_ptr; -// size_t iov_len_local; -// uint32_t iov_count; -// uint8_t free_required; -// uint32_t count_desc_tmp; -// -// #if defined(OPAL_DATATYPE_CUDA_TIMING) -// TIMER_DATA_TYPE start, end; -// long total_time; -// #endif -// -// DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", -// (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ) -// -// description = pConvertor->use_desc->desc; -// -// /* For the first step we have to add both displacement to the source. After in the -// * main while loop we will set back the source_base to the correct value. This is -// * due to the fact that the convertor can stop in the middle of a data with a count -// */ -// pStack = pConvertor->pStack + pConvertor->stack_pos; -// pos_desc = pStack->index; -// conv_ptr = pConvertor->pBaseBuf + pStack->disp; -// count_desc = (uint32_t)pStack->count; -// pStack--; -// pConvertor->stack_pos--; -// pElem = &(description[pos_desc]); -// -// DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack start pos_desc %d count_desc %d disp %ld\n" -// "stack_pos %d pos_desc %d count_desc %d disp %ld\n", -// pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), -// pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); -// -// for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { -// #if defined(OPAL_DATATYPE_CUDA_TIMING) -// GET_TIME(start); -// #endif -// if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { -// iov_ptr = (unsigned char*)iov[iov_count].iov_base; -// free_required = 0; -// } else { -// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { -// iov_ptr = (unsigned char*)iov[iov_count].iov_base; -// pConvertor->gpu_buffer_ptr = NULL; -// free_required = 0; -// } else { -// if 
(pConvertor->gpu_buffer_ptr == NULL) { -// pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); -// } -// iov_ptr = pConvertor->gpu_buffer_ptr; -// cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); -// free_required = 1; -// } -// } -// #if defined(OPAL_DATATYPE_CUDA_TIMING) -// GET_TIME( end ); -// total_time = ELAPSED_TIME( start, end ); -// printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); -// #endif -// iov_len_local = iov[iov_count].iov_len; -// if( 0 != pConvertor->partial_length ) { -// /* not support yet */ -// } -// while( 1 ) { -// while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { -// /* now here we have a basic datatype */ -// /* should not go to here */ -// pStack--; -// pConvertor->stack_pos--; -// pos_desc --; -// pElem = &(description[pos_desc]); -// count_desc = count_desc_tmp; -// goto complete_loop; -// } -// if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ -// DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", -// (int)pStack->count, pConvertor->stack_pos, pos_desc, -// (long)pStack->disp, (unsigned long)iov_len_local ); ); -// if( --(pStack->count) == 0 ) { /* end of loop */ -// if( 0 == pConvertor->stack_pos ) { -// /* Do the same thing as when the loop is completed */ -// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ -// total_unpacked += iov[iov_count].iov_len; -// iov_count++; /* go to the next */ -// goto complete_conversion; -// } -// pConvertor->stack_pos--; -// pStack--; -// pos_desc++; -// } else { -// pos_desc = pStack->index + 1; -// if( pStack->index == -1 ) { -// pStack->disp += (pData->ub - pData->lb); -// } else { -// assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); -// pStack->disp += description[pStack->index].loop.extent; 
-// } -// } -// conv_ptr = pConvertor->pBaseBuf + pStack->disp; -// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -// DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", -// (int)pStack->count, pConvertor->stack_pos, pos_desc, -// (long)pStack->disp, (unsigned long)iov_len_local ); ); -// } -// if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { -// OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; -// if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { -// if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { -// unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); -// } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { -// unpack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); -// } else { -// unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); -// } -// if( 0 == count_desc ) { /* completed */ -// pos_desc += pElem->loop.items + 1; -// goto update_loop_description; -// } -// /* Save the stack with the correct last_count value. 
*/ -// } -// local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; -// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, -// pStack->disp + local_disp); -// pos_desc++; -// update_loop_description: /* update the current state */ -// // conv_ptr = pConvertor->pBaseBuf + pStack->disp; -// count_desc_tmp = count_desc; -// UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -// continue; -// } -// } -// complete_loop: -// iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ -// total_unpacked += iov[iov_count].iov_len; -// } -// complete_conversion: -// *max_data = total_unpacked; -// pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ -// *out_size = iov_count; -// if( pConvertor->bConverted == pConvertor->remote_size ) { -// pConvertor->flags |= CONVERTOR_COMPLETED; -// DT_CUDA_DEBUG( opal_cuda_output( 0, "Total unpacked %lu\n", pConvertor->bConverted); ); -// if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { -// opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); -// pConvertor->gpu_buffer_ptr = NULL; -// } -// return 1; -// } -// /* Save the global position for the next round */ -// PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, -// conv_ptr - pConvertor->pBaseBuf ); -// DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", -// pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); -// return 0; -// } +int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_unpacked = 0; /* 
total size unpacked this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + const opal_datatype_t *pData = pConvertor->pDesc; + unsigned char *conv_ptr, *iov_ptr; + size_t iov_len_local; + uint32_t iov_count; + uint8_t free_required; + uint32_t count_desc_tmp; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", + (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ) + + description = pConvertor->use_desc->desc; + + /* For the first step we have to add both displacement to the source. After in the + * main while loop we will set back the source_base to the correct value. This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + pStack--; + pConvertor->stack_pos--; + pElem = &(description[pos_desc]); + + DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack start pos_desc %d count_desc %d disp %ld\n" + "stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); + + for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + free_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned 
char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); + } + iov_ptr = pConvertor->gpu_buffer_ptr; + cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); +#endif + iov_len_local = iov[iov_count].iov_len; + if( 0 != pConvertor->partial_length ) { + /* not support yet */ + } + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + /* should not go to here */ + pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + (long)pStack->disp, (unsigned long)iov_len_local ); ); + if( --(pStack->count) == 0 ) { /* end of loop */ + if( 0 == pConvertor->stack_pos ) { + /* Do the same thing as when the loop is completed */ + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_unpacked += iov[iov_count].iov_len; + iov_count++; /* go to the next */ + goto complete_conversion; + } + pConvertor->stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + pStack->disp += (pData->ub - pData->lb); + } else { + assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack 
new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { + unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + unpack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } else { + unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. */ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + // conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc_tmp = count_desc; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + } + complete_loop: + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_unpacked += iov[iov_count].iov_len; + } + complete_conversion: + *max_data = total_unpacked; + pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ + *out_size = iov_count; + if( pConvertor->bConverted == pConvertor->remote_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + DT_CUDA_DEBUG( opal_cuda_output( 0, "Total unpacked %lu\n", pConvertor->bConverted); ); + if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { + opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + 
pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + /* Save the global position for the next round */ + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, + conv_ptr - pConvertor->pBaseBuf ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + return 0; +} int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 228238002e4..36f0e7e8659 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -30,7 +30,7 @@ #include #include -#define DDT_TEST_CUDA +//#define DDT_TEST_CUDA #define CUDA_MEMCPY_2D_D2H @@ -1213,11 +1213,11 @@ int main( int argc, char* argv[] ) } ompi_datatype_t *column, *matt; - mat_size = 500; + mat_size = 1500; ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); ompi_datatype_commit( &matt ); - // local_copy_with_convertor_mat(matt, 1, 1200000, mat_size); + local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); int packed_size = 256; @@ -1275,7 +1275,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len+128); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 4; i++) { - vector_ddt( pdt, 1, pdt, 1, 1024*10 , 1000, blk_len, blk_len+128); + vector_ddt( pdt, 1, pdt, 1, 1024*10240 , 1000, blk_len, blk_len+128); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } From 277c8bd6b29c8d7de90b00b94723deee3686a61e Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 6 Oct 2015 00:32:52 -0400 Subject: [PATCH 122/190] receiver now will send msg back to sender for buffer reuse Conflicts: opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu --- 
ompi/mca/pml/ob1/pml_ob1_cuda.c | 1 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 5 +- opal/datatype/opal_convertor.h | 1 + opal/datatype/opal_datatype_gpu.h | 2 +- opal/mca/btl/smcuda/btl_smcuda.c | 6 +- opal/mca/btl/smcuda/btl_smcuda.h | 9 +-- opal/mca/btl/smcuda/btl_smcuda_component.c | 72 ++++++++++++++----- test/datatype/ddt_benchmark.c | 41 ++++++++--- 8 files changed, 101 insertions(+), 36 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 820c8e82d8e..77436ab41d6 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -119,6 +119,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); convertor->gpu_buffer_ptr = base; + convertor->gpu_buffer_size = convertor->local_size; sendreq->req_send.req_bytes_packed = convertor->local_size; printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index c3b327c733e..a4d4b427a45 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -604,8 +604,9 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - // cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); -// pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +// printf("extent %ld, size %ld, count %ld\n", _loop->extent, 
_end_loop->size, _copy_loops); +// cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); // int i; // for (i = 0; i < 4; i++) { // opal_empty_kernel<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index ace5cf4b1e4..6b4746eaa9a 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -113,6 +113,7 @@ struct opal_convertor_t { void * stream; /**< CUstream for async copy */ unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ + size_t gpu_buffer_size; uint64_t * pipeline_event[MAX_IPC_EVENT_HANDLE]; /**< cuda event for pipeline */ #endif /* size: 248, cachelines: 4, members: 20 */ diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h index 8ae90cde92f..887c8a0918b 100644 --- a/opal/datatype/opal_datatype_gpu.h +++ b/opal/datatype/opal_datatype_gpu.h @@ -66,4 +66,4 @@ extern unsigned char* (*opal_cuda_get_gpu_pack_buffer_p)(void); extern void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); extern void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); -#endif /* OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED */ \ No newline at end of file +#endif /* OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 7dd56f6e612..dacc343ba84 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1165,7 +1165,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, send_msg.lindex = lindex; send_msg.packed_size = 0; send_msg.seq = 0; - send_msg.msg_type = CUDA_PACK_TO_LOCAL; + send_msg.msg_type = CUDA_PACK_TO_LOCAL_START; 
mca_btl_smcuda_send_cuda_pack_sig(btl, ep, &send_msg); done = 0; } else { @@ -1199,14 +1199,14 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL); memcpy(send_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); send_msg.seq = -9; - send_msg.msg_type = CUDA_PACK_TO_REMOTE; + send_msg.msg_type = CUDA_PACK_TO_REMOTE_START; send_msg.remote_address = local_address; send_msg.remote_base = loc_reg.base.base; mca_common_wait_stream_synchronize(&loc_reg); printf("send r_addr %p, r_base %p\n", local_address, loc_reg.base.base); } else { send_msg.seq = 0; - send_msg.msg_type = CUDA_PACK_TO_LOCAL; + send_msg.msg_type = CUDA_PACK_TO_LOCAL_START; } mca_btl_smcuda_cuda_dt_unpack_clone(NULL, ep, remote_memory_address, (mca_btl_base_descriptor_t *)frag, 0, lindex, 0, 0); diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 7616e16c720..a1d9e5166e1 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -522,13 +522,14 @@ typedef struct { uint64_t mem_handle[8]; } cuda_dt_hdr_t; -#define CUDA_UNPACK_FROM_REMOTE 0 +#define CUDA_UNPACK_FROM_SEQ 0 #define CUDA_PACK_COMPLETE 1 #define CUDA_PACK_COMPLETE_ACK 2 #define CUDA_PACK_CLEANUP 3 -#define CUDA_PACK_TO_LOCAL 4 -#define CUDA_PACK_TO_REMOTE 5 -#define CUDA_UNPACK_NO 6 +#define CUDA_PACK_TO_LOCAL_START 4 +#define CUDA_PACK_TO_REMOTE_START 5 +#define CUDA_PACK_TO_SEQ 6 +#define CUDA_UNPACK_NO 7 /* package save pack/unpack convertor and cbfunc */ typedef struct { diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 3ffde4608fc..de772340fa0 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -167,7 +167,7 @@ static int smcuda_register(void) mca_btl_smcuda_param_register_int("use_cuda_ipc", 1, OPAL_INFO_LVL_4, 
&mca_btl_smcuda_component.use_cuda_ipc); mca_btl_smcuda_param_register_int("use_cuda_ipc_same_gpu", 1, OPAL_INFO_LVL_4,&mca_btl_smcuda_component.use_cuda_ipc_same_gpu); mca_btl_smcuda_param_register_int("cuda_ipc_verbose", 0, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ipc_verbose); - mca_btl_smcuda_param_register_int("cuda_dt_pipeline_size", 1024*1024*400, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_dt_pipeline_size); + mca_btl_smcuda_param_register_int("cuda_ddt_pipeline_size", 1024*1024*400, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_dt_pipeline_size); mca_btl_smcuda_component.cuda_ipc_output = opal_output_open(NULL); opal_output_set_verbosity(mca_btl_smcuda_component.cuda_ipc_output, mca_btl_smcuda_component.cuda_ipc_verbose); #else /* OPAL_CUDA_SUPPORT */ @@ -869,6 +869,8 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, assert(my_cuda_dt_clone->lindex == lindex); printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); + cuda_dt_hdr_t send_msg; + send_msg.lindex = lindex; if (msg_type == CUDA_PACK_CLEANUP) { mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; @@ -877,13 +879,11 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag_recv); mca_btl_smcuda_free_cuda_dt_unpack_clone(endpoint, lindex); } else if (msg_type == CUDA_PACK_COMPLETE) { - cuda_dt_hdr_t send_msg; - send_msg.lindex = lindex; send_msg.packed_size = 0; send_msg.seq = -1; send_msg.msg_type = CUDA_PACK_COMPLETE_ACK; mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); - } else if (msg_type == CUDA_UNPACK_FROM_REMOTE){ + } else if (msg_type == CUDA_UNPACK_FROM_SEQ){ struct iovec iov; uint32_t iov_count = 1; size_t max_data; @@ -916,6 +916,10 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, } } } + send_msg.seq = seq; + send_msg.packed_size = packed_size; + 
send_msg.msg_type = CUDA_PACK_TO_SEQ; + mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); } // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } @@ -931,9 +935,14 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, int seq = cuda_dt_hdr.seq; int lindex = cuda_dt_hdr.lindex; int msg_type = cuda_dt_hdr.msg_type; + size_t packed_size = cuda_dt_hdr.packed_size; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_dt_clone_t *my_cuda_dt_clone; cuda_dt_hdr_t send_msg; + + uint32_t iov_count = 1; + int rc_dt = 0; + size_t max_data = 0; /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; @@ -952,9 +961,28 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, convertor->gpu_buffer_ptr = NULL; } mca_btl_smcuda_free_cuda_dt_pack_clone(endpoint, lindex); + } else if (msg_type == CUDA_PACK_TO_SEQ) { + printf("i receive a message pack to seq, packed %ld, pipeline_size %ld\n", convertor->bConverted, my_cuda_dt_clone->pipeline_size); + if (convertor->bConverted < convertor->local_size) { + struct iovec iov; + iov.iov_base = convertor->gpu_buffer_ptr + seq*my_cuda_dt_clone->pipeline_size; + iov.iov_len = packed_size; + rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + packed_size = max_data; + send_msg.packed_size = packed_size; + send_msg.seq = seq; + send_msg.msg_type = CUDA_UNPACK_FROM_SEQ; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + if (rc_dt == 1) { + send_msg.packed_size = 0; + send_msg.seq = -1; + send_msg.msg_type = CUDA_PACK_COMPLETE; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + } + } } else { mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; - if (msg_type == CUDA_PACK_TO_REMOTE) { /* receiver is contiguous, and ask me to pack directly to his gpu memory */ + if (msg_type == CUDA_PACK_TO_REMOTE_START) { /* receiver is contiguous, and ask me to pack directly to his gpu memory */ 
opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); mca_mpool_common_cuda_reg_t rget_reg; rget_reg_ptr= &rget_reg; @@ -967,39 +995,49 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, convertor->gpu_buffer_ptr = remote_memory_address; printf("remote_memory_address $$$$$$ %p, r_addr %p, r_base %p\n", remote_memory_address, cuda_dt_hdr.remote_address, cuda_dt_hdr.remote_base); send_msg.msg_type = CUDA_UNPACK_NO; + convertor->gpu_buffer_size = convertor->local_size; } else { - send_msg.msg_type = CUDA_UNPACK_FROM_REMOTE; + send_msg.msg_type = CUDA_UNPACK_FROM_SEQ; } struct iovec iov; - int rc_dt = 0; - size_t packed_size = mca_btl_smcuda_component.cuda_dt_pipeline_size; + packed_size = mca_btl_smcuda_component.cuda_dt_pipeline_size; printf("Pipeline_size %ld\n", packed_size); - uint32_t iov_count = 1; iov.iov_base = convertor->gpu_buffer_ptr; iov.iov_len = packed_size; - size_t max_data = 0; - int seq = 0; + max_data = 0; + seq = 0; /* the first pack here is used to get the correct size of pipeline_size */ /* because pack may not use the whole pipeline size */ rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); packed_size = max_data; + iov.iov_base += packed_size; + /* save pipeline size */ + my_cuda_dt_clone->pipeline_size = packed_size; + convertor->gpu_buffer_size -= packed_size; send_msg.packed_size = packed_size; send_msg.seq = seq; mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); - while (rc_dt != 1) { - iov.iov_base += packed_size; + while (rc_dt != 1 && convertor->gpu_buffer_size > 0) { + if (convertor->gpu_buffer_size < packed_size) { + packed_size = convertor->gpu_buffer_size; + } + iov.iov_len = packed_size; seq ++; rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); packed_size = max_data; + iov.iov_base += packed_size; + convertor->gpu_buffer_size -= packed_size; send_msg.packed_size = packed_size; send_msg.seq = seq; mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); 
} - send_msg.packed_size = 0; - send_msg.seq = -1; - send_msg.msg_type = CUDA_PACK_COMPLETE; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + if (rc_dt == 1) { + send_msg.packed_size = 0; + send_msg.seq = -1; + send_msg.msg_type = CUDA_PACK_COMPLETE; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + } if (rget_reg_ptr != NULL) { /* close memhandle */ cuda_closememhandle(NULL, (mca_mpool_base_registration_t *)rget_reg_ptr); diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 36f0e7e8659..2d25274ee9b 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -30,7 +30,7 @@ #include #include -//#define DDT_TEST_CUDA +#define DDT_TEST_CUDA #define CUDA_MEMCPY_2D_D2H @@ -191,7 +191,7 @@ static void fill_vectors(double* vp, int itera, int contig, int gap) if (j >= i*gap && j < i*gap+contig) { vp[j] = 1.1; } else { - vp[j] = -1.0; + vp[j] = 0; } } } @@ -203,7 +203,7 @@ static void fill_vectors(double* vp, int itera, int contig, int gap) // for (i = 0; i < (itera-1)*gap+contig; i++) { // printf("%1.f ", vp[i]); // } - // printf("\n"); + printf("\n"); } static void verify_vectors(double *vp, int itera, int contig, int gap) @@ -350,6 +350,16 @@ vector_ddt( ompi_datatype_t* send_type, int send_count, done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); // done1 = 1; } + + // int i,j = 0; + // printf("buffer received\n"); + // double *mat_temp = (double*)ptemp; + // for (i = 0; i < itera; i++) { + // for (j = 0; j < contig; j++) { + // printf(" %1.f ", mat_temp[i*itera+j]); + // } + // printf("\n"); + // } if( done2 == 0 ) { GET_TIME( unpack_start ); @@ -1213,11 +1223,11 @@ int main( int argc, char* argv[] ) } ompi_datatype_t *column, *matt; - mat_size = 1500; - ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); - ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); - ompi_datatype_commit( &matt ); - 
local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); + mat_size = 4000; +// ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); +// ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); +// ompi_datatype_commit( &matt ); +// local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); int packed_size = 256; @@ -1275,7 +1285,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len+128); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 4; i++) { - vector_ddt( pdt, 1, pdt, 1, 1024*10240 , 1000, blk_len, blk_len+128); + // vector_ddt( pdt, 1, pdt, 1, 1024*10240 , 1000, blk_len, blk_len+128); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } @@ -1296,6 +1306,19 @@ int main( int argc, char* argv[] ) OBJ_RELEASE( pdt ); assert( pdt == NULL ); } + for (blk_len = 2000; blk_len <= 2000; blk_len += 500) { + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (60000 times %d double stride 512)\n", blk_len ); + pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); + // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + /* for (blk_len = 4; blk_len <= 32; blk_len += 1) { printf( ">>--------------------------------------------<<\n" ); From 4591656d4a411e4f3a399b28132acf98031bde0c Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Fri, 9 Oct 2015 16:46:41 -0700 Subject: [PATCH 123/190] fix zerocopy --- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 13 +++++++++---- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 13 +++++++++---- opal/mca/btl/openib/btl_openib_frag.h | 2 ++ opal/mca/btl/smcuda/btl_smcuda.h | 2 +- 4 files changed, 21 insertions(+), 9 deletions(-) diff --git 
a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index a4d4b427a45..00c7812b605 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -605,8 +605,8 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; // printf("extent %ld, size %ld, count %ld\n", _loop->extent, _end_loop->size, _copy_loops); -// cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); +// pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); // int i; // for (i = 0; i < 4; i++) { // opal_empty_kernel<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); @@ -776,7 +776,12 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; // cudaHostRegister(_destination, _copy_loops*_end_loop->size, cudaHostRegisterMapped); - cudaHostGetDevicePointer((void **)&_destination_dev, (void *) _destination, 0); + cudaError_t reg_rv = cudaHostGetDevicePointer((void **)&_destination_dev, (void *) _destination, 0); + if (reg_rv != cudaSuccess) { + const char *cuda_err = cudaGetErrorString(reg_rv); + printf("can not get dev mem, %s\n", cuda_err); + } + //cudaMemcpy2D(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, 
cudaMemcpyDeviceToDevice); pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -852,13 +857,13 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } transfer_required = 0; } else { + buffer_size = iov[0].iov_len; if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { pConvertor->gpu_buffer_ptr = NULL; transfer_required = 0; free_required = 0; cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); } else { - buffer_size = iov[0].iov_len; if (pConvertor->gpu_buffer_ptr == NULL) { pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(buffer_size, 0); } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 5374e2d9fc8..c268fe2fb94 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -727,8 +727,8 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); -// cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); +// unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -818,8 +818,13 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, 
// tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; // cudaHostRegister(_source, _copy_loops*_end_loop->size, cudaHostRegisterMapped); - cudaHostGetDevicePointer((void **)&_source_dev, (void *) _source, 0); - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + cudaError_t reg_rv = cudaHostGetDevicePointer((void **)&_source_dev, (void *) _source, 0); + if (reg_rv != cudaSuccess) { + const char *cuda_err = cudaGetErrorString(reg_rv); + printf("can not get dev mem, %s\n", cuda_err); + } + //cudaMemcpy2D(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; diff --git a/opal/mca/btl/openib/btl_openib_frag.h b/opal/mca/btl/openib/btl_openib_frag.h index 7ca37142429..b73a817e1e6 100644 --- a/opal/mca/btl/openib/btl_openib_frag.h +++ b/opal/mca/btl/openib/btl_openib_frag.h @@ -25,6 +25,8 @@ #ifndef MCA_BTL_IB_FRAG_H #define MCA_BTL_IB_FRAG_H +#define OPAL_OPENIB_PAD_HDR 1 + #include "opal_config.h" #include "opal/align.h" #include "opal/mca/btl/btl.h" diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index a1d9e5166e1..abd043f9f10 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -41,7 +41,7 @@ #include "opal/mca/btl/btl.h" #include "opal/mca/common/sm/common_sm.h" -#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 1 +#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 0 BEGIN_C_DECLS From c5add7eb33f37ccd18f73d58e5c22cc3838e2728 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 22 Oct 2015 17:36:31 -0400 Subject: [PATCH 124/190] 
offset instead of actual addess, and lots of clean up for unused functions Conflicts: opal/datatype/cuda/opal_datatype_cuda.cu opal/datatype/cuda/opal_datatype_cuda_internal.cuh opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu opal/datatype/opal_datatype_gpu.c --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 2 +- opal/datatype/cuda/opal_datatype_cuda.cu | 144 +---- opal/datatype/cuda/opal_datatype_cuda.cuh | 10 - .../cuda/opal_datatype_cuda_internal.cuh | 74 +-- .../cuda/opal_datatype_pack_cuda_kernel.cu | 539 +----------------- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 292 ++-------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 262 +-------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 243 ++------ opal/datatype/opal_datatype_gpu.c | 35 +- opal/datatype/opal_datatype_gpu.h | 13 - opal/mca/btl/smcuda/btl_smcuda.c | 72 ++- opal/mca/btl/smcuda/btl_smcuda.h | 15 +- opal/mca/btl/smcuda/btl_smcuda_component.c | 16 +- opal/mca/btl/smcuda/btl_smcuda_endpoint.h | 4 +- test/datatype/ddt_benchmark.c | 4 +- 15 files changed, 219 insertions(+), 1506 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 77436ab41d6..e41b75a99c6 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -136,7 +136,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, return rc; } mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, lindex, 1, local_device); - mca_btl_smcuda_cuda_dt_pack_clone(convertor, bml_btl->btl_endpoint, NULL, NULL, 0, lindex, 0, local_device); + mca_btl_smcuda_cuda_dt_pack_clone( bml_btl->btl_endpoint, convertor, NULL, NULL, 0, lindex, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); diff --git 
a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 29ade337b69..bce80b4a592 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -9,56 +9,14 @@ #include #include -/* - * NOTE: The order of this array *MUST* match what is listed in datatype.h - * (use of designated initializers should relax this restrictions some) - */ -/* -OPAL_DECLSPEC const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PREDEFINED] = { - OPAL_DATATYPE_LOOP_SIZE, - OPAL_DATATYPE_END_LOOP_SIZE, - OPAL_DATATYPE_LB_SIZE, - OPAL_DATATYPE_UB_SIZE, - OPAL_DATATYPE_INT1_SIZE, - OPAL_DATATYPE_INT2_SIZE, - OPAL_DATATYPE_INT4_SIZE, - OPAL_DATATYPE_INT8_SIZE, - OPAL_DATATYPE_INT16_SIZE, - OPAL_DATATYPE_UINT1_SIZE, - OPAL_DATATYPE_UINT2_SIZE, - OPAL_DATATYPE_UINT4_SIZE, - OPAL_DATATYPE_UINT8_SIZE, - OPAL_DATATYPE_UINT16_SIZE, - OPAL_DATATYPE_FLOAT2_SIZE, - OPAL_DATATYPE_FLOAT4_SIZE, - OPAL_DATATYPE_FLOAT8_SIZE, - OPAL_DATATYPE_FLOAT12_SIZE, - OPAL_DATATYPE_FLOAT16_SIZE, - OPAL_DATATYPE_FLOAT_COMPLEX_SIZE, - OPAL_DATATYPE_DOUBLE_COMPLEX_SIZE, - OPAL_DATATYPE_LONG_DOUBLE_COMPLEX_SIZE, - OPAL_DATATYPE_BOOL_SIZE, - OPAL_DATATYPE_WCHAR_SIZE, - OPAL_DATATYPE_UNAVAILABLE_SIZE, -}; -*/ -/***** my variables ********/ - ddt_cuda_list_t *cuda_free_list; ddt_cuda_device_t *cuda_device; -ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; -unsigned char *pBaseBuf_GPU, *gpu_src_const, *gpu_dest_const; -unsigned char *ddt_cuda_pack_buffer, *ddt_cuda_unpack_buffer; ddt_cuda_stream_t* cuda_streams; struct iovec cuda_iov[CUDA_NB_IOV]; uint32_t cuda_iov_count; -ddt_cuda_description_dist_t description_dist_h[CUDA_MAX_NB_BLOCKS]; -ddt_cuda_description_dist_t* description_dist_d; -ddt_cuda_iov_dist_t cuda_iov_dist_h[NB_STREAMS][CUDA_MAX_NB_BLOCKS]; +ddt_cuda_iov_dist_t* cuda_iov_dist_h[NB_STREAMS]; ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; -dt_elem_desc_t* description_d; -uint8_t opal_datatype_cuda_debug; //uint8_t 
ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; @@ -202,6 +160,17 @@ static inline void cuda_list_item_merge_by_addr(ddt_cuda_list_t *list) } } +void opal_cuda_output(int output_id, const char *format, ...) +{ + if (output_id >= 0 && output_id <= OPAL_DATATYPE_CUDA_DEBUG_LEVEL) { + va_list arglist; + fprintf( stderr, "[Debug %d]: ", output_id ); + va_start(arglist, format); + vfprintf(stderr, format, arglist); + va_end(arglist); + } +} + void opal_datatype_cuda_init(void) { uint32_t i; @@ -213,7 +182,6 @@ void opal_datatype_cuda_init(void) opal_cuda_output(0, "Cannot retrieve the device being used. Drop CUDA support!\n"); return; } - printf("current device %d\n", device); cuda_free_list = init_cuda_free_list(); @@ -224,6 +192,7 @@ void opal_datatype_cuda_init(void) if (cudaMalloc((void **)(&gpu_ptr), sizeof(char)*DT_CUDA_BUFFER_SIZE) != cudaSuccess) { DT_CUDA_DEBUG( opal_cuda_output( 0, "cudaMalloc is failed in GPU %d\n", i); ); } + DT_CUDA_DEBUG ( opal_cuda_output(2, "DDT engine cudaMalloc buffer %p in GPU %d\n", gpu_ptr, i);); cudaMemset(gpu_ptr, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); cuda_device[i].gpu_buffer = gpu_ptr; @@ -241,33 +210,6 @@ void opal_datatype_cuda_init(void) cuda_device[i].buffer_used.nb_elements = 0; } - cudaMalloc((void **)&cuda_desc_d, sizeof(ddt_cuda_desc_t)); - cudaMallocHost((void **)&cuda_desc_h, sizeof(ddt_cuda_desc_t)); - printf("size cuda_desc %d\n", sizeof(ddt_cuda_desc_t)); - - // printf("malloc iov\n"); - // for (i = 0; i < IOV_ARRAY_SIZE; i++) { - // void* iov_base; - // cudaMalloc( (void **)&iov_base, sizeof(char)*IOV_LEN); - // cuda_desc_h->iov[i].iov_base = iov_base; - // cuda_desc_h->iov[i].iov_len = IOV_LEN; - // } - - cudaMalloc((void **)(&ddt_cuda_pack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); - printf("malloc cuda packing buffer, %p\n", ddt_cuda_pack_buffer); - cudaMalloc((void **)(&ddt_cuda_unpack_buffer), sizeof(char)*DT_CUDA_BUFFER_SIZE); - printf("malloc cuda unpacking buffer, %p\n", ddt_cuda_unpack_buffer); - - 
cuda_desc_h->iov[0].iov_base = ddt_cuda_pack_buffer; - cuda_desc_h->iov[0].iov_len = DT_CUDA_BUFFER_SIZE; - - cudaMalloc((void **)(&pBaseBuf_GPU), sizeof(char)*DT_CUDA_BUFFER_SIZE); - gpu_src_const = pBaseBuf_GPU; - gpu_dest_const = (unsigned char*)cuda_desc_h->iov[0].iov_base; - - cuda_desc_h->description_max_count = 0; - cuda_desc_h->description_count = 0; - /* init cuda stream */ cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); for (i = 0; i < NB_STREAMS; i++) { @@ -278,17 +220,11 @@ void opal_datatype_cuda_init(void) /* init cuda_iov */ cuda_iov_count = CUDA_NB_IOV; - /* init description dist array */ - cudaMalloc((void **)(&description_dist_d), sizeof(ddt_cuda_description_dist_t)*CUDA_MAX_NB_BLOCKS); - cuda_desc_h->description_dist = description_dist_d; - /* only for iov version */ for (i = 0; i < NB_STREAMS; i++) { cudaMalloc((void **)(&cuda_iov_dist_d[i]), sizeof(ddt_cuda_iov_dist_t)*CUDA_MAX_NB_BLOCKS); } - opal_datatype_cuda_debug = 1; - // /* init size for double, float, char */ // ALIGNMENT_DOUBLE = sizeof(double); // ALIGNMENT_FLOAT = sizeof(float); @@ -301,29 +237,6 @@ void opal_datatype_cuda_fini(void) { uint32_t i; - if (cuda_desc_d != NULL) { - cudaFree(cuda_desc_d); - cuda_desc_d = NULL; - } - if (cuda_desc_h->description != NULL) { - cudaFree(cuda_desc_h->description); - cuda_desc_h->description = NULL; - } - if (cuda_desc_h->description_dist != NULL) { - cudaFree(cuda_desc_h->description_dist); - cuda_desc_h->description_dist = NULL; - } - printf("free iov\n"); - if (cuda_desc_h != NULL) { - for (i = 0; i < IOV_ARRAY_SIZE; i++) { - cudaFree(cuda_desc_h->iov[i].iov_base); - cuda_desc_h->iov[i].iov_base = NULL; - } - - cudaFreeHost(cuda_desc_h); - cuda_desc_h = NULL; - } - /* destory cuda stream */ for (i = 0; i < NB_STREAMS; i++) { cudaStreamDestroy(cuda_streams->opal_cuda_stream[i]); @@ -339,8 +252,6 @@ void opal_datatype_cuda_fini(void) void opal_cuda_sync_device(void) { cudaDeviceSynchronize(); - pBaseBuf_GPU = 
gpu_src_const; - cuda_desc_h->iov[0].iov_base = (void*)gpu_dest_const; } int32_t opal_cuda_is_gpu_buffer(const void *ptr) @@ -359,15 +270,6 @@ int32_t opal_cuda_is_gpu_buffer(const void *ptr) return (memType == CU_MEMORYTYPE_DEVICE) ? 1 : 0; } -unsigned char* opal_cuda_get_gpu_pack_buffer() -{ - if (ddt_cuda_pack_buffer != NULL) { - return ddt_cuda_pack_buffer; - } else { - return NULL; - } -} - void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) { int dev_id; @@ -408,7 +310,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) cuda_list_push_head(&device->buffer_used, p); device->buffer_used_size += size; device->buffer_free_size -= size; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc GPU buffer %p, dev_id %d.\n", addr, dev_id); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc GPU buffer %p, dev_id %d.\n", addr, dev_id); ); return addr; } } @@ -448,28 +350,16 @@ void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) cuda_list_item_merge_by_addr(&device->buffer_free, ptr); device->buffer_free_size += size; device->buffer_used_size -= size; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Free GPU buffer %p.\n", addr); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Free GPU buffer %p.\n", addr); ); } void opal_dump_cuda_list(ddt_cuda_list_t *list) { ddt_cuda_buffer_t *ptr = NULL; ptr = list->head; - DT_CUDA_DEBUG( opal_cuda_output( 0, "DUMP cuda list %p, nb_elements %d\n", list, list->nb_elements); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "DUMP cuda list %p, nb_elements %d\n", list, list->nb_elements); ); while (ptr != NULL) { - DT_CUDA_DEBUG( opal_cuda_output( 0, "\titem addr %p, size %ld.\n", ptr->gpu_addr, ptr->size); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "\titem addr %p, size %ld.\n", ptr->gpu_addr, ptr->size); ); ptr = ptr->next; } } - -/* from internal.h*/ -void opal_cuda_output(int output_id, const char *format, ...) 
-{ - if (output_id >= 0 && output_id <= OPAL_DATATYPE_CUDA_DEBUG_LEVEL) { - va_list arglist; - fprintf( stderr, "[Debug %d]: ", output_id ); - va_start(arglist, format); - vfprintf(stderr, format, arglist); - va_end(arglist); - } -} diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 436eaa9aec3..94336ac6475 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -8,10 +8,6 @@ void opal_datatype_cuda_init(void); void opal_datatype_cuda_fini(void); -int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, @@ -22,11 +18,6 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor struct iovec* iov, uint32_t* out_size, size_t* max_data ); - -int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, @@ -102,7 +93,6 @@ void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); void opal_dump_cuda_list(ddt_cuda_list_t *list); -unsigned char* opal_cuda_get_gpu_pack_buffer(); } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 2102edb6a9c..160d54336d4 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -10,9 +10,9 @@ /* OPAL_CUDA */ // #define OPAL_DATATYPE_CUDA_DRY_RUN -#define OPAL_DATATYPE_CUDA_DEBUG +#define OPAL_DATATYPE_CUDA_DEBUG 1 //#define OPAL_DATATYPE_CUDA_KERNEL_TIME -#define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 0 +#define 
OPAL_DATATYPE_CUDA_DEBUG_LEVEL 2 #define OPAL_DATATYPE_CUDA_TIMING #define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D 0 #define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 @@ -40,43 +40,16 @@ #define ELAPSED_TIME(TSTART, TEND) (((TEND).tv_sec - (TSTART).tv_sec) * 1000000 + ((TEND).tv_usec - (TSTART).tv_usec)) - -typedef struct { - uint32_t description_index[200]; /* index of y direction */ - uint32_t description_local_index[200]; /* index of x direction */ - uint32_t dst_offset[200]; - uint32_t description_used; -} ddt_cuda_description_dist_t; - -typedef struct { - dt_stack_t pStack[DT_STATIC_STACK_SIZE]; - dt_elem_desc_t* description; - struct iovec iov[IOV_ARRAY_SIZE]; - uint32_t stack_pos; - uint32_t stack_size; - unsigned char* pBaseBuf; /* const */ - OPAL_PTRDIFF_TYPE lb; /* const */ - OPAL_PTRDIFF_TYPE ub; /* const */ - size_t bConverted; - size_t local_size; /* const */ - uint32_t out_size; - size_t max_data; - uint32_t description_count; - uint32_t description_max_count; - ddt_cuda_description_dist_t *description_dist; -} ddt_cuda_desc_t; - typedef struct { cudaStream_t opal_cuda_stream[NB_STREAMS]; uint32_t current_stream_id; } ddt_cuda_stream_t; typedef struct { - unsigned char* src[CUDA_IOV_MAX_TASK_PER_BLOCK]; - unsigned char* dst[CUDA_IOV_MAX_TASK_PER_BLOCK]; - uint32_t nb_elements[CUDA_IOV_MAX_TASK_PER_BLOCK]; - uint8_t element_alignment[CUDA_IOV_MAX_TASK_PER_BLOCK]; - uint32_t nb_tasks; + size_t src_offset; + size_t dst_offset; + uint32_t nb_elements; + uint8_t element_alignment; } ddt_cuda_iov_dist_t; typedef struct ddt_cuda_buffer{ @@ -103,19 +76,11 @@ typedef struct { extern ddt_cuda_list_t *cuda_free_list; extern ddt_cuda_device_t *cuda_device; -extern ddt_cuda_desc_t *cuda_desc_d, *cuda_desc_h; -extern unsigned char* pBaseBuf_GPU; -extern unsigned char *ddt_cuda_pack_buffer, *ddt_cuda_unpack_buffer; -extern size_t ddt_cuda_buffer_space; extern ddt_cuda_stream_t* cuda_streams; extern struct iovec cuda_iov[CUDA_NB_IOV]; extern uint32_t cuda_iov_count; -extern 
ddt_cuda_description_dist_t description_dist_h[CUDA_MAX_NB_BLOCKS]; -extern ddt_cuda_description_dist_t* description_dist_d; -extern ddt_cuda_iov_dist_t cuda_iov_dist_h[NB_STREAMS][CUDA_MAX_NB_BLOCKS]; +extern ddt_cuda_iov_dist_t* cuda_iov_dist_h[NB_STREAMS]; extern ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; -extern dt_elem_desc_t* description_d; -extern uint8_t opal_datatype_cuda_debug; //extern uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; @@ -126,24 +91,6 @@ extern uint8_t opal_datatype_cuda_debug; #define DBGPRINT(fmt, ...) #endif -__device__ void pack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ); - -__device__ void unpack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ); - -__global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc); - -__global__ void opal_generic_simple_pack_cuda_kernel_v2(ddt_cuda_desc_t* cuda_desc); - -__global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_desc); - __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, @@ -156,11 +103,10 @@ __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, unsigned char* source, unsigned char* destination ); -// __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_description_dist_t* desc_dist_d, dt_elem_desc_t* desc_d, uint32_t required_blocks, struct iovec* iov, unsigned char* pBaseBuf); -__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist); +__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); -__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* 
cuda_iov_dist); +__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); __global__ void opal_empty_kernel(uint32_t copy_loops, size_t size, @@ -173,7 +119,7 @@ __global__ void opal_empty_kernel_noargs(); void opal_cuda_output(int output_id, const char *format, ...); #if defined (OPAL_DATATYPE_CUDA_DEBUG) -#define DT_CUDA_DEBUG( INST ) if (opal_datatype_cuda_debug) { INST } +#define DT_CUDA_DEBUG( INST ) if (OPAL_DATATYPE_CUDA_DEBUG) { INST } #else #define DT_CUDA_DEBUG( INST ) #endif diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 79281adf6cb..a58b831b78b 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -5,529 +5,6 @@ #include #include -__device__ void pack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) -{ - ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); - ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); - unsigned char* _src_disp = (*SOURCE) + _end_loop->first_elem_disp; - uint32_t _copy_loops = *(COUNT); - uint32_t _i, tid, num_threads; - unsigned char* _destination = *DESTINATION; -// unsigned char* _source = _src_disp; - uint32_t gap, nb_elements; - double *_source_tmp, *_destination_tmp, *_src_disp_tmp; - - tid = threadIdx.x + blockIdx.x * blockDim.x; - num_threads = gridDim.x * blockDim.x; - - if( (_copy_loops * _end_loop->size) > *(SPACE) ) - _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); - -// num_task_per_thread = _copy_loops / num_threads; -// residue = _copy_loops % num_threads; -// if ( ((tid < residue) && (residue != 0)) || (residue == 0) ) { -// num_task_per_thread += residue == 0 ? 
0 : 1; -// start_index = tid * num_task_per_thread; -// } else { -// start_index = residue * (num_task_per_thread+1) + (tid-residue) * num_task_per_thread; -// } -// -// end_index = start_index + num_task_per_thread; -// DBGPRINT("tid %d, start %d, end %d, num_task_per_thread %d, copy_loops %d\n", tid, start_index, end_index, num_task_per_thread, _copy_loops); -// for( _i = start_index; _i < end_index; _i++ ) { -// // OPAL_DATATYPE_SAFEGUARD_POINTER( _source, _loop->extent, (CONVERTOR)->pBaseBuf, -// // (CONVERTOR)->pDesc, (CONVERTOR)->count ); -// _source = _src_disp + _i * _loop->extent; -// _destination = *DESTINATION + _i * _end_loop->size; -// DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d\n", -// tid, _destination, _source, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i * _end_loop->size), _i ); -// // MEMCPY_CSUM( *(DESTINATION), _source, _end_loop->size, (CONVERTOR) ); -// #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) -// // memcpy(_destination, _source, _end_loop->size); -// _source_tmp = (double *)_source; -// _destination_tmp = (double *)_destination; -// for (_j = 0; _j < _end_loop->size/8; _j++) -// { -// *_destination_tmp = *_source_tmp; -// _destination_tmp ++; -// _source_tmp ++; -// } -// #endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ -// } - - gap = (_loop->extent - _end_loop->size) / 8; - nb_elements = _end_loop->size / 8; - _src_disp_tmp = (double*)_src_disp; - _destination_tmp = (double*)_destination; - _destination_tmp += tid; - - __syncthreads(); - - for (_i = tid; _i < _copy_loops*nb_elements; _i+=num_threads) { - _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (_i % nb_elements == 0 ) { - DBGPRINT("tid %d, pack 3. 
memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", - tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); - } - // if (_i / nb_elements ==1 && tid == 0 ) { - // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", - // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); - // } -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ -#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) - *_destination_tmp = *_source_tmp; -#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ - _destination_tmp += num_threads; - - } - *(SOURCE) = _src_disp + _copy_loops*_loop->extent - _end_loop->first_elem_disp; - *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; - *(SPACE) -= _copy_loops * _end_loop->size; - *(COUNT) -= _copy_loops; - -} - -__device__ void pack_predefined_data_cuda_kernel( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) -{ - uint32_t _copy_count = *(COUNT); - size_t _copy_blength; - ddt_elem_desc_t* _elem = &((ELEM)->elem); - unsigned char* _src_disp = (*SOURCE) + _elem->disp; - uint32_t _i, tid, num_threads; - unsigned char* _destination = *DESTINATION; - uint32_t gap, nb_elements; - double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; - - _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; - if( (_copy_count * _copy_blength) > *(SPACE) ) { - _copy_count = (uint32_t)(*(SPACE) / _copy_blength); - if( 0 == _copy_count ) return; /* nothing to do */ - } - - tid = threadIdx.x + blockIdx.x * blockDim.x; - num_threads = gridDim.x * blockDim.x; - - gap = (_elem->extent - _copy_blength) / 8; - nb_elements = _copy_blength / 8; - _src_disp_tmp = (double*)_src_disp; - _destination_tmp = (double*)_destination; - _destination_tmp += tid; - - __syncthreads(); - - for (_i = 
tid; _i < _copy_count*nb_elements; _i+=num_threads) { - _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (_i == 0 ) { - DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, count %d\n", - tid, _destination_tmp, _source_tmp, (unsigned long)_copy_blength*_copy_count, (unsigned long)(*(SPACE) - _i/nb_elements * _copy_blength), _i/nb_elements, _copy_count ); - } - // if (_i / nb_elements ==1 && tid == 0 ) { - // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", - // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); - // } -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ -#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) - *_destination_tmp = *_source_tmp; -#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ - _destination_tmp += num_threads; - - } - - _copy_blength *= _copy_count; - *(SOURCE) = _src_disp + _elem->extent*_copy_count - _elem->disp; - *(DESTINATION) += _copy_blength; - *(SPACE) -= _copy_blength; - *(COUNT) -= _copy_count; - -} - -__device__ void pack_predefined_data_cuda_kernel_v2( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char* SOURCE, - unsigned char* DESTINATION, - size_t* SPACE, - uint32_t local_index, - uint32_t dst_offset ) -{ - uint32_t _copy_count = *(COUNT); - size_t _copy_blength; - ddt_elem_desc_t* _elem = &((ELEM)->elem); - unsigned char* _src_disp = (SOURCE) + _elem->disp; - uint32_t local_tid; - unsigned char* _destination = DESTINATION; - double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; - - _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; - // if( (_copy_count * _copy_blength) > *(SPACE) ) { - // _copy_count = (uint32_t)(*(SPACE) / _copy_blength); - // if( 0 == _copy_count ) return; /* nothing to do */ - // } - - local_tid = threadIdx.x + local_index * blockDim.x; - _src_disp_tmp = 
(double*)_src_disp; - _destination_tmp = (double*)_destination + dst_offset; - - if (local_tid < _copy_count) { - _source_tmp = _src_disp_tmp + local_tid; - _destination_tmp += local_tid; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (local_tid == 0 ) { - DBGPRINT("tid %d, local_index %d, pack 1. memcpy( %p, %p, %lu ) => space %lu, blockIdx %d, count %d, destination %p, offset %d\n", - local_tid, local_index, _destination_tmp, _source_tmp, (unsigned long)_copy_blength*_copy_count, (unsigned long)(*(SPACE) - local_tid * _copy_blength), blockIdx.x, _copy_count, _destination, dst_offset ); - } -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ -#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) - *_destination_tmp = *_source_tmp; -#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ - } -} - -__global__ void opal_generic_simple_pack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) -{ - dt_stack_t *pStack; /* pointer to the position on the stack */ - uint32_t pos_desc; /* actual position in the description of the derived datatype */ - uint32_t count_desc; /* the number of items already done in the actual pos_desc */ - size_t total_packed = 0; /* total amount packed this time */ - dt_elem_desc_t* description; - dt_elem_desc_t* pElem; - unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; - size_t iov_len_local; - uint32_t iov_count; - uint32_t stack_pos; - struct iovec* iov; - - OPAL_PTRDIFF_TYPE extent; - uint32_t out_size; - - // __shared__ ddt_cuda_desc_t cuda_desc_b; - __shared__ dt_stack_t shared_pStack[DT_STATIC_STACK_SIZE]; - - if (threadIdx.x < DT_STATIC_STACK_SIZE) { - shared_pStack[threadIdx.x] = cuda_desc->pStack[threadIdx.x]; - } - __syncthreads(); - - - // load cuda descriptor from constant memory - iov = cuda_desc->iov; - pStack = shared_pStack; - description = cuda_desc->description; - stack_pos = cuda_desc->stack_pos; - pBaseBuf = cuda_desc->pBaseBuf; - extent = cuda_desc->ub - cuda_desc->lb; - out_size = cuda_desc->out_size; - - pStack = pStack + stack_pos; - pos_desc = pStack->index; - conv_ptr = 
pBaseBuf + pStack->disp; - count_desc = (uint32_t)pStack->count; - pStack--; - stack_pos--; - pElem = &(description[pos_desc]); - -// printf("pack start pos_desc %d count_desc %d disp %ld, stack_pos %d pos_desc %d count_desc %d disp %ld\n", -// pos_desc, count_desc, (long)(conv_ptr - pBaseBuf), stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp); - - for( iov_count = 0; iov_count < out_size; iov_count++ ) { - iov_ptr = (unsigned char *) iov[iov_count].iov_base; - iov_len_local = iov[iov_count].iov_len; - DBGPRINT("iov_len_local %lu, flags %d, types %d, count %d\n", iov_len_local, description->elem.common.flags, description->elem.common.type, description->elem.count); - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - /* now here we have a basic datatype */ - // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - // conv_ptr, iov_ptr, iov_len_local ); - pack_predefined_data_cuda_kernel(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); - if( 0 == count_desc ) { /* completed */ - conv_ptr = pBaseBuf + pStack->disp; - pos_desc++; /* advance to the next data */ - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - continue; - } - goto complete_loop; - } - if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - // DO_DEBUG( opal_output( 0, "pack end_loop count %d stack_pos %d" - // " pos_desc %d disp %ld space %lu\n", - // (int)pStack->count, pConvertor->stack_pos, - // pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); - if (threadIdx.x == 0) { - (pStack->count)--; - } - __syncthreads(); - - if( (pStack->count) == 0 ) { /* end of loop */ - if( 0 == stack_pos ) { - /* we lie about the size of the next element in order to - * make sure we exit the main loop. 
- */ - out_size = iov_count; - goto complete_loop; /* completed */ - } - stack_pos--; - pStack--; - pos_desc++; - } else { - pos_desc = pStack->index + 1; - if (threadIdx.x == 0) { - if( pStack->index == -1 ) { - pStack->disp += extent; - } else { - // assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); - pStack->disp += description[pStack->index].loop.extent; - } - } - __syncthreads(); - } - conv_ptr = pBaseBuf + pStack->disp; - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - // DO_DEBUG( opal_output( 0, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", - // (int)pStack->count, pConvertor->stack_pos, pos_desc, - // count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); - } - if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { - OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; - if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - pack_contiguous_loop_cuda_kernel( pElem, &count_desc, - &conv_ptr, &iov_ptr, &iov_len_local ); - if( 0 == count_desc ) { /* completed */ - pos_desc += pElem->loop.items + 1; - goto update_loop_description; - } - /* Save the stack with the correct last_count value. 
*/ - } - local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; - - PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, - pStack->disp + local_disp); - - pos_desc++; - update_loop_description: /* update the current state */ - conv_ptr = pBaseBuf + pStack->disp; - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); - continue; - } - } - complete_loop: - if (threadIdx.x == 0) { - iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ - } - __syncthreads(); - total_packed += iov[iov_count].iov_len; - } - - // if (tid == 0) { - // cuda_desc->max_data = total_packed; - // cuda_desc->out_size = iov_count; - // // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ - // // if( cuda_desc->bConverted == cuda_desc->local_size ) { - // // cuda_desc->stack_pos = stack_pos; - // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); - // // return; - // // } - // // /* Save the global position for the next round */ - // // PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, - // // conv_ptr - pBaseBuf ); - // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); - // // cuda_desc->stack_pos = stack_pos; - // } - - return; -} - -__global__ void opal_generic_simple_pack_cuda_kernel_v2(ddt_cuda_desc_t* cuda_desc) -{ - dt_stack_t *pStack; /* pointer to the position on the stack */ - uint32_t pos_desc; /* actual position in the description of the derived datatype */ - uint32_t count_desc; /* the number of items already done in the actual pos_desc */ - size_t total_packed = 0; /* total amount packed this time */ - dt_elem_desc_t* description; - dt_elem_desc_t* pElem; - unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; - size_t iov_len_local; - uint32_t iov_count; - uint32_t stack_pos; - struct iovec* iov; - 
ddt_cuda_description_dist_t* description_dist_d; - uint32_t ct = 0, local_index, dst_offset; - - OPAL_PTRDIFF_TYPE extent; - uint32_t out_size; - - // __shared__ ddt_cuda_desc_t cuda_desc_b; - __shared__ dt_stack_t shared_pStack[DT_STATIC_STACK_SIZE]; - - if (threadIdx.x < DT_STATIC_STACK_SIZE) { - shared_pStack[threadIdx.x] = cuda_desc->pStack[threadIdx.x]; - } - __syncthreads(); - - - // load cuda descriptor from constant memory - iov = cuda_desc->iov; - pStack = shared_pStack; - description = cuda_desc->description; - stack_pos = cuda_desc->stack_pos; - pBaseBuf = cuda_desc->pBaseBuf; - extent = cuda_desc->ub - cuda_desc->lb; - out_size = cuda_desc->out_size; - description_dist_d = cuda_desc->description_dist; - - pStack = pStack + stack_pos; - pos_desc = description_dist_d[blockIdx.x].description_index[ct]; - local_index = description_dist_d[blockIdx.x].description_local_index[ct]; - dst_offset = description_dist_d[blockIdx.x].dst_offset[ct]; - pElem = &(description[pos_desc]); - count_desc = pElem->elem.count; - conv_ptr = pBaseBuf + pStack->disp; - pStack--; - stack_pos--; - -// printf("pack start pos_desc %d count_desc %d disp %ld, stack_pos %d pos_desc %d count_desc %d disp %ld\n", -// pos_desc, count_desc, (long)(conv_ptr - pBaseBuf), stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp); - - for( iov_count = 0; iov_count < out_size; iov_count++ ) { - iov_ptr = (unsigned char *) iov[iov_count].iov_base; - iov_len_local = iov[iov_count].iov_len; -// DBGPRINT("iov_len_local %lu, flags %d, types %d, count %d\n", iov_len_local, description->elem.common.flags, description->elem.common.type, description->elem.count); - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - /* now here we have a basic datatype */ - // PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - // conv_ptr, iov_ptr, iov_len_local ); - pack_predefined_data_cuda_kernel_v2(pElem, &count_desc, conv_ptr, iov_ptr, &iov_len_local, local_index, 
dst_offset); - count_desc = 0; - if( 0 == count_desc ) { /* completed */ - conv_ptr = pBaseBuf + pStack->disp; - ct ++; - if (ct >= description_dist_d[blockIdx.x].description_used) { - pos_desc = cuda_desc->description_count-1; - } else { - pos_desc = description_dist_d[blockIdx.x].description_index[ct]; /* advance to the next data */ - local_index = description_dist_d[blockIdx.x].description_local_index[ct]; - dst_offset = description_dist_d[blockIdx.x].dst_offset[ct]; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (pos_desc > (cuda_desc->description_count - 1)) { - printf("EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEERROR, block %d, thread %d, pos_desc %d\n", blockIdx.x, threadIdx.x, pos_desc); - } -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (pos_desc < (cuda_desc->description_count - 1) && !(pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA)) { - printf("I get a error block %d, thread %d, pos_desc %d\n", blockIdx.x, threadIdx.x, pos_desc); - } -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - continue; - } - goto complete_loop; - } - if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - // DO_DEBUG( opal_output( 0, "pack end_loop count %d stack_pos %d" - // " pos_desc %d disp %ld space %lu\n", - // (int)pStack->count, pConvertor->stack_pos, - // pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); - if (threadIdx.x == 0) { - (pStack->count)--; - } - __syncthreads(); - - if( (pStack->count) == 0 ) { /* end of loop */ - if( 0 == stack_pos ) { - /* we lie about the size of the next element in order to - * make sure we exit the main loop. 
- */ - out_size = iov_count; - goto complete_loop; /* completed */ - } - stack_pos--; - pStack--; - pos_desc++; - } else { - pos_desc = pStack->index + 1; - if (threadIdx.x == 0) { - if( pStack->index == -1 ) { - pStack->disp += extent; - } else { - // assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); - pStack->disp += description[pStack->index].loop.extent; - } - } - __syncthreads(); - } - conv_ptr = pBaseBuf + pStack->disp; - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - // DO_DEBUG( opal_output( 0, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", - // (int)pStack->count, pConvertor->stack_pos, pos_desc, - // count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); - } - if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { - OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; - if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - pack_contiguous_loop_cuda_kernel( pElem, &count_desc, - &conv_ptr, &iov_ptr, &iov_len_local ); - if( 0 == count_desc ) { /* completed */ - pos_desc += pElem->loop.items + 1; - goto update_loop_description; - } - /* Save the stack with the correct last_count value. 
*/ - } - local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; - - PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, - pStack->disp + local_disp); - - pos_desc++; - update_loop_description: /* update the current state */ - conv_ptr = pBaseBuf + pStack->disp; - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); - continue; - } - } - complete_loop: - if (threadIdx.x == 0) { - iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ - } - __syncthreads(); - total_packed += iov[iov_count].iov_len; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (ct != description_dist_d[blockIdx.x].description_used) { - printf("I am at the end, but error,ct %d\n", ct); - } -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - - // if (tid == 0) { - // cuda_desc->max_data = total_packed; - // cuda_desc->out_size = iov_count; - // // cuda_desc->bConverted += total_packed; /* update the already converted bytes */ - // // if( cuda_desc->bConverted == cuda_desc->local_size ) { - // // cuda_desc->stack_pos = stack_pos; - // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); - // // return; - // // } - // // /* Save the global position for the next round */ - // // PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_INT8, count_desc, - // // conv_ptr - pBaseBuf ); - // // memcpy(cuda_desc->pStack, pStack_head, sizeof(dt_stack_t)*cuda_desc->stack_size); - // // cuda_desc->stack_pos = stack_pos; - // } - - return; -} - __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, @@ -593,10 +70,10 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, // // } -__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist) +__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* 
cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, _copy_count; - unsigned char *src, *dst; + size_t src_offset, dst_offset; uint8_t alignment; unsigned char *_source_tmp, *_destination_tmp; @@ -609,18 +86,18 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src = cuda_iov_dist[blockIdx.x].src[i]; - dst = cuda_iov_dist[blockIdx.x].dst[i]; - _copy_count = cuda_iov_dist[blockIdx.x].nb_elements[i]; - alignment = cuda_iov_dist[blockIdx.x].element_alignment[i]; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; + _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; + alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; // if (threadIdx.x == 0) { // printf("block %d, ali %d, nb_element %d\n", blockIdx.x, cuda_iov_dist[blockIdx.x].element_alignment[i], _copy_count); // } if (threadIdx.x < _copy_count) { - _source_tmp = src + threadIdx.x * alignment; - _destination_tmp = dst + threadIdx.x * alignment; + _source_tmp = source_base + src_offset + threadIdx.x * alignment; + _destination_tmp = destination_base + dst_offset + threadIdx.x * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 00c7812b605..efc0c7af957 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -7,169 +7,6 @@ #include #include -int32_t opal_generic_simple_pack_function_cuda( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) -{ - uint32_t i; - dt_elem_desc_t* description; - dt_elem_desc_t* pElem; - const 
opal_datatype_t *pData = pConvertor->pDesc; - uint32_t tasks_per_block, num_blocks, thread_per_block; - dt_stack_t* pStack; - - //return -99; - - description = pConvertor->use_desc->desc; - - cuda_desc_h->stack_pos = pConvertor->stack_pos; -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - cuda_desc_h->pBaseBuf = pConvertor->pBaseBuf; -#else - cuda_desc_h->pBaseBuf = pBaseBuf_GPU; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - cuda_desc_h->lb = pData->lb; - cuda_desc_h->ub = pData->ub; - cuda_desc_h->out_size = *out_size; - cuda_desc_h->max_data = *max_data; - cuda_desc_h->bConverted = pConvertor->bConverted; - cuda_desc_h->local_size = pConvertor->local_size; - cuda_desc_h->stack_size = pConvertor->stack_size; - - for (i = 0; i < pConvertor->stack_size; i++) { - cuda_desc_h->pStack[i] = pConvertor->pStack[i]; - } - if (cuda_desc_h->description_max_count != 0) { - if (cuda_desc_h->description_max_count >= (pConvertor->use_desc->used+1)) { - cuda_desc_h->description_count = pConvertor->use_desc->used+1; - } else { - cudaFree(cuda_desc_h->description); - cuda_desc_h->description = NULL; - cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); - cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; - cuda_desc_h->description_count = pConvertor->use_desc->used+1; - } - - } else { - cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); - cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; - cuda_desc_h->description_count = pConvertor->use_desc->used+1; - } - cudaMemcpy(cuda_desc_h->description, description, sizeof(dt_elem_desc_t)*(cuda_desc_h->description_count), cudaMemcpyHostToDevice); - printf("description ct %d\n", cuda_desc_h->description_count); - - // for (i = 0; i < pConvertor->use_desc->used+1; i++) { - // cuda_desc_h->description[i] = description[i]; - // } - - DBGPRINT("stack_size %d\n", pConvertor->stack_size); - - DBGPRINT("flags %d, types 
%d, count %d\n", description->elem.common.flags, description->elem.common.type, description->elem.count); - - for (i = 0; i < *out_size; i++) { -#if defined (OPAL_DATATYPE_CUDA_DRY_RUN) - cuda_desc_h->iov[i].iov_base = iov[i].iov_base; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - cuda_desc_h->iov[i].iov_len = iov[i].iov_len; - } - - pStack = pConvertor->pStack + pConvertor->stack_pos; - thread_per_block = CUDA_WARP_SIZE * 5; - tasks_per_block = thread_per_block * TASK_PER_THREAD; - num_blocks = ((uint32_t)pStack->count + tasks_per_block - 1) / tasks_per_block; - num_blocks = 512; - - /***/ - uint32_t pos_desc, count_desc, current_block, task_iteration, nb_blocks_per_description, j, dst_offset; - pos_desc = pStack->index; - pElem = &(description[pos_desc]); - count_desc = (uint32_t)pStack->count; - current_block = 0; - task_iteration = 0; - dst_offset = 0; - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - for (i = 0; i < nb_blocks_per_description; i++) { - description_dist_h[current_block].description_index[task_iteration] = pos_desc; - description_dist_h[current_block].description_local_index[task_iteration] = i; - description_dist_h[current_block].dst_offset[task_iteration] = dst_offset; - description_dist_h[current_block].description_used = task_iteration + 1; - if ( (i+1) * thread_per_block <= count_desc) { - dst_offset += thread_per_block; - } else { - dst_offset += thread_per_block - ((i+1)*thread_per_block - count_desc); - } - current_block += 1; - if (current_block >= num_blocks) { - current_block = 0; - task_iteration ++; - } - } - pos_desc ++; - pElem = &(description[pos_desc]); - count_desc = pElem->elem.count; - } - if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { - break; - } - } - - // for (i = 0; i < num_blocks; i++) { - // printf("block %d\t, used %d\n", i, description_dist_h[i].description_used); - // for (j = 0; j < 
description_dist_h[i].description_used; j++) { - // pos_desc = description_dist_h[i].description_index[j]; - // pElem = &(description[pos_desc]); - // printf("i %d\t, descp_pos %d\t, local_index %d\t, count %d\t, dst offset %d\n", j, description_dist_h[i].description_index[j], description_dist_h[i].description_local_index[j], pElem->elem.count, description_dist_h[i].dst_offset[j]); - // } - // } - - cudaMemcpy(cuda_desc_h->description_dist, description_dist_h, sizeof(ddt_cuda_description_dist_t)*(num_blocks), cudaMemcpyHostToDevice); - /***/ - - cudaMemcpy(cuda_desc_d, cuda_desc_h, sizeof(ddt_cuda_desc_t), cudaMemcpyHostToDevice); - - printf("launch pack kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*thread_per_block); - opal_generic_simple_pack_cuda_kernel_v2<<>>(cuda_desc_d); -#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) - size_t position = pConvertor->pDesc->size; -// opal_convertor_set_position_nocheck(pConvertor, &position); -#endif - cudaDeviceSynchronize(); - - return 1; - - -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - return -99; -#else - // /* copy stack and description data back to CPU */ - // cudaMemcpy(cuda_desc_h, cuda_desc_d, sizeof(ddt_cuda_desc_t), cudaMemcpyDeviceToHost); - // - // for (i = 0; i < pConvertor->stack_size; i++) { - // pConvertor->pStack[i] = cuda_desc_h->pStack[i]; - // } - // - // pConvertor->stack_pos = cuda_desc_h->stack_pos; - // *out_size = cuda_desc_h->out_size; - // *max_data = cuda_desc_h->max_data; - // pConvertor->bConverted = cuda_desc_h->bConverted; - // pConvertor->local_size = cuda_desc_h->local_size; - // - // for (i = 0; i < *out_size; i++) { - // iov[i].iov_len = cuda_desc_h->iov[i].iov_len; - // } - // - if( pConvertor->bConverted == pConvertor->local_size ) { - // pConvertor->flags |= CONVERTOR_COMPLETED; - return 1; - } - - return 0; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - -} int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConvertor, 
struct iovec* iov, @@ -396,7 +233,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", (void*)pConvertor, (void*)pConvertor->pBaseBuf, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ); @@ -414,7 +251,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert pConvertor->stack_pos--; pElem = &(description[pos_desc]); - DT_CUDA_DEBUG( opal_cuda_output( 1, "pack start pos_desc %d count_desc %d disp %ld\n" + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack start pos_desc %d count_desc %d disp %ld\n" "stack_pos %d pos_desc %d count_desc %d disp %ld\n", pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); @@ -475,7 +312,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - DT_CUDA_DEBUG( opal_cuda_output( 2, "pack end_loop count %d stack_pos %d" + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack end_loop count %d stack_pos %d" " pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); @@ -501,7 +338,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - DT_CUDA_DEBUG( opal_cuda_output( 2, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp 
%ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); } @@ -537,7 +374,6 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert complete_loop: iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_packed += iov[iov_count].iov_len; - // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -547,15 +383,15 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); ); #endif } *max_data = total_packed; pConvertor->bConverted += total_packed; /* update the already converted bytes */ *out_size = iov_count; + DT_CUDA_DEBUG( opal_cuda_output( 2, "Pack total packed %lu\n", total_packed); ); if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { printf("free\n"); opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); @@ -566,7 +402,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert /* Save the global position for the next round */ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, conv_ptr - pConvertor->pBaseBuf ); - DT_CUDA_DEBUG( opal_cuda_output( 2, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack save stack stack_pos %d pos_desc %d count_desc %d disp 
%ld\n", pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } @@ -589,15 +425,11 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Pack using contiguous_loop_cuda\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); -#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) - // _source = pBaseBuf_GPU; - // _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; -#endif #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -625,7 +457,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: vector packing in %ld microsec\n", total_time ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector packing in %ld microsec\n", total_time ); ); #endif } @@ -650,7 +482,7 @@ void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda_pipeline\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Pack using contiguous_loop_cuda_pipeline\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); @@ -696,7 +528,7 @@ void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: vector packing in %ld microsec\n", total_time ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector packing in %ld microsec\n", total_time ); ); #endif } @@ -718,7 +550,7 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda_memcpy2d\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 
2, "Pack using contiguous_loop_cuda_memcpy2d\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); @@ -741,7 +573,7 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: vector packing with memcpy2d in %ld microsec\n", total_time ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector packing with memcpy2d in %ld microsec\n", total_time ); ); #endif } @@ -764,7 +596,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in pack_contiguous_loop_cuda_zerocopy\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Pack using contiguous_loop_cuda_zerocopy\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); @@ -797,7 +629,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: vector packing in %ld microsec\n", total_time ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector packing in %ld microsec\n", total_time ); ); #endif } @@ -810,16 +642,16 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, residue_desc; uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec, dst_offset; - unsigned char *destination, *destination_tmp; + unsigned char *destination, *destination_base, *source_base; size_t total_packed, total_converted; int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; uint32_t convertor_flags; - dt_elem_desc_t* description; - dt_elem_desc_t* pElem; - dt_stack_t* pStack; +// dt_elem_desc_t* description; +// dt_elem_desc_t* 
pElem; +// dt_stack_t* pStack; uint8_t alignment, orig_alignment; - int32_t orig_stack_index; +// int32_t orig_stack_index; ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; @@ -829,12 +661,11 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor long total_time, move_time; #endif - DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype PACKING using iovec\n"); ); - - description = pConvertor->use_desc->desc; + /*description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); -// printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); + printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); + */ // assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); @@ -869,24 +700,19 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } transfer_required = 1; free_required = 1; -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - destination = (unsigned char*)iov[0].iov_base; -#else destination = pConvertor->gpu_buffer_ptr; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ } } - destination_tmp = destination; - - DT_CUDA_DEBUG ( opal_cuda_output(0, "Pack GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); cuda_iov_count = 1000;//CUDA_NB_IOV; total_packed = 0; total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; - orig_stack_index = pStack->index; + // orig_stack_index = pStack->index; + destination_base = destination; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start_total); @@ -896,12 +722,12 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* 
pConvertor GET_TIME(start); #endif complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "PACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); #endif dst_offset = 0; @@ -914,7 +740,8 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor current_block = 0; task_iteration = 0; cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; - cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; + cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; + source_base = (unsigned char*)cuda_iov[0].iov_base; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -924,11 +751,12 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } for (i = 0; i < cuda_iov_count; i++) { - pElem = &(description[orig_stack_index+i]); + /* pElem = &(description[orig_stack_index+i]);*/ if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { - orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; length_per_iovec = buffer_size / orig_alignment * orig_alignment; buffer_isfull = 1; } @@ -949,12 +777,11 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor 
count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "PACKING description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[current_block].src[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; - cuda_iov_dist_h_current[current_block].dst[task_iteration] = destination; - cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = alignment; - cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = thread_per_block;// * sizeof(double); } else { @@ -963,9 +790,8 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], 
cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); - current_block += 1; + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; if (current_block >= nb_blocks) { current_block = 0; @@ -976,18 +802,17 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor /* handle residue */ if (residue_desc != 0) { - orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; - cuda_iov_dist_h_current[current_block].src[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; - cuda_iov_dist_h_current[current_block].dst[task_iteration] = destination; - cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; - cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = 
(length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destination += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "PACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); - current_block += 1; + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; if (current_block >= nb_blocks) { current_block = 0; @@ -1004,11 +829,11 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_tmp, total_time, cuda_streams->current_stream_id, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, 
sizeof(ddt_cuda_iov_dist_t)*(nb_blocks), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); - opal_generic_simple_pack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current); + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); + opal_generic_simple_pack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); cuda_streams->current_stream_id ++; cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; @@ -1023,21 +848,20 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor GET_TIME(start); #endif convertor_flags = pConvertor->flags; - orig_stack_index = pStack->index; +// orig_stack_index = pStack->index; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "PACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); #endif } - cudaDeviceSynchronize(); - /* for (i = 0; i < NB_STREAMS; i++) { + for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - }*/ + } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -1048,7 +872,7 @@ int32_t 
opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); move_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); #endif // float *vtmp = (float *)iov[0].iov_base; // DT_CUDA_DEBUG ( opal_cuda_output(0, "packed iov buffer, total packed %d\n", total_packed); ); @@ -1060,12 +884,12 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor iov[0].iov_len = total_packed; *max_data = total_packed; *out_size = 1; - DT_CUDA_DEBUG ( opal_cuda_output(0, "PACKING total packed %d\n", total_packed); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); total_time = ELAPSED_TIME( start_total, end_total ); - printf( "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); #endif if( pConvertor->bConverted == pConvertor->local_size ) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 3303e6fe9f5..2ea3bb59885 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -5,257 +5,11 @@ #include #include -__device__ void unpack_contiguous_loop_cuda_kernel( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) -{ - ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); - ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); - unsigned char* _dst_disp = 
(*DESTINATION) + _end_loop->first_elem_disp; - uint32_t _copy_loops = *(COUNT); - uint32_t _i, tid, num_threads; - unsigned char* _source = *SOURCE; -// unsigned char* _source = _src_disp; - uint32_t gap, nb_elements; - double *_source_tmp, *_destination_tmp, *_dst_disp_tmp;; - - tid = threadIdx.x + blockIdx.x * blockDim.x; - num_threads = gridDim.x * blockDim.x; - - if( (_copy_loops * _end_loop->size) > *(SPACE) ) - _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); - - gap = (_loop->extent - _end_loop->size) / 8; - nb_elements = _end_loop->size / 8; - _dst_disp_tmp = (double*)_dst_disp; - _source_tmp = (double*)_source; - _destination_tmp = _dst_disp_tmp + tid; - _source_tmp += tid; - - __syncthreads(); - for (_i = tid; _i < _copy_loops*nb_elements; _i+=num_threads) { - _destination_tmp = _dst_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - if (_i % nb_elements == 0 ) { - DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", - tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); - } - // if (_i / nb_elements ==1 && tid == 0 ) { - // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", - // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); - // } -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ -#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) - *_destination_tmp = *_source_tmp; -#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ - _source_tmp += num_threads; -// _source_tmp += num_threads; - - } - *(DESTINATION) = _dst_disp + _copy_loops*_loop->extent - _end_loop->first_elem_disp; - *(SOURCE) = *(SOURCE) + _copy_loops * _end_loop->size; - *(SPACE) -= _copy_loops * _end_loop->size; - *(COUNT) -= _copy_loops; - - __syncthreads(); -} - -__global__ void opal_generic_simple_unpack_cuda_kernel(ddt_cuda_desc_t* cuda_desc) -{ - dt_stack_t* pStack; /* pointer to the position on the stack */ - uint32_t pos_desc; /* actual position in the description of the derived datatype */ - uint32_t count_desc; /* the number of items already done in the actual pos_desc */ - size_t total_unpacked = 0; /* total size unpacked this time */ - dt_elem_desc_t* description; - dt_elem_desc_t* pElem; - unsigned char *conv_ptr, *iov_ptr, *pBaseBuf; - size_t iov_len_local; - uint32_t iov_count; - uint32_t stack_pos; - struct iovec* iov; - - OPAL_PTRDIFF_TYPE lb; - OPAL_PTRDIFF_TYPE ub; - uint32_t out_size; - uint32_t tid; - - tid = threadIdx.x + blockIdx.x * blockDim.x; - - // __shared__ ddt_cuda_desc_t cuda_desc_b; - __shared__ dt_stack_t shared_pStack[DT_STATIC_STACK_SIZE]; - - if (threadIdx.x < DT_STATIC_STACK_SIZE) { - shared_pStack[threadIdx.x] = cuda_desc->pStack[threadIdx.x]; - } - __syncthreads(); - - // load cuda descriptor from constant memory - iov = cuda_desc->iov; - pStack = shared_pStack; - description = cuda_desc->description; - stack_pos = cuda_desc->stack_pos; - pBaseBuf = cuda_desc->pBaseBuf; - lb = cuda_desc->lb; - ub = cuda_desc->ub; - out_size = cuda_desc->out_size; - - /* For the first step we have to add both displacement to the source. After in the - * main while loop we will set back the source_base to the correct value. 
This is - * due to the fact that the convertor can stop in the middle of a data with a count - */ - pStack = pStack + stack_pos; - pos_desc = pStack->index; - conv_ptr = pBaseBuf + pStack->disp; - count_desc = (uint32_t)pStack->count; - pStack--; - stack_pos--; - pElem = &(description[pos_desc]); - - - for( iov_count = 0; iov_count < out_size; iov_count++ ) { - iov_ptr = (unsigned char *) iov[iov_count].iov_base; - iov_len_local = iov[iov_count].iov_len; - // if( 0 != pConvertor->partial_length ) { - // size_t element_length = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; - // size_t missing_length = element_length - pConvertor->partial_length; - // - // assert( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ); - // COMPUTE_CSUM( iov_ptr, missing_length, pConvertor ); - // opal_unpack_partial_datatype( pConvertor, pElem, - // iov_ptr, - // pConvertor->partial_length, element_length - pConvertor->partial_length, - // &conv_ptr ); - // --count_desc; - // if( 0 == count_desc ) { - // conv_ptr = pConvertor->pBaseBuf + pStack->disp; - // pos_desc++; /* advance to the next data */ - // UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - // } - // iov_ptr += missing_length; - // iov_len_local -= missing_length; - // pConvertor->partial_length = 0; /* nothing more inside */ - // } - while( 1 ) { - while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { - /* now here we have a basic datatype */ - // UNPACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - // iov_ptr, conv_ptr, iov_len_local ); - if( 0 == count_desc ) { /* completed */ - conv_ptr = pBaseBuf + pStack->disp; - pos_desc++; /* advance to the next data */ - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - continue; - } - // assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); - if( 0 != iov_len_local ) { - unsigned char* temp = conv_ptr; - /* We have some partial data here. 
Let's copy it into the convertor - * and keep it hot until the next round. - */ - // assert( iov_len_local < opal_datatype_basicDatatypes[pElem->elem.common.type]->size ); - // COMPUTE_CSUM( iov_ptr, iov_len_local, pConvertor ); - // - // opal_unpack_partial_datatype( pConvertor, pElem, - // iov_ptr, 0, iov_len_local, - // &temp ); - // - // pConvertor->partial_length = (uint32_t)iov_len_local; - iov_len_local = 0; - } - goto complete_loop; - } - if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - // DO_DEBUG( opal_output( 0, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", - // (int)pStack->count, pConvertor->stack_pos, pos_desc, - // (long)pStack->disp, (unsigned long)iov_len_local ); ); - if (threadIdx.x == 0) { - (pStack->count)--; - } - __syncthreads(); - - if( pStack->count == 0 ) { /* end of loop */ - if( 0 == stack_pos ) { - /* Do the same thing as when the loop is completed */ - iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ - total_unpacked += iov[iov_count].iov_len; - iov_count++; /* go to the next */ - goto complete_conversion; - } - stack_pos--; - pStack--; - pos_desc++; - } else { - pos_desc = pStack->index + 1; - if (threadIdx.x == 0) { - if( pStack->index == -1 ) { - pStack->disp += (ub - lb); - } else { - //assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); - pStack->disp += description[pStack->index].loop.extent; - } - } - __syncthreads(); - } - conv_ptr = pBaseBuf + pStack->disp; - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - // DO_DEBUG( opal_output( 0, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", - // (int)pStack->count, pConvertor->stack_pos, pos_desc, - // (long)pStack->disp, (unsigned long)iov_len_local ); ); - } - if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { - OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; - if( pElem->loop.common.flags & 
OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - unpack_contiguous_loop_cuda_kernel( pElem, &count_desc, - &iov_ptr, &conv_ptr, &iov_len_local ); - count_desc = 0; - if( 0 == count_desc ) { /* completed */ - pos_desc += pElem->loop.items + 1; - goto update_loop_description; - } - /* Save the stack with the correct last_count value. */ - } - local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; - PUSH_STACK( pStack, stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, - pStack->disp + local_disp); - pos_desc++; - update_loop_description: /* update the current state */ - conv_ptr = pBaseBuf + pStack->disp; - UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - // DDT_DUMP_STACK( pConvertor->pStack, pConvertor->stack_pos, pElem, "advance loop" ); - continue; - } - } - complete_loop: - if (threadIdx.x == 0) { - iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ - } - __syncthreads(); - total_unpacked += iov[iov_count].iov_len; - } - complete_conversion: - if (tid == 0) { - cuda_desc->max_data = total_unpacked; - // pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ - cuda_desc->out_size = iov_count; - // if( pConvertor->bConverted == pConvertor->remote_size ) { - // pConvertor->flags |= CONVERTOR_COMPLETED; - // return 1; - // } - // /* Save the global position for the next round */ - // PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_UINT1, count_desc, - // conv_ptr - pConvertor->pBaseBuf ); - // DO_DEBUG( opal_output( 0, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", - // pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); - } -} - -__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist) +__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, _copy_count; 
- unsigned char *src, *dst; + size_t src_offset, dst_offset; uint8_t alignment; unsigned char *_source_tmp, *_destination_tmp; @@ -267,14 +21,14 @@ __global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src = cuda_iov_dist[blockIdx.x].src[i]; - dst = cuda_iov_dist[blockIdx.x].dst[i]; - _copy_count = cuda_iov_dist[blockIdx.x].nb_elements[i]; - alignment = cuda_iov_dist[blockIdx.x].element_alignment[i]; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; + _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; + alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; if (threadIdx.x < _copy_count) { - _source_tmp = src + threadIdx.x * alignment; - _destination_tmp = dst + threadIdx.x * alignment; + _source_tmp = source_base + src_offset + threadIdx.x * alignment; + _destination_tmp = destination_base + dst_offset + threadIdx.x * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index c268fe2fb94..52f9acccc09 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -7,108 +7,6 @@ #include #include -int32_t opal_generic_simple_unpack_function_cuda( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) -{ - uint32_t i; - dt_elem_desc_t* description; - const opal_datatype_t *pData = pConvertor->pDesc; - uint32_t tasks_per_block, num_blocks, thread_per_block; - dt_stack_t* pStack; - - return -99; - description = pConvertor->use_desc->desc; - - cuda_desc_h->stack_pos = pConvertor->stack_pos; -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - 
cuda_desc_h->pBaseBuf = pConvertor->pBaseBuf; -#else - cuda_desc_h->pBaseBuf = pBaseBuf_GPU; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - cuda_desc_h->lb = pData->lb; - cuda_desc_h->ub = pData->ub; - cuda_desc_h->out_size = *out_size; - cuda_desc_h->max_data = *max_data; - cuda_desc_h->bConverted = pConvertor->bConverted; - cuda_desc_h->local_size = pConvertor->local_size; - cuda_desc_h->stack_size = pConvertor->stack_size; - - for (i = 0; i < pConvertor->stack_size; i++) { - cuda_desc_h->pStack[i] = pConvertor->pStack[i]; - } - if (cuda_desc_h->description_max_count != 0) { - if (cuda_desc_h->description_max_count >= (pConvertor->use_desc->used+1)) { - cuda_desc_h->description_count = pConvertor->use_desc->used+1; - } else { - cudaFree(cuda_desc_h->description); - cuda_desc_h->description = NULL; - cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); - cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; - cuda_desc_h->description_count = pConvertor->use_desc->used+1; - } - - } else { - cudaMalloc((void **)&(cuda_desc_h->description), sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1)); - cuda_desc_h->description_max_count = pConvertor->use_desc->used+1; - cuda_desc_h->description_count = pConvertor->use_desc->used+1; - } - cudaMemcpy(cuda_desc_h->description, description, sizeof(dt_elem_desc_t)*(pConvertor->use_desc->used+1), cudaMemcpyHostToDevice); - - DBGPRINT("stack_size %d\n", pConvertor->stack_size); - - DBGPRINT("flags %d, types %d, count %d\n", description->elem.common.flags, description->elem.common.type, description->elem.count); - - for (i = 0; i < *out_size; i++) { -#if defined (OPAL_DATATYPE_CUDA_DRY_RUN) - cuda_desc_h->iov[i].iov_base = iov[i].iov_base; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ - cuda_desc_h->iov[i].iov_len = iov[i].iov_len; - } - - cudaMemcpy(cuda_desc_d, cuda_desc_h, sizeof(ddt_cuda_desc_t), cudaMemcpyHostToDevice); - - pStack = pConvertor->pStack + 
pConvertor->stack_pos; - thread_per_block = CUDA_WARP_SIZE * 3; - tasks_per_block = thread_per_block * TASK_PER_THREAD; - num_blocks = ((uint32_t)pStack->count + tasks_per_block - 1) / tasks_per_block; - printf("launch unpack kernel, count %d, num_blocks %d, total threads %d\n", (uint32_t)pStack->count, num_blocks, num_blocks*thread_per_block); - opal_generic_simple_unpack_cuda_kernel<<<192, thread_per_block>>>(cuda_desc_d); -#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) - size_t position = pConvertor->pDesc->size; - opal_convertor_set_position_nocheck(pConvertor, &position); -#endif - cudaDeviceSynchronize(); - -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - return -99; -#else - // /* copy stack and description data back to CPU */ - // cudaMemcpy(cuda_desc_h, cuda_desc_d, sizeof(ddt_cuda_desc_t), cudaMemcpyDeviceToHost); - // - // for (i = 0; i < pConvertor->stack_size; i++) { - // pConvertor->pStack[i] = cuda_desc_h->pStack[i]; - // } - // - // pConvertor->stack_pos = cuda_desc_h->stack_pos; - // *out_size = cuda_desc_h->out_size; - // *max_data = cuda_desc_h->max_data; - // pConvertor->bConverted = cuda_desc_h->bConverted; - // pConvertor->local_size = cuda_desc_h->local_size; - // - // for (i = 0; i < *out_size; i++) { - // iov[i].iov_len = cuda_desc_h->iov[i].iov_len; - // } - // - if( pConvertor->bConverted == pConvertor->local_size ) { - // pConvertor->flags |= CONVERTOR_COMPLETED; - return 1; - } - - return 0; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ -} int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -305,7 +203,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "opal_convertor_generic_simple_unpack_vector( %p, {%p, %lu}, %u , %u)\n", (void*)pConvertor, iov[0].iov_base, (unsigned 
long)iov[0].iov_len, *out_size, *max_data ); ) description = pConvertor->use_desc->desc; @@ -322,7 +220,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv pConvertor->stack_pos--; pElem = &(description[pos_desc]); - DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack start pos_desc %d count_desc %d disp %ld\n" + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack start pos_desc %d count_desc %d disp %ld\n" "stack_pos %d pos_desc %d count_desc %d disp %ld\n", pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); @@ -351,7 +249,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); ); #endif iov_len_local = iov[iov_count].iov_len; if( 0 != pConvertor->partial_length ) { @@ -369,7 +267,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); if( --(pStack->count) == 0 ) { /* end of loop */ @@ -394,7 +292,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv } conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - DT_CUDA_DEBUG( opal_cuda_output( 2, 
"unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); } @@ -433,9 +331,9 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack total unpacked %lu\n", total_unpacked); ); if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Total unpacked %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; @@ -445,7 +343,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv /* Save the global position for the next round */ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, conv_ptr - pConvertor->pBaseBuf ); - DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } @@ -459,17 +357,17 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, dst_offset, residue_desc; uint32_t nb_blocks, thread_per_block; size_t length, buffer_size, length_per_iovec; - unsigned char *source, *source_tmp; + unsigned char *source, *source_base, *destination_base; size_t total_unpacked, total_converted; int32_t 
complete_flag = 0; uint8_t buffer_isfull = 0; uint8_t free_required = 0; uint32_t convertor_flags; - dt_elem_desc_t* description; - dt_elem_desc_t* pElem; - dt_stack_t* pStack; +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// dt_stack_t* pStack; uint8_t alignment, orig_alignment; - int32_t orig_stack_index; +// int32_t orig_stack_index; ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; @@ -482,18 +380,13 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start_total); #endif - - description = pConvertor->use_desc->desc; + +/* description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); - DT_CUDA_DEBUG ( opal_cuda_output(0, "GPU datatype UNpacking using iovec\n"); ); - - // double *vtmp = (double *)iov[0].iov_base; - // for (uint32_t i = 0; i < iov[0].iov_len/sizeof(double); i++) { - // printf(" %1.f ", *vtmp); - // vtmp ++; - // } - // printf("\n"); + printf("size elem %d, size %lu\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); +*/ + #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -506,26 +399,22 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert pConvertor->gpu_buffer_ptr = NULL; free_required = 0; } else { -#if defined(OPAL_DATATYPE_CUDA_DRY_RUN) - source = (unsigned char*)iov[0].iov_base; -#else if (pConvertor->gpu_buffer_ptr == NULL) { pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); } source = pConvertor->gpu_buffer_ptr; -#endif /* OPAL_DATATYPE_CUDA_DRY_RUN */ cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); free_required = 1; } } - source_tmp = source; - DT_CUDA_DEBUG ( opal_cuda_output(0, "UNpack GPU base %p, unpack from buffer %p, total size %ld\n", pConvertor->pBaseBuf, source, 
iov[0].iov_len); ); -#if defined(OPAL_DATATYPE_CUDA_TIMING) + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, iov[0].iov_len); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); move_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); #endif @@ -538,14 +427,15 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; - orig_stack_index = pStack->index; +// orig_stack_index = pStack->index; + source_base = source; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "UNPACKING complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); #endif dst_offset = 0; @@ -557,8 +447,9 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert current_block = 0; task_iteration = 0; cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; - cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; - + cuda_iov_dist_d_current = 
cuda_iov_dist_d[cuda_streams->current_stream_id]; + destination_base = (unsigned char*)cuda_iov[0].iov_base; + #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -567,11 +458,12 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert } for (i = 0; i < cuda_iov_count; i++) { - pElem = &(description[orig_stack_index+i]); +// pElem = &(description[orig_stack_index+i]); if (buffer_size >= cuda_iov[i].iov_len) { length_per_iovec = cuda_iov[i].iov_len; } else { - orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; length_per_iovec = buffer_size / orig_alignment * orig_alignment; buffer_isfull = 1; } @@ -590,12 +482,11 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "UNPACKING description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[current_block].dst[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; - cuda_iov_dist_h_current[current_block].src[task_iteration] = source; - cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = alignment; - cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - 
source_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = thread_per_block;// * sizeof(double); } else { @@ -604,35 +495,25 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "UNPACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); - current_block += 1; - if (current_block >= nb_blocks) { - current_block = 0; - task_iteration ++; - assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); - } + source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; } /* handle residue */ if (residue_desc != 0) { - orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size; - cuda_iov_dist_h_current[current_block].dst[task_iteration] = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; - cuda_iov_dist_h_current[current_block].src[task_iteration] = source; - 
cuda_iov_dist_h_current[current_block].element_alignment[task_iteration] = orig_alignment; - cuda_iov_dist_h_current[current_block].nb_tasks = task_iteration + 1; - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source += cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "UNPACKING \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", current_block, cuda_iov_dist_h_current[current_block].src[task_iteration], cuda_iov_dist_h_current[current_block].dst[task_iteration], cuda_iov_dist_h_current[current_block].nb_elements[task_iteration], cuda_iov_dist_h_current[current_block].element_alignment[task_iteration]); ); - current_block += 1; - if (current_block >= nb_blocks) { - current_block = 0; - task_iteration ++; - assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); - } + source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, 
cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; } if (buffer_isfull) { @@ -643,11 +524,11 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: UNpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d\n", source_tmp, total_time, cuda_streams->current_stream_id); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d\n", source_base, total_time, cuda_streams->current_stream_id); ); #endif - - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); - opal_generic_simple_unpack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current); + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); + opal_generic_simple_unpack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); cuda_streams->current_stream_id ++; cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; @@ -663,13 +544,13 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert convertor_flags = pConvertor->flags; #endif convertor_flags = pConvertor->flags; - orig_stack_index = pStack->index; +// orig_stack_index = pStack->index; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); - DT_CUDA_DEBUG ( opal_cuda_output(8, "UNPACKING complete flag %d, iov count %d, length %d, submit to 
CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack complete flag %d, iov count %d, length %d, submit to CUDA stream %d, nb_blocks %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id, nb_blocks_used); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: ddt to iov in %ld microsec\n", total_time ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); #endif } @@ -680,12 +561,12 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert iov[0].iov_len = total_unpacked; *max_data = total_unpacked; *out_size = 1; - DT_CUDA_DEBUG ( opal_cuda_output(0, "UNPACKING total unpacked %d\n", total_unpacked); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); total_time = ELAPSED_TIME( start_total, end_total ); - printf( "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); #endif if( pConvertor->bConverted == pConvertor->local_size ) { @@ -717,7 +598,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in unpack_contiguous_loop_cuda\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack using contiguous_loop_cuda\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); @@ -741,7 +622,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: vector unpacking in %ld microsec\n", total_time ); + 
DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector unpacking in %ld microsec\n", total_time ); ); #endif } @@ -763,7 +644,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in unpack_contiguous_loop_cuda_memcpy2d\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack using contiguous_loop_cuda_memcpy2d\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); @@ -784,7 +665,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: vector unpacking with memcpy2d in %ld microsec\n", total_time ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector unpacking with memcpy2d in %ld microsec\n", total_time ); ); #endif } @@ -807,7 +688,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 0, "I am in unpack_contiguous_loop_cuda_zerocopy\n"); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack using contiguous_loop_cuda_zerocopy\n"); ); if( (_copy_loops * _end_loop->size) > *(SPACE) ) _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); @@ -838,7 +719,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: vector unpacking in %ld microsec\n", total_time ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector unpacking in %ld microsec\n", total_time ); ); #endif } diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c index ef7a8f41d27..095cd477dd3 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -45,15 +45,6 @@ void (*opal_datatype_cuda_init_p)(void) = NULL; void (*opal_datatype_cuda_fini_p)(void) = NULL; -int32_t 
(*opal_generic_simple_pack_function_cuda_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) = NULL; - -int32_t (*opal_generic_simple_unpack_function_cuda_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) = NULL; int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, @@ -95,8 +86,6 @@ void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, void (*opal_cuda_sync_device_p)(void) = NULL; -unsigned char* (*opal_cuda_get_gpu_pack_buffer_p)(void) = NULL; - void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id) = NULL; void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id) = NULL; @@ -129,8 +118,6 @@ int32_t opal_datatype_gpu_init(void) } OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_datatype_cuda_init ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_datatype_cuda_fini ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_unpack_function_cuda ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda_iov ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_unpack_function_cuda_iov ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda_vector ); @@ -139,12 +126,11 @@ int32_t opal_datatype_gpu_init(void) OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, unpack_contiguous_loop_cuda ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, pack_predefined_data_cuda ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_sync_device ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( 
opal_datatype_cuda_handle, opal_cuda_get_gpu_pack_buffer ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_free_gpu_buffer ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_malloc_gpu_buffer ); (*opal_datatype_cuda_init_p)(); - printf("cuda init done\n"); + opal_output( 0, "cuda init done\n"); } return OPAL_SUCCESS; } @@ -156,8 +142,6 @@ int32_t opal_datatype_gpu_fini(void) /* Reset all functions to NULL */ opal_datatype_cuda_init_p = NULL; opal_datatype_cuda_fini_p = NULL; - opal_generic_simple_pack_function_cuda_p = NULL; - opal_generic_simple_unpack_function_cuda_p = NULL; opal_generic_simple_pack_function_cuda_iov_p = NULL; opal_generic_simple_unpack_function_cuda_iov_p = NULL; opal_generic_simple_pack_function_cuda_vector_p = NULL; @@ -166,7 +150,6 @@ int32_t opal_datatype_gpu_fini(void) unpack_contiguous_loop_cuda_p = NULL; pack_predefined_data_cuda_p = NULL; opal_cuda_sync_device_p = NULL; - opal_cuda_get_gpu_pack_buffer_p = NULL; opal_cuda_free_gpu_buffer_p = NULL; opal_cuda_malloc_gpu_buffer_p = NULL; @@ -176,21 +159,7 @@ int32_t opal_datatype_gpu_fini(void) if( NULL != opal_datatype_cuda_lib ) free(opal_datatype_cuda_lib); opal_datatype_cuda_lib = NULL; - printf("cuda fini done\n"); + opal_output( 0, "cuda fini done\n"); } return OPAL_SUCCESS; } - -unsigned char* opal_datatype_get_gpu_buffer(void) -{ -#if OPAL_DATATYPE_CUDA_KERNEL - if (opal_datatype_gpu_init() != OPAL_SUCCESS) { - opal_datatype_gpu_fini(); - return NULL; - } - return (*opal_cuda_get_gpu_pack_buffer_p)(); -#else - return NULL; -#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ - -} diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h index 887c8a0918b..d50e2fe8d99 100644 --- a/opal/datatype/opal_datatype_gpu.h +++ b/opal/datatype/opal_datatype_gpu.h @@ -5,21 +5,10 @@ int32_t opal_datatype_gpu_init(void); int32_t opal_datatype_gpu_fini(void); -unsigned char* opal_datatype_get_gpu_buffer(void); 
extern void (*opal_datatype_cuda_init_p)(void); extern void (*opal_datatype_cuda_fini_p)(void); - -extern int32_t (*opal_generic_simple_pack_function_cuda_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - -extern int32_t (*opal_generic_simple_unpack_function_cuda_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); extern int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, @@ -61,8 +50,6 @@ extern void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, extern void (*opal_cuda_sync_device_p)(void); -extern unsigned char* (*opal_cuda_get_gpu_pack_buffer_p)(void); - extern void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); extern void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index dacc343ba84..2e7bee3279b 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -495,8 +495,8 @@ create_sm_endpoint(int local_proc, struct opal_proc_t *proc) NULL, &resources); for (int i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - ep->smcuda_dt_pack_clone[i].lindex = -1; - ep->smcuda_dt_unpack_clone[i].lindex = -1; + ep->smcuda_ddt_pack_clone[i].lindex = -1; + ep->smcuda_ddt_unpack_clone[i].lindex = -1; } } #endif /* OPAL_CUDA_SUPPORT */ @@ -1159,7 +1159,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, convertor->gpu_buffer_ptr = remote_memory_address; } if (pack_required) { - mca_btl_smcuda_cuda_dt_unpack_clone(convertor, ep, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_dt_unpack_clone(ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, 0, lindex, remote_device, local_device); cuda_dt_hdr_t send_msg; send_msg.lindex = lindex; @@ -1208,7 +1208,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, send_msg.seq = 0; 
send_msg.msg_type = CUDA_PACK_TO_LOCAL_START; } - mca_btl_smcuda_cuda_dt_unpack_clone(NULL, ep, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_dt_unpack_clone(ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, 0, lindex, 0, 0); mca_btl_smcuda_send_cuda_pack_sig(btl, ep, &send_msg); done = 0; @@ -1367,7 +1367,7 @@ int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t { int i; for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - if (endpoint->smcuda_dt_pack_clone[i].convertor == convertor) { + if (endpoint->smcuda_ddt_pack_clone[i].convertor == convertor) { return i; } } @@ -1376,7 +1376,7 @@ int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq) { - endpoint->smcuda_dt_pack_clone[lindex].seq = seq; + endpoint->smcuda_ddt_pack_clone[lindex].seq = seq; return 0; } @@ -1385,7 +1385,7 @@ int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint if (lindex >= SMCUDA_DT_CLONE_SIZE) { return -9; } else { - return endpoint->smcuda_dt_pack_clone[lindex].seq; + return endpoint->smcuda_ddt_pack_clone[lindex].seq; } } @@ -1394,7 +1394,7 @@ int mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(struct mca_btl_base_endpoint_t if (lindex >= SMCUDA_DT_CLONE_SIZE) { return -9; } else { - return endpoint->smcuda_dt_pack_clone[lindex].pipeline_size; + return endpoint->smcuda_ddt_pack_clone[lindex].pipeline_size; } } @@ -1402,7 +1402,7 @@ int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endp { int i; for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - if (endpoint->smcuda_dt_pack_clone[i].lindex == -1) { + if (endpoint->smcuda_ddt_pack_clone[i].lindex == -1) { return i; } } @@ -1412,7 +1412,7 @@ int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *en { int i; for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - if 
(endpoint->smcuda_dt_unpack_clone[i].lindex == -1) { + if (endpoint->smcuda_ddt_unpack_clone[i].lindex == -1) { return i; } } @@ -1421,51 +1421,47 @@ int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *en void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) { - assert(endpoint->smcuda_dt_pack_clone[lindex].lindex == lindex); - endpoint->smcuda_dt_pack_clone[lindex].lindex = -1; + assert(endpoint->smcuda_ddt_pack_clone[lindex].lindex == lindex); + endpoint->smcuda_ddt_pack_clone[lindex].lindex = -1; } void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) { - assert(endpoint->smcuda_dt_unpack_clone[lindex].lindex == lindex); - endpoint->smcuda_dt_unpack_clone[lindex].lindex = -1; + assert(endpoint->smcuda_ddt_unpack_clone[lindex].lindex == lindex); + endpoint->smcuda_ddt_unpack_clone[lindex].lindex = -1; } -void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, - struct mca_btl_base_endpoint_t *endpoint, +void mca_btl_smcuda_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, size_t pipeline_size, int lindex, uint8_t remote_device, uint8_t local_device) { - endpoint->smcuda_dt_pack_clone[lindex].convertor = convertor; - // endpoint->smcuda_dt_pack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; - endpoint->smcuda_dt_pack_clone[lindex].endpoint = endpoint; - endpoint->smcuda_dt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_dt_pack_clone[lindex].pipeline_size = pipeline_size; - endpoint->smcuda_dt_pack_clone[lindex].lindex = lindex; - endpoint->smcuda_dt_pack_clone[lindex].seq = -9; - endpoint->smcuda_dt_pack_clone[lindex].remote_device = remote_device; - endpoint->smcuda_dt_pack_clone[lindex].local_device = local_device; - endpoint->smcuda_dt_pack_clone[lindex].frag = frag; + 
endpoint->smcuda_ddt_pack_clone[lindex].convertor = convertor; + endpoint->smcuda_ddt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; + endpoint->smcuda_ddt_pack_clone[lindex].pipeline_size = pipeline_size; + endpoint->smcuda_ddt_pack_clone[lindex].lindex = lindex; + endpoint->smcuda_ddt_pack_clone[lindex].seq = -9; + endpoint->smcuda_ddt_pack_clone[lindex].remote_device = remote_device; + endpoint->smcuda_ddt_pack_clone[lindex].local_device = local_device; + endpoint->smcuda_ddt_pack_clone[lindex].frag = frag; } -void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, - struct mca_btl_base_endpoint_t *endpoint, +void mca_btl_smcuda_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, size_t pipeline_size, int lindex, uint8_t remote_device, uint8_t local_device) { - endpoint->smcuda_dt_unpack_clone[lindex].convertor = convertor; -// endpoint->smcuda_dt_unpack_clone[lindex].gpu_ptr = convertor->gpu_buffer_ptr; - endpoint->smcuda_dt_unpack_clone[lindex].endpoint = endpoint; - endpoint->smcuda_dt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_dt_unpack_clone[lindex].pipeline_size = pipeline_size; - endpoint->smcuda_dt_unpack_clone[lindex].lindex = lindex; - endpoint->smcuda_dt_unpack_clone[lindex].seq = -9; - endpoint->smcuda_dt_unpack_clone[lindex].remote_device = remote_device; - endpoint->smcuda_dt_unpack_clone[lindex].local_device = local_device; - endpoint->smcuda_dt_unpack_clone[lindex].frag = frag; + endpoint->smcuda_ddt_unpack_clone[lindex].convertor = convertor; + endpoint->smcuda_ddt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; + endpoint->smcuda_ddt_unpack_clone[lindex].pipeline_size = pipeline_size; + endpoint->smcuda_ddt_unpack_clone[lindex].lindex = lindex; + endpoint->smcuda_ddt_unpack_clone[lindex].seq = -9; + endpoint->smcuda_ddt_unpack_clone[lindex].remote_device 
= remote_device; + endpoint->smcuda_ddt_unpack_clone[lindex].local_device = local_device; + endpoint->smcuda_ddt_unpack_clone[lindex].frag = frag; } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index abd043f9f10..f9171ec8962 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -207,7 +207,7 @@ struct mca_btl_smcuda_component_t { int cuda_ipc_output; int use_cuda_ipc; int use_cuda_ipc_same_gpu; - int cuda_dt_pipeline_size; + int cuda_ddt_pipeline_size; #endif /* OPAL_CUDA_SUPPORT */ }; typedef struct mca_btl_smcuda_component_t mca_btl_smcuda_component_t; @@ -534,7 +534,6 @@ typedef struct { /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; - struct mca_btl_base_endpoint_t *endpoint; void *remote_gpu_address; size_t pipeline_size; int lindex; @@ -542,10 +541,10 @@ typedef struct { uint8_t remote_device; uint8_t local_device; mca_btl_base_descriptor_t *frag; -} cuda_dt_clone_t; +} cuda_ddt_clone_t; #define SMCUDA_DT_CLONE_SIZE 20 -extern cuda_dt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; +extern cuda_ddt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_dt_hdr_t *send_msg); int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_dt_hdr_t *send_msg); @@ -557,14 +556,14 @@ int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endp int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint); void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); -void mca_btl_smcuda_cuda_dt_pack_clone(struct opal_convertor_t *convertor, - struct 
mca_btl_base_endpoint_t *endpoint, +void mca_btl_smcuda_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, size_t pipeline_size, int lindex, uint8_t remote_device, uint8_t local_device); -void mca_btl_smcuda_cuda_dt_unpack_clone(struct opal_convertor_t *convertor, - struct mca_btl_base_endpoint_t *endpoint, +void mca_btl_smcuda_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, size_t pipeline_size, diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index de772340fa0..ee25fabd4e5 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -167,7 +167,7 @@ static int smcuda_register(void) mca_btl_smcuda_param_register_int("use_cuda_ipc", 1, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.use_cuda_ipc); mca_btl_smcuda_param_register_int("use_cuda_ipc_same_gpu", 1, OPAL_INFO_LVL_4,&mca_btl_smcuda_component.use_cuda_ipc_same_gpu); mca_btl_smcuda_param_register_int("cuda_ipc_verbose", 0, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ipc_verbose); - mca_btl_smcuda_param_register_int("cuda_ddt_pipeline_size", 1024*1024*400, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_dt_pipeline_size); + mca_btl_smcuda_param_register_int("cuda_ddt_pipeline_size", 1024*1024*400, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ddt_pipeline_size); mca_btl_smcuda_component.cuda_ipc_output = opal_output_open(NULL); opal_output_set_verbosity(mca_btl_smcuda_component.cuda_ipc_output, mca_btl_smcuda_component.cuda_ipc_verbose); #else /* OPAL_CUDA_SUPPORT */ @@ -861,14 +861,14 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, size_t packed_size = cuda_dt_hdr.packed_size; int msg_type = cuda_dt_hdr.msg_type; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; - 
cuda_dt_clone_t *my_cuda_dt_clone; + cuda_ddt_clone_t *my_cuda_dt_clone; /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; - my_cuda_dt_clone = &endpoint->smcuda_dt_unpack_clone[lindex]; + my_cuda_dt_clone = &endpoint->smcuda_ddt_unpack_clone[lindex]; assert(my_cuda_dt_clone->lindex == lindex); - printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); + printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", endpoint->my_smp_rank, seq, lindex); cuda_dt_hdr_t send_msg; send_msg.lindex = lindex; @@ -937,7 +937,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, int msg_type = cuda_dt_hdr.msg_type; size_t packed_size = cuda_dt_hdr.packed_size; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; - cuda_dt_clone_t *my_cuda_dt_clone; + cuda_ddt_clone_t *my_cuda_dt_clone; cuda_dt_hdr_t send_msg; uint32_t iov_count = 1; @@ -946,9 +946,9 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; - my_cuda_dt_clone = &endpoint->smcuda_dt_pack_clone[lindex]; + my_cuda_dt_clone = &endpoint->smcuda_ddt_pack_clone[lindex]; - printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", my_cuda_dt_clone->endpoint->my_smp_rank, seq, lindex); + printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", endpoint->my_smp_rank, seq, lindex); struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; send_msg.lindex = lindex; if (msg_type == CUDA_PACK_COMPLETE_ACK) { @@ -1000,7 +1000,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, send_msg.msg_type = CUDA_UNPACK_FROM_SEQ; } struct iovec iov; - packed_size = mca_btl_smcuda_component.cuda_dt_pipeline_size; + packed_size = 
mca_btl_smcuda_component.cuda_ddt_pipeline_size; printf("Pipeline_size %ld\n", packed_size); iov.iov_base = convertor->gpu_buffer_ptr; iov.iov_len = packed_size; diff --git a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h index e4df5ee56d0..f3b79866c14 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h +++ b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h @@ -49,8 +49,8 @@ struct mca_btl_base_endpoint_t { opal_proc_t *proc_opal; /**< Needed for adding CUDA IPC support dynamically */ enum ipcState ipcstate; /**< CUDA IPC connection status */ int ipctries; /**< Number of times CUDA IPC connect was sent */ - cuda_dt_clone_t smcuda_dt_pack_clone[SMCUDA_DT_CLONE_SIZE]; - cuda_dt_clone_t smcuda_dt_unpack_clone[SMCUDA_DT_CLONE_SIZE]; + cuda_ddt_clone_t smcuda_ddt_pack_clone[SMCUDA_DT_CLONE_SIZE]; + cuda_ddt_clone_t smcuda_ddt_unpack_clone[SMCUDA_DT_CLONE_SIZE]; #endif /* OPAL_CUDA_SUPPORT */ }; diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 2d25274ee9b..92bdf644d4d 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1211,12 +1211,12 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 500; mat_size <= 6000; mat_size +=500) { + for (mat_size = 500; mat_size <= 500; mat_size +=500) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { - // local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); + local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From 38db0e6815be71a7b3bd10100b17ac6606e53f9e Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Sun, 25 Oct 2015 18:54:31 -0400 Subject: [PATCH 125/190] rewrite pipeline --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 6 +- .../cuda/opal_datatype_cuda_internal.cuh | 2 +- 
.../cuda/opal_datatype_pack_cuda_wrapper.cu | 2 + .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 2 + opal/mca/btl/smcuda/btl_smcuda.c | 69 ++++------ opal/mca/btl/smcuda/btl_smcuda.h | 53 ++++---- opal/mca/btl/smcuda/btl_smcuda_component.c | 127 +++++++----------- 7 files changed, 114 insertions(+), 147 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index e41b75a99c6..befbf091b68 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -117,8 +117,10 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, printf("GPU data ready for GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); unsigned char *base; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); + // base = opal_cuda_malloc_gpu_buffer_p(4000000*4, 0); base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); convertor->gpu_buffer_ptr = base; + convertor->gpu_buffer_size = 4000000*4;//convertor->local_size; convertor->gpu_buffer_size = convertor->local_size; sendreq->req_send.req_bytes_packed = convertor->local_size; printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); @@ -128,7 +130,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, sendreq->req_send.req_bytes_packed, sendreq->req_rdma))) { - int lindex = mca_btl_smcuda_alloc_cuda_dt_pack_clone(bml_btl->btl_endpoint); + int lindex = mca_btl_smcuda_alloc_cuda_ddt_pack_clone(bml_btl->btl_endpoint); assert(lindex >= 0); rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { @@ -136,7 +138,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, return rc; } mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, lindex, 1, local_device); - mca_btl_smcuda_cuda_dt_pack_clone( bml_btl->btl_endpoint, convertor, NULL, NULL, 0, lindex, 0, local_device); + mca_btl_smcuda_cuda_ddt_pack_clone( 
bml_btl->btl_endpoint, convertor, NULL, NULL, lindex, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 160d54336d4..268554126ab 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -13,7 +13,7 @@ #define OPAL_DATATYPE_CUDA_DEBUG 1 //#define OPAL_DATATYPE_CUDA_KERNEL_TIME #define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 2 -#define OPAL_DATATYPE_CUDA_TIMING +//#define OPAL_DATATYPE_CUDA_TIMING #define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D 0 #define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 #define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index efc0c7af957..9ee6fc0f032 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -839,9 +839,11 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor /* buffer is full */ if (buffer_isfull) { + size_t total_converted_tmp = total_converted; pConvertor->flags = convertor_flags; total_converted += total_packed; opal_convertor_set_position_nocheck(pConvertor, &total_converted); + total_packed = total_converted - total_converted_tmp; break; } #if defined(OPAL_DATATYPE_CUDA_TIMING) diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 52f9acccc09..ba8a89e88cb 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -534,9 +534,11 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert /* buffer is full */ if (buffer_isfull) { + size_t total_converted_tmp = total_converted; pConvertor->flags = 
convertor_flags; total_converted += total_unpacked; opal_convertor_set_position_nocheck(pConvertor, &total_converted); + total_unpacked = total_converted - total_converted_tmp; break; } #if defined(OPAL_DATATYPE_CUDA_TIMING) diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 2e7bee3279b..14d0a3995ce 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1158,10 +1158,10 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, } else { convertor->gpu_buffer_ptr = remote_memory_address; } + cuda_ddt_hdr_t send_msg; if (pack_required) { - mca_btl_smcuda_cuda_dt_unpack_clone(ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, - 0, lindex, remote_device, local_device); - cuda_dt_hdr_t send_msg; + mca_btl_smcuda_cuda_ddt_unpack_clone(ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); send_msg.lindex = lindex; send_msg.packed_size = 0; send_msg.seq = 0; @@ -1188,13 +1188,13 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, } else { printf("RECEIVE REGT CONTIGUOUS, size %ld !!!!!!!!!!!\n", size); recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; + cuda_ddt_hdr_t send_msg; if (pack_required) { - cuda_dt_hdr_t send_msg; send_msg.lindex = lindex; send_msg.packed_size = 0; if (remote_device == local_device && OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { /* now we are able to let sender pack directly to my memory */ - mca_mpool_common_cuda_reg_t loc_reg; + /* mca_mpool_common_cuda_reg_t loc_reg; mca_mpool_common_cuda_reg_t *loc_reg_ptr = &loc_reg; cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL); memcpy(send_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); @@ -1203,13 +1203,13 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, send_msg.remote_address = local_address; send_msg.remote_base = 
loc_reg.base.base; mca_common_wait_stream_synchronize(&loc_reg); - printf("send r_addr %p, r_base %p\n", local_address, loc_reg.base.base); + printf("send r_addr %p, r_base %p\n", local_address, loc_reg.base.base);*/ } else { send_msg.seq = 0; send_msg.msg_type = CUDA_PACK_TO_LOCAL_START; } - mca_btl_smcuda_cuda_dt_unpack_clone(ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, - 0, lindex, 0, 0); + mca_btl_smcuda_cuda_ddt_unpack_clone(ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, 0, 0); mca_btl_smcuda_send_cuda_pack_sig(btl, ep, &send_msg); done = 0; } else { @@ -1319,11 +1319,11 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - cuda_dt_hdr_t *send_msg) + cuda_ddt_hdr_t *send_msg) { mca_btl_smcuda_frag_t* frag; int rc; - cuda_dt_hdr_t cuda_dt_hdr; + cuda_ddt_hdr_t cuda_dt_hdr; /* allocate a fragment, giving up if we can't get one */ MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); @@ -1334,7 +1334,7 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, /* Fill in fragment fields. 
*/ frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; - memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_dt_hdr_t)); + memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_ddt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); printf("######## rank %d, send seq %d, endpoint %p\n", endpoint->my_smp_rank, send_msg->seq, endpoint); @@ -1343,11 +1343,11 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, - cuda_dt_hdr_t *send_msg) + cuda_ddt_hdr_t *send_msg) { mca_btl_smcuda_frag_t* frag; int rc; - cuda_dt_hdr_t cuda_dt_hdr; + cuda_ddt_hdr_t cuda_dt_hdr; /* allocate a fragment, giving up if we can't get one */ MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); @@ -1357,7 +1357,7 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, /* Fill in fragment fields. 
*/ frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; - memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_dt_hdr_t)); + memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_ddt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_PACK); return rc; @@ -1389,16 +1389,7 @@ int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint } } -int mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(struct mca_btl_base_endpoint_t *endpoint, int lindex) -{ - if (lindex >= SMCUDA_DT_CLONE_SIZE) { - return -9; - } else { - return endpoint->smcuda_ddt_pack_clone[lindex].pipeline_size; - } -} - -int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint) +int mca_btl_smcuda_alloc_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint) { int i; for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { @@ -1408,7 +1399,7 @@ int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endp } return -1; } -int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint) +int mca_btl_smcuda_alloc_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint) { int i; for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { @@ -1419,27 +1410,25 @@ int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *en return -1; } -void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) +void mca_btl_smcuda_free_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) { assert(endpoint->smcuda_ddt_pack_clone[lindex].lindex == lindex); endpoint->smcuda_ddt_pack_clone[lindex].lindex = -1; } -void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) +void mca_btl_smcuda_free_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) { assert(endpoint->smcuda_ddt_unpack_clone[lindex].lindex == lindex); 
endpoint->smcuda_ddt_unpack_clone[lindex].lindex = -1; } -void mca_btl_smcuda_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - size_t pipeline_size, - int lindex, uint8_t remote_device, uint8_t local_device) +void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_ddt_pack_clone[lindex].convertor = convertor; endpoint->smcuda_ddt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_ddt_pack_clone[lindex].pipeline_size = pipeline_size; endpoint->smcuda_ddt_pack_clone[lindex].lindex = lindex; endpoint->smcuda_ddt_pack_clone[lindex].seq = -9; endpoint->smcuda_ddt_pack_clone[lindex].remote_device = remote_device; @@ -1447,16 +1436,14 @@ void mca_btl_smcuda_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, endpoint->smcuda_ddt_pack_clone[lindex].frag = frag; } -void mca_btl_smcuda_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - size_t pipeline_size, - int lindex, uint8_t remote_device, uint8_t local_device) +void mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_ddt_unpack_clone[lindex].convertor = convertor; endpoint->smcuda_ddt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_ddt_unpack_clone[lindex].pipeline_size = pipeline_size; endpoint->smcuda_ddt_unpack_clone[lindex].lindex = lindex; endpoint->smcuda_ddt_unpack_clone[lindex].seq = -9; 
endpoint->smcuda_ddt_unpack_clone[lindex].remote_device = remote_device; diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index f9171ec8962..46ae97b3909 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -511,31 +511,34 @@ enum ipcState { IPC_BAD }; -/* cuda datatype control message */ +/* cuda datatype pack/unpack message */ typedef struct { int seq; int msg_type; int lindex; int packed_size; +} cuda_ddt_hdr_t; + +/* cuda datatype put message */ +typedef struct { void *remote_address; void *remote_base; uint64_t mem_handle[8]; -} cuda_dt_hdr_t; +} cuda_ddt_put_hdr_t; -#define CUDA_UNPACK_FROM_SEQ 0 -#define CUDA_PACK_COMPLETE 1 -#define CUDA_PACK_COMPLETE_ACK 2 -#define CUDA_PACK_CLEANUP 3 +#define CUDA_DDT_UNPACK_FROM_BLOCK 0 +#define CUDA_DDT_COMPLETE 1 +#define CUDA_DDT_COMPLETE_ACK 2 +#define CUDA_DDT_CLEANUP 3 #define CUDA_PACK_TO_LOCAL_START 4 #define CUDA_PACK_TO_REMOTE_START 5 -#define CUDA_PACK_TO_SEQ 6 +#define CUDA_DDT_PACK_TO_BLOCK 6 #define CUDA_UNPACK_NO 7 /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; void *remote_gpu_address; - size_t pipeline_size; int lindex; int seq; uint8_t remote_device; @@ -546,28 +549,26 @@ typedef struct { #define SMCUDA_DT_CLONE_SIZE 20 extern cuda_ddt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; -int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_dt_hdr_t *send_msg); -int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_dt_hdr_t *send_msg); +int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_hdr_t *send_msg); +int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_hdr_t *send_msg); int 
mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor); int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq); int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex); int mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(struct mca_btl_base_endpoint_t *endpoint, int lindex); -int mca_btl_smcuda_alloc_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint); -int mca_btl_smcuda_alloc_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint); -void mca_btl_smcuda_free_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); -void mca_btl_smcuda_free_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); -void mca_btl_smcuda_cuda_dt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - size_t pipeline_size, - int lindex, uint8_t remote_device, uint8_t local_device); -void mca_btl_smcuda_cuda_dt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - size_t pipeline_size, - int lindex, uint8_t remote_device, uint8_t local_device); +int mca_btl_smcuda_alloc_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint); +int mca_btl_smcuda_alloc_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint); +void mca_btl_smcuda_free_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); +void mca_btl_smcuda_free_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); +void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, uint8_t remote_device, uint8_t local_device); +void 
mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, uint8_t remote_device, uint8_t local_device); #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index ee25fabd4e5..4f46b8a5beb 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -853,13 +853,13 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, mca_btl_base_descriptor_t* des, void* cbdata) { struct mca_btl_base_endpoint_t *endpoint; - cuda_dt_hdr_t cuda_dt_hdr; + cuda_ddt_hdr_t recv_msg; mca_btl_base_segment_t* segments = des->des_segments; - memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); - int seq = cuda_dt_hdr.seq; - int lindex = cuda_dt_hdr.lindex; - size_t packed_size = cuda_dt_hdr.packed_size; - int msg_type = cuda_dt_hdr.msg_type; + memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_hdr_t)); + int seq = recv_msg.seq; + int lindex = recv_msg.lindex; + size_t packed_size = recv_msg.packed_size; + int msg_type = recv_msg.msg_type; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_ddt_clone_t *my_cuda_dt_clone; @@ -869,29 +869,21 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, assert(my_cuda_dt_clone->lindex == lindex); printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", endpoint->my_smp_rank, seq, lindex); - cuda_dt_hdr_t send_msg; + cuda_ddt_hdr_t send_msg; send_msg.lindex = lindex; - if (msg_type == CUDA_PACK_CLEANUP) { + if (msg_type == CUDA_DDT_CLEANUP) { mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t) frag_recv->base.des_cbfunc; cbfunc (btl, endpoint, frag_recv->segment.seg_addr.pval, frag_recv->local_handle, 
frag_recv->base.des_context, frag_recv->base.des_cbdata, OPAL_SUCCESS); mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag_recv); - mca_btl_smcuda_free_cuda_dt_unpack_clone(endpoint, lindex); - } else if (msg_type == CUDA_PACK_COMPLETE) { - send_msg.packed_size = 0; - send_msg.seq = -1; - send_msg.msg_type = CUDA_PACK_COMPLETE_ACK; - mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); - } else if (msg_type == CUDA_UNPACK_FROM_SEQ){ + mca_btl_smcuda_free_cuda_ddt_unpack_clone(endpoint, lindex); + } else if (msg_type == CUDA_DDT_UNPACK_FROM_BLOCK || msg_type == CUDA_DDT_COMPLETE){ struct iovec iov; uint32_t iov_count = 1; size_t max_data; struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; - if (my_cuda_dt_clone->pipeline_size == 0) { - my_cuda_dt_clone->pipeline_size = packed_size; - } - size_t pipeline_size = my_cuda_dt_clone->pipeline_size; + size_t pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; if (convertor == NULL) { /* do not unpack */ mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; unsigned char *local_address = (unsigned char*)frag_recv->segment.seg_addr.pval; @@ -899,7 +891,7 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, mca_common_cuda_memp2pcpy(local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); } else { /* unpack */ if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { - convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(pipeline_size, 0); + convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(packed_size, 0); mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); iov.iov_base = convertor->gpu_buffer_ptr; printf("start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, packed_size); @@ 
-918,7 +910,11 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, } send_msg.seq = seq; send_msg.packed_size = packed_size; - send_msg.msg_type = CUDA_PACK_TO_SEQ; + if (msg_type == CUDA_DDT_COMPLETE) { + send_msg.msg_type = CUDA_DDT_COMPLETE_ACK; + } else { + send_msg.msg_type = CUDA_DDT_PACK_TO_BLOCK; + } mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); } // MCA_BTL_SMCUDA_FRAG_RETURN(frag); @@ -929,19 +925,19 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, mca_btl_base_descriptor_t* des, void* cbdata) { struct mca_btl_base_endpoint_t *endpoint; - cuda_dt_hdr_t cuda_dt_hdr; + cuda_ddt_hdr_t recv_msg; mca_btl_base_segment_t* segments = des->des_segments; - memcpy(&cuda_dt_hdr, segments->seg_addr.pval, sizeof(cuda_dt_hdr_t)); - int seq = cuda_dt_hdr.seq; - int lindex = cuda_dt_hdr.lindex; - int msg_type = cuda_dt_hdr.msg_type; - size_t packed_size = cuda_dt_hdr.packed_size; + memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_hdr_t)); + int seq = recv_msg.seq; + int lindex = recv_msg.lindex; + int msg_type = recv_msg.msg_type; + size_t packed_size = recv_msg.packed_size; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_ddt_clone_t *my_cuda_dt_clone; - cuda_dt_hdr_t send_msg; + cuda_ddt_hdr_t send_msg; uint32_t iov_count = 1; - int rc_dt = 0; + int rv_dt = 0; size_t max_data = 0; /* We can find the endoint back from the rank embedded in the header */ @@ -951,39 +947,37 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", endpoint->my_smp_rank, seq, lindex); struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; send_msg.lindex = lindex; - if (msg_type == CUDA_PACK_COMPLETE_ACK) { + if (msg_type == CUDA_DDT_COMPLETE_ACK) { send_msg.packed_size = 0; send_msg.seq = -2; - send_msg.msg_type = CUDA_PACK_CLEANUP; + send_msg.msg_type = CUDA_DDT_CLEANUP; mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, 
&send_msg); if (convertor->gpu_buffer_ptr != NULL) { opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); convertor->gpu_buffer_ptr = NULL; } - mca_btl_smcuda_free_cuda_dt_pack_clone(endpoint, lindex); - } else if (msg_type == CUDA_PACK_TO_SEQ) { - printf("i receive a message pack to seq, packed %ld, pipeline_size %ld\n", convertor->bConverted, my_cuda_dt_clone->pipeline_size); + mca_btl_smcuda_free_cuda_ddt_pack_clone(endpoint, lindex); + } else if (msg_type == CUDA_DDT_PACK_TO_BLOCK) { + printf("i receive a message pack to seq, packed %ld, pipeline_size %ld\n", convertor->bConverted, mca_btl_smcuda_component.cuda_ddt_pipeline_size); if (convertor->bConverted < convertor->local_size) { struct iovec iov; - iov.iov_base = convertor->gpu_buffer_ptr + seq*my_cuda_dt_clone->pipeline_size; + iov.iov_base = convertor->gpu_buffer_ptr + seq * mca_btl_smcuda_component.cuda_ddt_pipeline_size;; iov.iov_len = packed_size; - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); packed_size = max_data; send_msg.packed_size = packed_size; send_msg.seq = seq; - send_msg.msg_type = CUDA_UNPACK_FROM_SEQ; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); - if (rc_dt == 1) { - send_msg.packed_size = 0; - send_msg.seq = -1; - send_msg.msg_type = CUDA_PACK_COMPLETE; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + if (rv_dt == 1) { + send_msg.msg_type = CUDA_DDT_COMPLETE; + } else { + send_msg.msg_type = CUDA_DDT_UNPACK_FROM_BLOCK; } + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); } } else { mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; if (msg_type == CUDA_PACK_TO_REMOTE_START) { /* receiver is contiguous, and ask me to pack directly to his gpu memory */ - opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); +/* opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); mca_mpool_common_cuda_reg_t rget_reg; rget_reg_ptr= &rget_reg; 
memset(&rget_reg, 0, sizeof(rget_reg)); @@ -995,48 +989,27 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, convertor->gpu_buffer_ptr = remote_memory_address; printf("remote_memory_address $$$$$$ %p, r_addr %p, r_base %p\n", remote_memory_address, cuda_dt_hdr.remote_address, cuda_dt_hdr.remote_base); send_msg.msg_type = CUDA_UNPACK_NO; - convertor->gpu_buffer_size = convertor->local_size; + convertor->gpu_buffer_size = convertor->local_size;*/ } else { - send_msg.msg_type = CUDA_UNPACK_FROM_SEQ; + send_msg.msg_type = CUDA_DDT_UNPACK_FROM_BLOCK; } struct iovec iov; packed_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; printf("Pipeline_size %ld\n", packed_size); + iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; iov.iov_base = convertor->gpu_buffer_ptr; - iov.iov_len = packed_size; - max_data = 0; seq = 0; - /* the first pack here is used to get the correct size of pipeline_size */ - /* because pack may not use the whole pipeline size */ - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - packed_size = max_data; - iov.iov_base += packed_size; - /* save pipeline size */ - my_cuda_dt_clone->pipeline_size = packed_size; - convertor->gpu_buffer_size -= packed_size; - send_msg.packed_size = packed_size; - send_msg.seq = seq; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); - while (rc_dt != 1 && convertor->gpu_buffer_size > 0) { - if (convertor->gpu_buffer_size < packed_size) { - packed_size = convertor->gpu_buffer_size; - } - iov.iov_len = packed_size; - seq ++; - rc_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - packed_size = max_data; - iov.iov_base += packed_size; - convertor->gpu_buffer_size -= packed_size; - send_msg.packed_size = packed_size; + while (rv_dt != 1 && convertor->gpu_buffer_size > 0) { + rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + iov.iov_base += mca_btl_smcuda_component.cuda_ddt_pipeline_size; + convertor->gpu_buffer_size 
-= mca_btl_smcuda_component.cuda_ddt_pipeline_size; + send_msg.packed_size = max_data; send_msg.seq = seq; + if (rv_dt == 1) { + send_msg.msg_type = CUDA_DDT_COMPLETE; + } mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); - } - - if (rc_dt == 1) { - send_msg.packed_size = 0; - send_msg.seq = -1; - send_msg.msg_type = CUDA_PACK_COMPLETE; - mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + seq ++; } if (rget_reg_ptr != NULL) { /* close memhandle */ From 0ab564b90905402e19db49ca38f09f566d447f31 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 26 Oct 2015 17:02:33 -0400 Subject: [PATCH 126/190] s up and running. PUT size in an MCA parameters. Conflicts: opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu Conflicts: opal/mca/btl/btl.h --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 17 +-- opal/datatype/cuda/opal_datatype_cuda.cu | 11 ++ opal/datatype/cuda/opal_datatype_cuda.cuh | 4 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 3 + .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 3 + opal/datatype/opal_datatype_gpu.c | 8 ++ opal/datatype/opal_datatype_gpu.h | 4 + opal/mca/btl/btl.h | 3 + opal/mca/btl/smcuda/btl_smcuda.c | 115 +++++++++--------- opal/mca/btl/smcuda/btl_smcuda.h | 17 +-- opal/mca/btl/smcuda/btl_smcuda_component.c | 98 ++++++++++----- opal/mca/common/cuda/common_cuda.c | 1 - 12 files changed, 177 insertions(+), 107 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index befbf091b68..3dcb0b9ad14 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -114,16 +114,19 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; mca_bml_base_btl_t* bml_endpoint_btl = mca_bml_base_btl_array_get_index(&(sendreq->req_endpoint->btl_send), 0); if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && CUDA_DDT_WITH_RDMA) { - printf("GPU data ready for 
GET!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"); unsigned char *base; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); - // base = opal_cuda_malloc_gpu_buffer_p(4000000*4, 0); - base = opal_cuda_malloc_gpu_buffer_p(convertor->local_size, 0); + size_t buffer_size = 0; + if (convertor->local_size > bml_btl->btl->btl_cuda_ddt_pipeline_size) { + buffer_size = bml_btl->btl->btl_cuda_ddt_pipeline_size * bml_btl->btl->btl_cuda_ddt_pipeline_depth; + } else { + buffer_size = convertor->local_size; + } + base = opal_cuda_malloc_gpu_buffer_p(buffer_size, 0); convertor->gpu_buffer_ptr = base; - convertor->gpu_buffer_size = 4000000*4;//convertor->local_size; - convertor->gpu_buffer_size = convertor->local_size; + convertor->gpu_buffer_size = buffer_size; sendreq->req_send.req_bytes_packed = convertor->local_size; - printf("GPU BUFFER %p, local %lu, remote %lu\n", base, convertor->local_size, convertor->remote_size); + opal_output(0, "malloc GPU BUFFER %p for pack, local size %lu, pipeline size %lu, depth %d\n", base, convertor->local_size, bml_btl->btl->btl_cuda_ddt_pipeline_size, bml_btl->btl->btl_cuda_ddt_pipeline_depth); if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( sendreq->req_endpoint, base, @@ -228,7 +231,7 @@ int mca_pml_ob1_rdma_cuda_btl_register_data( // mca_common_cuda_geteventhandle(&convertor->pipeline_event[j], j, (mca_mpool_base_registration_t *)cuda_reg); // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); // } - cuda_reg->data.pipeline_size = pipeline_size; + // cuda_reg->data.pipeline_size = pipeline_size; cuda_reg->data.lindex = lindex; cuda_reg->data.pack_required = pack_required; cuda_reg->data.gpu_device = gpu_device; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index bce80b4a592..0f0d52d558b 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -353,6 +353,17 @@ void 
opal_cuda_free_gpu_buffer(void *addr, int gpu_id) DT_CUDA_DEBUG( opal_cuda_output( 2, "Free GPU buffer %p.\n", addr); ); } +void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) +{ + cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); +} + +void opal_cuda_d2dcpy(void* dst, const void* src, size_t count) +{ + cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); +} + void opal_dump_cuda_list(ddt_cuda_list_t *list) { ddt_cuda_buffer_t *ptr = NULL; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 94336ac6475..d71d349d46b 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -91,6 +91,10 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); +void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count); + +void opal_cuda_d2dcpy(void* dst, const void* src, size_t count); + void opal_dump_cuda_list(ddt_cuda_list_t *list); } diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 9ee6fc0f032..381aaf99ae8 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -300,6 +300,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert iov_ptr = pConvertor->gpu_buffer_ptr; } } + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ @@ -713,6 +714,8 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; destination_base = destination; + + 
cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start_total); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index ba8a89e88cb..4ee73897f68 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -251,6 +251,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); ); #endif + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); iov_len_local = iov[iov_count].iov_len; if( 0 != pConvertor->partial_length ) { /* not support yet */ @@ -377,6 +378,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert long total_time, move_time; #endif + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start_total); #endif diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c index 095cd477dd3..f05ecbd84b5 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -90,6 +90,10 @@ void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id) = NULL; void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id) = NULL; +void (*opal_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count) = NULL; + +void (*opal_cuda_d2dcpy_p)(void* dst, const void* src, size_t count) = NULL; + #define OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN(handle, fname) \ do { \ char* _error; \ @@ -128,6 +132,8 @@ int32_t opal_datatype_gpu_init(void) OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_sync_device ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_free_gpu_buffer ); 
OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_malloc_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_d2dcpy_async ); + OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_d2dcpy ); (*opal_datatype_cuda_init_p)(); opal_output( 0, "cuda init done\n"); @@ -152,6 +158,8 @@ int32_t opal_datatype_gpu_fini(void) opal_cuda_sync_device_p = NULL; opal_cuda_free_gpu_buffer_p = NULL; opal_cuda_malloc_gpu_buffer_p = NULL; + opal_cuda_d2dcpy_async_p = NULL; + opal_cuda_d2dcpy_p = NULL; dlclose(opal_datatype_cuda_handle); opal_datatype_cuda_handle = NULL; diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h index d50e2fe8d99..df42d68b6fc 100644 --- a/opal/datatype/opal_datatype_gpu.h +++ b/opal/datatype/opal_datatype_gpu.h @@ -53,4 +53,8 @@ extern void (*opal_cuda_sync_device_p)(void); extern void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); extern void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); + +extern void (*opal_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); + +extern void (*opal_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); #endif /* OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED */ diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 431610ff17f..1a38ec4c331 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -190,6 +190,7 @@ typedef uint8_t mca_btl_base_tag_t; #define MCA_BTL_TAG_SMCUDA (MCA_BTL_TAG_BTL + 2) #define MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK (MCA_BTL_TAG_BTL + 3) #define MCA_BTL_TAG_SMCUDA_DATATYPE_PACK (MCA_BTL_TAG_BTL + 4) +#define MCA_BTL_TAG_SMCUDA_DATATYPE_PUT (MCA_BTL_TAG_BTL + 5) /* prefered protocol */ #define MCA_BTL_FLAGS_SEND 0x0001 @@ -1181,6 +1182,8 @@ struct mca_btl_base_module_t { #endif /* OPAL_CUDA_GDR_SUPPORT */ #if OPAL_CUDA_SUPPORT size_t btl_cuda_max_send_size; /**< set if CUDA max send_size is different from host max send size */ + size_t 
btl_cuda_ddt_pipeline_size; + int btl_cuda_ddt_pipeline_depth; #endif /* OPAL_CUDA_SUPPORT */ }; typedef struct mca_btl_base_module_t mca_btl_base_module_t; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 14d0a3995ce..9d5d5441683 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -83,6 +83,13 @@ static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle); + +inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, uint8_t remote_device, uint8_t local_device); #endif mca_btl_smcuda_t mca_btl_smcuda = { @@ -402,7 +409,6 @@ smcuda_btl_first_time_init(mca_btl_smcuda_t *smcuda_btl, /* allocation will be for the fragment descriptor and payload buffer */ length = sizeof(mca_btl_smcuda_frag1_t); - printf("free list %d\n", mca_btl_smcuda_component.sm_free_list_num); length_payload = sizeof(mca_btl_smcuda_hdr_t) + mca_btl_smcuda_component.eager_limit; i = opal_free_list_init (&mca_btl_smcuda_component.sm_frags_eager, length, @@ -1147,11 +1153,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, } if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; - printf("RECEIVE REGT UNPACK, size %ld!!!!!!!!!!!\n", size); struct opal_convertor_t *convertor = &(recvreq->req_recv.req_base.req_convertor); - // size_t pipeline_size = remote_handle->reg_data.pipeline_size; - printf("i receive lindex %d, pack_required %d, remote_device %d, local_device %d\n", lindex, pack_required, remote_device, local_device); if (remote_device != local_device && 
!OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { convertor->gpu_buffer_ptr = NULL; @@ -1160,13 +1163,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, } cuda_ddt_hdr_t send_msg; if (pack_required) { - mca_btl_smcuda_cuda_ddt_unpack_clone(ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); - send_msg.lindex = lindex; - send_msg.packed_size = 0; - send_msg.seq = 0; - send_msg.msg_type = CUDA_PACK_TO_LOCAL_START; - mca_btl_smcuda_send_cuda_pack_sig(btl, ep, &send_msg); done = 0; } else { struct iovec iov; @@ -1174,9 +1172,9 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, size_t max_data; if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && remote_device != local_device) { convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(size, 0); - mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, remote_memory_address, size); + (*opal_cuda_d2dcpy_async_p)(convertor->gpu_buffer_ptr, remote_memory_address, size); iov.iov_base = convertor->gpu_buffer_ptr; - printf("start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, convertor->gpu_buffer_ptr, size); + opal_output(0, "start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, convertor->gpu_buffer_ptr, size); } else { iov.iov_base = convertor->gpu_buffer_ptr; } @@ -1186,31 +1184,28 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, done = 1; } } else { - printf("RECEIVE REGT CONTIGUOUS, size %ld !!!!!!!!!!!\n", size); recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; - cuda_ddt_hdr_t send_msg; if (pack_required) { - send_msg.lindex = lindex; - send_msg.packed_size = 0; - if (remote_device == local_device && OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { + if (remote_device == local_device || OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { /* now we are able to let sender pack directly to my memory */ - /* 
mca_mpool_common_cuda_reg_t loc_reg; + mca_mpool_common_cuda_reg_t loc_reg; mca_mpool_common_cuda_reg_t *loc_reg_ptr = &loc_reg; - cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL); - memcpy(send_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); - send_msg.seq = -9; - send_msg.msg_type = CUDA_PACK_TO_REMOTE_START; - send_msg.remote_address = local_address; - send_msg.remote_base = loc_reg.base.base; - mca_common_wait_stream_synchronize(&loc_reg); - printf("send r_addr %p, r_base %p\n", local_address, loc_reg.base.base);*/ + cuda_ddt_put_hdr_t put_msg; + if (OPAL_SUCCESS != cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL)) { + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); + } + memcpy(put_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); + put_msg.remote_address = local_address; + put_msg.remote_base = loc_reg.base.base; + put_msg.lindex = lindex; + mca_btl_smcuda_cuda_ddt_unpack_clone(ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, 0, 0); + mca_btl_smcuda_send_cuda_put_sig(btl, ep, &put_msg); } else { - send_msg.seq = 0; - send_msg.msg_type = CUDA_PACK_TO_LOCAL_START; + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); } - mca_btl_smcuda_cuda_ddt_unpack_clone(ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, - lindex, 0, 0); - mca_btl_smcuda_send_cuda_pack_sig(btl, ep, &send_msg); done = 0; } else { rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, @@ -1323,12 +1318,11 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, { mca_btl_smcuda_frag_t* frag; int rc; - cuda_ddt_hdr_t cuda_dt_hdr; /* allocate a fragment, giving up if we can't 
get one */ MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); if( OPAL_UNLIKELY(NULL == frag) ) { - printf("!!!!!!!!!! no frag \n"); + opal_output(0, "no frag for send unpack sig\n"); return OPAL_ERR_OUT_OF_RESOURCE;; } @@ -1337,7 +1331,6 @@ int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_ddt_hdr_t)); rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); - printf("######## rank %d, send seq %d, endpoint %p\n", endpoint->my_smp_rank, send_msg->seq, endpoint); return rc; } @@ -1347,11 +1340,11 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, { mca_btl_smcuda_frag_t* frag; int rc; - cuda_ddt_hdr_t cuda_dt_hdr; /* allocate a fragment, giving up if we can't get one */ MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); if( OPAL_UNLIKELY(NULL == frag) ) { + opal_output(0, "no frag for send pack sig\n"); return OPAL_ERR_OUT_OF_RESOURCE;; } @@ -1363,30 +1356,44 @@ int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, return rc; } -int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor) +int mca_btl_smcuda_send_cuda_put_sig(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + cuda_ddt_put_hdr_t *put_msg) { - int i; - for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - if (endpoint->smcuda_ddt_pack_clone[i].convertor == convertor) { - return i; - } + mca_btl_smcuda_frag_t* frag; + int rc; + + /* allocate a fragment, giving up if we can't get one */ + MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); + if( OPAL_UNLIKELY(NULL == frag) ) { + opal_output(0, "no frag for send put sig\n"); + return OPAL_ERR_OUT_OF_RESOURCE;; } - return -1; -} -int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq) -{ - endpoint->smcuda_ddt_pack_clone[lindex].seq = seq; - return 0; + /* Fill in 
fragment fields. */ + frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + memcpy(frag->segment.seg_addr.pval, put_msg, sizeof(cuda_ddt_put_hdr_t)); + + rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_PUT); + return rc; } -int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex) +inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, uint8_t remote_device, uint8_t local_device) { - if (lindex >= SMCUDA_DT_CLONE_SIZE) { - return -9; - } else { - return endpoint->smcuda_ddt_pack_clone[lindex].seq; - } + cuda_ddt_hdr_t send_msg; + mca_btl_smcuda_cuda_ddt_unpack_clone(endpoint, convertor, remote_gpu_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); + send_msg.lindex = lindex; + send_msg.packed_size = 0; + send_msg.seq = 0; + send_msg.msg_type = CUDA_DDT_PACK_START; + opal_output(0, "smcuda btl start pack, remote_gpu_address %p, frag %p, lindex %d, remote_device %d, local_device %d\n", remote_gpu_address, frag, lindex, remote_device, local_device); + mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); } int mca_btl_smcuda_alloc_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint) @@ -1430,7 +1437,6 @@ void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint endpoint->smcuda_ddt_pack_clone[lindex].convertor = convertor; endpoint->smcuda_ddt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; endpoint->smcuda_ddt_pack_clone[lindex].lindex = lindex; - endpoint->smcuda_ddt_pack_clone[lindex].seq = -9; endpoint->smcuda_ddt_pack_clone[lindex].remote_device = remote_device; endpoint->smcuda_ddt_pack_clone[lindex].local_device = local_device; endpoint->smcuda_ddt_pack_clone[lindex].frag = frag; @@ -1445,7 
+1451,6 @@ void mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoi endpoint->smcuda_ddt_unpack_clone[lindex].convertor = convertor; endpoint->smcuda_ddt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; endpoint->smcuda_ddt_unpack_clone[lindex].lindex = lindex; - endpoint->smcuda_ddt_unpack_clone[lindex].seq = -9; endpoint->smcuda_ddt_unpack_clone[lindex].remote_device = remote_device; endpoint->smcuda_ddt_unpack_clone[lindex].local_device = local_device; endpoint->smcuda_ddt_unpack_clone[lindex].frag = frag; diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 46ae97b3909..288dc2027d3 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -513,14 +513,15 @@ enum ipcState { /* cuda datatype pack/unpack message */ typedef struct { + int lindex; int seq; int msg_type; - int lindex; int packed_size; } cuda_ddt_hdr_t; /* cuda datatype put message */ typedef struct { + int lindex; void *remote_address; void *remote_base; uint64_t mem_handle[8]; @@ -530,31 +531,25 @@ typedef struct { #define CUDA_DDT_COMPLETE 1 #define CUDA_DDT_COMPLETE_ACK 2 #define CUDA_DDT_CLEANUP 3 -#define CUDA_PACK_TO_LOCAL_START 4 -#define CUDA_PACK_TO_REMOTE_START 5 -#define CUDA_DDT_PACK_TO_BLOCK 6 -#define CUDA_UNPACK_NO 7 +#define CUDA_DDT_PACK_START 4 +#define CUDA_DDT_PACK_TO_BLOCK 5 +#define CUDA_UNPACK_NO 6 /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; void *remote_gpu_address; int lindex; - int seq; uint8_t remote_device; uint8_t local_device; mca_btl_base_descriptor_t *frag; } cuda_ddt_clone_t; #define SMCUDA_DT_CLONE_SIZE 20 -extern cuda_ddt_clone_t smcuda_dt_clone[SMCUDA_DT_CLONE_SIZE]; int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_hdr_t *send_msg); int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct 
mca_btl_base_endpoint_t* endpoint, cuda_ddt_hdr_t *send_msg); -int mca_btl_smcuda_check_cuda_dt_pack_clone_exist(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor); -int mca_btl_smcuda_set_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex, int seq); -int mca_btl_smcuda_get_cuda_dt_pack_seq(struct mca_btl_base_endpoint_t *endpoint, int lindex); -int mca_btl_smcuda_get_cuda_dt_pack_pipeline_size(struct mca_btl_base_endpoint_t *endpoint, int lindex); +int mca_btl_smcuda_send_cuda_put_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_put_hdr_t *put_msg); int mca_btl_smcuda_alloc_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint); int mca_btl_smcuda_alloc_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint); void mca_btl_smcuda_free_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 4f46b8a5beb..51be3eafafa 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -173,6 +173,9 @@ static int smcuda_register(void) #else /* OPAL_CUDA_SUPPORT */ mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW; #endif /* OPAL_CUDA_SUPPORT */ + mca_btl_smcuda.super.btl_cuda_ddt_pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; + printf("pipeline size %lu\n", mca_btl_smcuda.super.btl_cuda_ddt_pipeline_size); + mca_btl_smcuda.super.btl_cuda_ddt_pipeline_depth = 4; mca_btl_smcuda.super.btl_eager_limit = 4*1024; mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024; mca_btl_smcuda.super.btl_max_send_size = 32*1024; @@ -848,6 +851,7 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl, } } +/* for receiver */ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata) @@ -868,7 +872,6 @@ static void 
btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, my_cuda_dt_clone = &endpoint->smcuda_ddt_unpack_clone[lindex]; assert(my_cuda_dt_clone->lindex == lindex); - printf("$$$$$$$$$$$$$$hello, rank %d in smcuda unpack seq %d, index %d\n", endpoint->my_smp_rank, seq, lindex); cuda_ddt_hdr_t send_msg; send_msg.lindex = lindex; @@ -887,14 +890,14 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, if (convertor == NULL) { /* do not unpack */ mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; unsigned char *local_address = (unsigned char*)frag_recv->segment.seg_addr.pval; - printf("D2D local %p, remote %p, size %ld\n", local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address+seq*pipeline_size, packed_size); + opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld\n", local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address+seq*pipeline_size, packed_size); mca_common_cuda_memp2pcpy(local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); } else { /* unpack */ if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(packed_size, 0); - mca_common_cuda_memp2pcpy(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); + (*opal_cuda_d2dcpy_async_p)(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); iov.iov_base = convertor->gpu_buffer_ptr; - printf("start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, packed_size); + opal_output(0, "unpack, start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, packed_size); } else { iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; } 
@@ -917,9 +920,9 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, } mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); } - // MCA_BTL_SMCUDA_FRAG_RETURN(frag); } +/* for sender */ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata) @@ -944,7 +947,6 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; my_cuda_dt_clone = &endpoint->smcuda_ddt_pack_clone[lindex]; - printf("$$$$$$$$$$$$$$hello, rank %d in smcuda pack seq %d, index %d\n", endpoint->my_smp_rank, seq, lindex); struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; send_msg.lindex = lindex; if (msg_type == CUDA_DDT_COMPLETE_ACK) { @@ -958,7 +960,6 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } mca_btl_smcuda_free_cuda_ddt_pack_clone(endpoint, lindex); } else if (msg_type == CUDA_DDT_PACK_TO_BLOCK) { - printf("i receive a message pack to seq, packed %ld, pipeline_size %ld\n", convertor->bConverted, mca_btl_smcuda_component.cuda_ddt_pipeline_size); if (convertor->bConverted < convertor->local_size) { struct iovec iov; iov.iov_base = convertor->gpu_buffer_ptr + seq * mca_btl_smcuda_component.cuda_ddt_pipeline_size;; @@ -974,28 +975,8 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); } - } else { - mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; - if (msg_type == CUDA_PACK_TO_REMOTE_START) { /* receiver is contiguous, and ask me to pack directly to his gpu memory */ -/* opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); - mca_mpool_common_cuda_reg_t rget_reg; - rget_reg_ptr= &rget_reg; - memset(&rget_reg, 0, sizeof(rget_reg)); - memcpy(rget_reg.data.memHandle, cuda_dt_hdr.mem_handle, sizeof(cuda_dt_hdr.mem_handle)); - cuda_openmemhandle(NULL, 0, (mca_mpool_base_registration_t *)&rget_reg, 
NULL); - mca_common_wait_stream_synchronize(&rget_reg); - size_t offset = (size_t) ((intptr_t) cuda_dt_hdr.remote_address - (intptr_t) cuda_dt_hdr.remote_base); - unsigned char *remote_memory_address = (unsigned char *)rget_reg_ptr->base.alloc_base + offset; - convertor->gpu_buffer_ptr = remote_memory_address; - printf("remote_memory_address $$$$$$ %p, r_addr %p, r_base %p\n", remote_memory_address, cuda_dt_hdr.remote_address, cuda_dt_hdr.remote_base); - send_msg.msg_type = CUDA_UNPACK_NO; - convertor->gpu_buffer_size = convertor->local_size;*/ - } else { - send_msg.msg_type = CUDA_DDT_UNPACK_FROM_BLOCK; - } + } else if (msg_type == CUDA_DDT_PACK_START) { struct iovec iov; - packed_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; - printf("Pipeline_size %ld\n", packed_size); iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; iov.iov_base = convertor->gpu_buffer_ptr; seq = 0; @@ -1007,16 +988,65 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, send_msg.seq = seq; if (rv_dt == 1) { send_msg.msg_type = CUDA_DDT_COMPLETE; + } else { + send_msg.msg_type = CUDA_DDT_UNPACK_FROM_BLOCK; } mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); seq ++; } - - if (rget_reg_ptr != NULL) { /* close memhandle */ - cuda_closememhandle(NULL, (mca_mpool_base_registration_t *)rget_reg_ptr); - } + } else { + opal_output(0, "unknown message\n"); } - // MCA_BTL_SMCUDA_FRAG_RETURN(frag); +} + +/* for sender */ +static void btl_smcuda_datatype_put(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, void* cbdata) +{ + struct mca_btl_base_endpoint_t *endpoint; + cuda_ddt_put_hdr_t recv_msg; + mca_btl_base_segment_t* segments = des->des_segments; + memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_put_hdr_t)); + int lindex = recv_msg.lindex; + void *remote_address = recv_msg.remote_address; + void *remote_base = recv_msg.remote_base; + mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; + 
cuda_ddt_clone_t *my_cuda_dt_clone; + cuda_ddt_hdr_t send_msg; + + /* We can find the endoint back from the rank embedded in the header */ + endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; + my_cuda_dt_clone = &endpoint->smcuda_ddt_pack_clone[lindex]; + struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; + + opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; + mca_mpool_common_cuda_reg_t rget_reg; + rget_reg_ptr= &rget_reg; + memset(&rget_reg, 0, sizeof(rget_reg)); + memcpy(rget_reg.data.memHandle, recv_msg.mem_handle, sizeof(recv_msg.mem_handle)); + cuda_openmemhandle(NULL, 0, (mca_mpool_base_registration_t *)&rget_reg, NULL); + size_t offset = (size_t) ((intptr_t)remote_address - (intptr_t)remote_base); + unsigned char *remote_memory_address = (unsigned char *)rget_reg_ptr->base.alloc_base + offset; + convertor->gpu_buffer_ptr = remote_memory_address; + opal_output(0, "smcuda start put, remote_memory_address $$$$$$ %p, r_addr %p, r_base %p\n", remote_memory_address, remote_address, remote_base); + convertor->gpu_buffer_size = convertor->local_size; + + struct iovec iov; + uint32_t iov_count = 1; + int rv_dt = 0; + size_t max_data = 0; + iov.iov_len = convertor->local_size; + iov.iov_base = convertor->gpu_buffer_ptr; + rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + assert(rv_dt == 1); + send_msg.lindex = lindex; + send_msg.packed_size = 0; + send_msg.seq = -2; + send_msg.msg_type = CUDA_DDT_CLEANUP; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + mca_btl_smcuda_free_cuda_ddt_pack_clone(endpoint, lindex); } #endif /* OPAL_CUDA_SUPPORT */ @@ -1137,6 +1167,8 @@ mca_btl_smcuda_component_init(int *num_btls, mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK].cbdata = NULL; mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbfunc = btl_smcuda_datatype_pack; 
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbdata = NULL; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PUT].cbfunc = btl_smcuda_datatype_put; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PUT].cbdata = NULL; #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index efcc380d3d2..4cbc8ac4b50 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -1047,7 +1047,6 @@ int cuda_getmemhandle(void *base, size_t size, mca_mpool_base_registration_t *ne "CUDA: cuMemGetAddressRange passed: addr=%p, size=%d, pbase=%p, psize=%d ", base, (int)size, (void *)pbase, (int)psize); } - printf("sizeof memhandle %lu, CUipcMemHandle %lu, cuEvent %lu, char %lu\n", sizeof(memHandle), sizeof(CUipcMemHandle), sizeof(CUevent), sizeof(char)); /* Store all the information in the registration */ cuda_reg->base.base = (void *)pbase; From 50abfc8586f7091da7664a80cd586677a27b8aa5 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 26 Oct 2015 22:21:15 -0400 Subject: [PATCH 127/190] less bugs Conflicts: ompi/mca/pml/monitoring/pml_monitoring_component.c opal/mca/mpool/gpusm/mpool_gpusm.h --- opal/datatype/cuda/opal_datatype_cuda.cu | 121 +++++++++--------- .../cuda/opal_datatype_cuda_internal.cuh | 2 +- .../cuda/opal_datatype_orig_internal.h | 3 - .../cuda/opal_datatype_pack_cuda_kernel.cu | 6 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 31 ++--- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 5 +- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 63 +++++---- opal/datatype/opal_convertor.c | 8 +- opal/datatype/opal_datatype_gpu.c | 2 + opal/datatype/opal_datatype_pack.c | 8 +- opal/mca/common/cuda/common_cuda.c | 4 +- opal/mca/mpool/gpusm/mpool_gpusm.h | 4 +- test/datatype/Makefile.am | 2 +- test/datatype/ddt_lib.h | 1 + 14 files changed, 125 insertions(+), 135 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu 
b/opal/datatype/cuda/opal_datatype_cuda.cu index 0f0d52d558b..18706fe0f78 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -142,21 +142,27 @@ static inline void cuda_list_insert_before(ddt_cuda_list_t *list, ddt_cuda_buffe list->nb_elements ++; } -static inline void cuda_list_item_merge_by_addr(ddt_cuda_list_t *list) +/** + * Collapse the list of free buffers by mergining consecutive buffers. As the property of this list + * is continously maintained, we only have to parse it up to the newest inserted elements. + */ +static inline void cuda_list_item_merge_by_addr(ddt_cuda_list_t *list, ddt_cuda_buffer_t* last) { - ddt_cuda_buffer_t *ptr = NULL; + ddt_cuda_buffer_t *current = list->head; ddt_cuda_buffer_t *next = NULL; - ptr = list->head; - while(ptr != NULL) { - next = ptr->next; - if (next == NULL) { - break; - } else if ((ptr->gpu_addr + ptr->size) == next->gpu_addr) { - ptr->size += next->size; + void* stop_addr = last->gpu_addr; + + while(1) { /* loop forever, the exit conditions are inside */ + if( NULL == (next = current->next) ) return; + if ((current->gpu_addr + current->size) == next->gpu_addr) { + current->size += next->size; cuda_list_delete(list, next); - } else { - ptr = ptr->next; + free(next); /* release the element, and try to continue merging */ + continue; } + current = current->next; + if( NULL == current ) return; + if( current->gpu_addr > stop_addr ) return; } } @@ -210,6 +216,7 @@ void opal_datatype_cuda_init(void) cuda_device[i].buffer_used.nb_elements = 0; } + /* init cuda stream */ cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); for (i = 0; i < NB_STREAMS; i++) { @@ -222,7 +229,8 @@ void opal_datatype_cuda_init(void) /* only for iov version */ for (i = 0; i < NB_STREAMS; i++) { - cudaMalloc((void **)(&cuda_iov_dist_d[i]), sizeof(ddt_cuda_iov_dist_t)*CUDA_MAX_NB_BLOCKS); + cudaMallocHost((void **)(&cuda_iov_dist_h[i]), 
sizeof(ddt_cuda_iov_dist_t)*CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&cuda_iov_dist_d[i]), sizeof(ddt_cuda_iov_dist_t)*CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } // /* init size for double, float, char */ @@ -245,6 +253,7 @@ void opal_datatype_cuda_fini(void) /* only for iov version */ for (i = 0; i < NB_STREAMS; i++) { + cudaFreeHost(cuda_iov_dist_h[i]); cudaFree(cuda_iov_dist_d[i]); } } @@ -279,72 +288,60 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) DT_CUDA_DEBUG( opal_cuda_output( 0, "No GPU buffer at dev_id %d.\n", dev_id); ); return NULL; } - ddt_cuda_buffer_t *ptr = NULL; - void *addr = NULL; - ptr = device->buffer_free.head; + ddt_cuda_buffer_t *ptr = device->buffer_free.head; while (ptr != NULL) { - if (ptr->size >= size) { - addr = ptr->gpu_addr; - ptr->size -= size; - if (ptr->size == 0) { - cuda_list_delete(&device->buffer_free, ptr); - obj_ddt_cuda_buffer_reset(ptr); - cuda_list_push_head(cuda_free_list, ptr); - } else { - ptr->gpu_addr += size; - } - break; + if (ptr->size < size) { /* Not enough room in this buffer, check next */ + ptr = ptr->next; + continue; } - ptr = ptr->next; - } - - if (ptr == NULL) { - return NULL; - } else { - ddt_cuda_buffer_t *p = cuda_list_pop_tail(cuda_free_list); - if (p == NULL) { - p = obj_ddt_cuda_buffer_new(); + void *addr = ptr->gpu_addr; + ptr->size -= size; + if (ptr->size == 0) { + cuda_list_delete(&device->buffer_free, ptr); + obj_ddt_cuda_buffer_reset(ptr); + /* hold on this ptr object, we will reuse it right away */ + } else { + ptr->gpu_addr += size; + ptr = cuda_list_pop_tail(cuda_free_list); + if( NULL == ptr ) + ptr = obj_ddt_cuda_buffer_new(); } - p->size = size; - p->gpu_addr = (unsigned char*)addr; - cuda_list_push_head(&device->buffer_used, p); + assert(NULL != ptr); + ptr->size = size; + ptr->gpu_addr = (unsigned char*)addr; + cuda_list_push_head(&device->buffer_used, ptr); device->buffer_used_size += size; device->buffer_free_size -= size; 
DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc GPU buffer %p, dev_id %d.\n", addr, dev_id); ); return addr; } + return NULL; } void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) { ddt_cuda_device_t *device = &cuda_device[gpu_id]; - ddt_cuda_buffer_t *ptr = NULL; - ddt_cuda_buffer_t *ptr_next = NULL; - ptr = device->buffer_used.head; - while (ptr != NULL) { - if (ptr->gpu_addr == addr) { - cuda_list_delete(&device->buffer_used, ptr); - ptr_next = device->buffer_free.head; - while (ptr_next != NULL) { - if (ptr_next->gpu_addr > addr) { - break; - } - ptr_next = ptr_next->next; - } - if (ptr_next == NULL) { - /* buffer_free is empty, or insert to last one */ - cuda_list_push_tail(&device->buffer_free, ptr); - } else { - cuda_list_insert_before(&device->buffer_free, ptr, ptr_next); - } - cuda_list_item_merge_by_addr(&device->buffer_free); - device->buffer_free_size += ptr->size; + ddt_cuda_buffer_t *ptr = device->buffer_used.head; + + /* Find the holder of this GPU allocation */ + for( ; (NULL != ptr) && (ptr->gpu_addr != addr); ptr = ptr->next ); + if (NULL == ptr) { /* we could not find it. 
something went wrong */ + DT_CUDA_DEBUG( opal_cuda_output( 0, "addr %p is not managed.\n", addr); ); + return; + } + cuda_list_delete(&device->buffer_used, ptr); + /* Insert the element in the list of free buffers ordered by the addr */ + ddt_cuda_buffer_t *ptr_next = device->buffer_free.head; + while (ptr_next != NULL) { + if (ptr_next->gpu_addr > addr) { break; } - ptr = ptr->next; + ptr_next = ptr_next->next; } - if (ptr == NULL) { - DT_CUDA_DEBUG( opal_cuda_output( 0, "addr %p is not managed.\n", addr); ); + if (ptr_next == NULL) { /* buffer_free is empty, or insert to last one */ + cuda_list_push_tail(&device->buffer_free, ptr); + } else { + cuda_list_insert_before(&device->buffer_free, ptr, ptr_next); } size_t size = ptr->size; cuda_list_item_merge_by_addr(&device->buffer_free, ptr); diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 268554126ab..fe49449f976 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -30,7 +30,7 @@ #define CUDA_NB_IOV 1024*20 #define CUDA_IOV_LEN 1024*1204 #define CUDA_MAX_NB_BLOCKS 1024 -#define CUDA_IOV_MAX_TASK_PER_BLOCK 10 +#define CUDA_IOV_MAX_TASK_PER_BLOCK 400 #define ALIGNMENT_DOUBLE 8 #define ALIGNMENT_FLOAT 4 #define ALIGNMENT_CHAR 1 diff --git a/opal/datatype/cuda/opal_datatype_orig_internal.h b/opal/datatype/cuda/opal_datatype_orig_internal.h index 90561359f75..4dde12d235d 100644 --- a/opal/datatype/cuda/opal_datatype_orig_internal.h +++ b/opal/datatype/cuda/opal_datatype_orig_internal.h @@ -5,9 +5,6 @@ #include "opal_config.h" -/* original OMPI */ -#define OPAL_DECLSPEC - #define OPAL_PTRDIFF_TYPE ptrdiff_t #define DT_STATIC_STACK_SIZE 5 /**< This should be sufficient for most applications */ diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index a58b831b78b..dd9af2a5a7e 100644 --- 
a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -81,7 +81,11 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c if (threadIdx.x == 0) { //printf("iov pack kernel \n"); - nb_tasks = cuda_iov_dist[blockIdx.x].nb_tasks; + nb_tasks = nb_blocks_used / gridDim.x; + if (blockIdx.x < (nb_blocks_used % gridDim.x)) { + nb_tasks ++; + } + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); } __syncthreads(); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 381aaf99ae8..0a51f66d877 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -443,7 +443,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // int i; // for (i = 0; i < 4; i++) { // opal_empty_kernel<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + // pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); // } #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -640,7 +640,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor size_t* max_data ) { uint32_t i, j; - uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, residue_desc; + uint32_t count_desc, nb_blocks_per_description, residue_desc; uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec, dst_offset; unsigned char *destination, *destination_base, *source_base; @@ -736,12 +736,10 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor dst_offset = 0; 
thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - nb_blocks_used = 0; while (cuda_iov_count > 0) { - current_block = 0; - task_iteration = 0; + nb_blocks_used = 0; cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; source_base = (unsigned char*)cuda_iov[0].iov_base; @@ -749,9 +747,6 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - for (i = 0; i < nb_blocks; i++) { - cuda_iov_dist_h_current[i].nb_tasks = 0; - } for (i = 0; i < cuda_iov_count; i++) { /* pElem = &(description[orig_stack_index+i]);*/ @@ -786,21 +781,17 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = thread_per_block;// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block;// * sizeof(double); } else { - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = count_desc - j*thread_per_block; //(thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = count_desc - j*thread_per_block; //(thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", 
nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; - if (current_block >= nb_blocks) { - current_block = 0; - task_iteration ++; - assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); - } + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } /* handle residue */ @@ -812,16 +803,12 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; - if (current_block >= nb_blocks) { - current_block = 0; - task_iteration ++; - assert(task_iteration < CUDA_IOV_MAX_TASK_PER_BLOCK); - } + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } if (buffer_isfull) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 2ea3bb59885..a23aff7710c 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ 
-16,7 +16,10 @@ __global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* __shared__ uint32_t nb_tasks; if (threadIdx.x == 0) { - nb_tasks = cuda_iov_dist[blockIdx.x].nb_tasks; + nb_tasks = nb_blocks_used / gridDim.x; + if (blockIdx.x < nb_blocks_used % gridDim.x) { + nb_tasks ++; + } } __syncthreads(); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 4ee73897f68..696a2c12694 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -26,7 +26,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon uint32_t count_desc_tmp; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end, start_total, end_total; + TIMER_DATA_TYPE start, end; long total_time; #endif @@ -350,13 +350,13 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv } int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { uint32_t i, j; - uint32_t count_desc, current_block, task_iteration, nb_blocks_per_description, dst_offset, residue_desc; - uint32_t nb_blocks, thread_per_block; + uint32_t count_desc, nb_blocks_per_description, dst_offset, residue_desc; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *source, *source_base, *destination_base; size_t total_unpacked, total_converted; @@ -372,7 +372,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; - + #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; long total_time, move_time; @@ -410,7 +410,6 @@ int32_t 
opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert free_required = 1; } } - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV, GPU base %p, unpack from buffer %p, total size %ld\n", pConvertor->pBaseBuf, source, iov[0].iov_len); ); @@ -440,15 +439,14 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); #endif - + dst_offset = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - + while (cuda_iov_count > 0) { - - current_block = 0; - task_iteration = 0; + + nb_blocks_used = 0; cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; destination_base = (unsigned char*)cuda_iov[0].iov_base; @@ -456,10 +454,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - for (i = 0; i < nb_blocks; i++) { - cuda_iov_dist_h_current[i].nb_tasks = 0; - } - + for (i = 0; i < cuda_iov_count; i++) { // pElem = &(description[orig_stack_index+i]); if (buffer_size >= cuda_iov[i].iov_len) { @@ -472,7 +467,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert } buffer_size -= length_per_iovec; total_unpacked += length_per_iovec; - + /* check alignment */ if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)source % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { alignment = ALIGNMENT_DOUBLE; @@ -482,6 +477,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert alignment = ALIGNMENT_CHAR; } + //alignment = ALIGNMENT_DOUBLE; + count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; @@ -491,18 
+488,18 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = thread_per_block;// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block;// * sizeof(double); } else { - cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; } - + /* handle residue */ if (residue_desc != 0) { /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ @@ -512,19 +509,19 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert 
(cuda_iov_dist_h_current[current_block].nb_elements[task_iteration] > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; } - + if (buffer_isfull) { break; } } -#if defined(OPAL_DATATYPE_CUDA_TIMING) +#if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d\n", source_base, total_time, cuda_streams->current_stream_id); ); @@ -533,8 +530,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); opal_generic_simple_unpack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); cuda_streams->current_stream_id ++; - cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; - + cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + /* buffer is full */ if (buffer_isfull) { size_t total_converted_tmp = total_converted; @@ -546,7 +543,6 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); - convertor_flags = pConvertor->flags; #endif convertor_flags = pConvertor->flags; 
// orig_stack_index = pStack->index; @@ -559,6 +555,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif } + // cudaDeviceSynchronize(); for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } @@ -573,7 +570,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert total_time = ELAPSED_TIME( start_total, end_total ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); #endif - + if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; if (pConvertor->gpu_buffer_ptr != NULL && free_required) { @@ -581,8 +578,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert pConvertor->gpu_buffer_ptr = NULL; } return 1; - } - return 0; + } + return 0; } void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, @@ -616,13 +613,13 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); -#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; *(SOURCE) = *(SOURCE) + _copy_loops * _end_loop->size; *(SPACE) -= _copy_loops * _end_loop->size; *(COUNT) -= _copy_loops; #endif - + cudaDeviceSynchronize(); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index e74a1d67883..d8f6fbe0687 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -566,8 +566,8 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, if 
(opal_datatype_gpu_init() != OPAL_SUCCESS) { opal_datatype_gpu_fini(); } -#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ -#endif +#endif /* OPAL_DATATYPE_CUDA_KERNEL */ +#endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -615,8 +615,8 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, if (opal_datatype_gpu_init() != OPAL_SUCCESS) { opal_datatype_gpu_fini(); } -#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ -#endif +#endif /* OPAL_DATATYPE_CUDA_KERNEL */ +#endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c index f05ecbd84b5..4e516766737 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -22,7 +22,9 @@ #include "opal_config.h" #include +#include #include +#include #include "opal/mca/installdirs/installdirs.h" #include "opal/datatype/opal_convertor_internal.h" diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 372d5a1291a..5a5a2470cb1 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -290,7 +290,7 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, (void*)pConvertor, (void*)pConvertor->pBaseBuf, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); - printf("I am in simple pack, max_data %lu, iov_len %lu\n", *max_data, iov[0].iov_len); + opal_output(0, "I am in simple pack, max_data %lu, iov_len %lu\n", *max_data, iov[0].iov_len); description = pConvertor->use_desc->desc; /* For the first step we have to add both displacement to the source. 
After in the @@ -390,7 +390,7 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, *out_size = iov_count; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - printf("total packed %lu\n", pConvertor->bConverted); + opal_output(0, "total packed %lu\n", pConvertor->bConverted); // double *vtmp = (double *)iov[0].iov_base; // for (uint32_t i = 0; i < total_packed/8; i++) { // printf(" %1.f ", *vtmp); @@ -425,8 +425,8 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, // return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { if (opal_generic_simple_pack_function_cuda_vector_p != NULL) { - // return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); - return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); + return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); + // return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); } } else { if (opal_generic_simple_pack_function_cuda_iov_p != NULL) { diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 4cbc8ac4b50..6bcb031003d 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -1911,7 +1911,9 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t if (!stage_three_init_complete) { if (0 != mca_common_cuda_stage_three_init()) { opal_cuda_support = 0; - } + } else { + opal_datatype_gpu_init(); + } } return 1; diff --git a/opal/mca/mpool/gpusm/mpool_gpusm.h b/opal/mca/mpool/gpusm/mpool_gpusm.h index 537c95108a8..5d3d02b5110 100644 --- a/opal/mca/mpool/gpusm/mpool_gpusm.h +++ b/opal/mca/mpool/gpusm/mpool_gpusm.h @@ -41,8 +41,8 @@ struct mca_mpool_gpusm_registration_t { uint64_t 
evtHandle[EVTHANDLE_SIZE]; /* CUipcEventHandle */ uintptr_t event; /* CUevent */ }; -typedef struct mca_mpool_gpusm_registration_t mca_mpool_gpusm_registration_t; -OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_mpool_gpusm_registration_t); +typedef struct mca_mpool_gpusm_registration_t mca_mpool_gpusm_registration_t; +OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_mpool_gpusm_registration_t); struct mca_mpool_gpusm_component_t { mca_mpool_base_component_t super; diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index 8c240423139..de658c503cb 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -15,7 +15,7 @@ if PROJECT_OMPI MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw unpack_ooo ddt_pack ddt_benchmark - MPI_CHECKS = to_self ddt_pack + MPI_CHECKS = to_self endif TESTS = opal_datatype_test $(MPI_TESTS) diff --git a/test/datatype/ddt_lib.h b/test/datatype/ddt_lib.h index 539434f9525..0f6bbc2cb37 100644 --- a/test/datatype/ddt_lib.h +++ b/test/datatype/ddt_lib.h @@ -96,5 +96,6 @@ extern ompi_datatype_t* create_strange_dt( void ); extern ompi_datatype_t* create_contiguous_type( const ompi_datatype_t* data, int count ); extern ompi_datatype_t* create_vector_type( const ompi_datatype_t* data, int count, int length, int stride ); +extern ompi_datatype_t* create_struct_constant_gap_resized_ddt( ompi_datatype_t* type ); extern ompi_datatype_t* create_struct_type(int count); From 7c86f4cc48451b9dff2357c616bab76a9e4a4ca9 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 27 Oct 2015 01:24:23 -0400 Subject: [PATCH 128/190] fix pipelining for non-contiguous to contiguous --- opal/mca/btl/smcuda/btl_smcuda.c | 16 +++++++++++----- opal/mca/btl/smcuda/btl_smcuda.h | 1 + opal/mca/btl/smcuda/btl_smcuda_component.c | 21 ++++++++++++--------- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 9d5d5441683..e53449e82eb 100644 --- 
a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1123,7 +1123,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, offset = (size_t) ((intptr_t) remote_address - (intptr_t) reg_ptr->base.base); remote_memory_address = (unsigned char *)reg_ptr->base.alloc_base + offset; if (0 != offset) { - printf("!!!!!!offset %lu, ra %p, base %p\n", offset, (void*)remote_address, (void*)reg_ptr->base.base); + printf("!!!!!!offset %lu, ra %p, base %p, remote %p\n", offset, (void*)remote_address, (void*)reg_ptr->base.base, remote_memory_address); opal_output(-1, "OFFSET=%d", (int)offset); } @@ -1151,10 +1151,12 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); return rc; } + struct opal_convertor_t *convertor = NULL; if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; - struct opal_convertor_t *convertor = &(recvreq->req_recv.req_base.req_convertor); + convertor = &(recvreq->req_recv.req_base.req_convertor); + printf("local addr %p, pbase %p\n", local_address, convertor->pBaseBuf); if (remote_device != local_device && !OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { convertor->gpu_buffer_ptr = NULL; @@ -1181,29 +1183,31 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, iov.iov_len = size; max_data = size; opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); done = 1; } } else { recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; if (pack_required) { + convertor = &(recvreq->req_recv.req_base.req_convertor); if (remote_device == local_device || OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { /* now we are able to let sender pack directly to my memory */ mca_mpool_common_cuda_reg_t loc_reg; mca_mpool_common_cuda_reg_t *loc_reg_ptr = &loc_reg; cuda_ddt_put_hdr_t put_msg; if (OPAL_SUCCESS 
!= cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL)) { - mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); } memcpy(put_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); put_msg.remote_address = local_address; put_msg.remote_base = loc_reg.base.base; put_msg.lindex = lindex; - mca_btl_smcuda_cuda_ddt_unpack_clone(ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_ddt_unpack_clone(ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, 0, 0); mca_btl_smcuda_send_cuda_put_sig(btl, ep, &put_msg); } else { - mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, NULL, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); } done = 0; @@ -1435,6 +1439,7 @@ void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_ddt_pack_clone[lindex].convertor = convertor; + endpoint->smcuda_ddt_pack_clone[lindex].current_convertor_pBaseBuf = convertor->pBaseBuf; endpoint->smcuda_ddt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; endpoint->smcuda_ddt_pack_clone[lindex].lindex = lindex; endpoint->smcuda_ddt_pack_clone[lindex].remote_device = remote_device; @@ -1449,6 +1454,7 @@ void mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoi int lindex, uint8_t remote_device, uint8_t local_device) { endpoint->smcuda_ddt_unpack_clone[lindex].convertor = convertor; + endpoint->smcuda_ddt_unpack_clone[lindex].current_convertor_pBaseBuf = convertor->pBaseBuf; 
endpoint->smcuda_ddt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; endpoint->smcuda_ddt_unpack_clone[lindex].lindex = lindex; endpoint->smcuda_ddt_unpack_clone[lindex].remote_device = remote_device; diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 288dc2027d3..26dbcb34b2d 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -538,6 +538,7 @@ typedef struct { /* package save pack/unpack convertor and cbfunc */ typedef struct { struct opal_convertor_t *convertor; + unsigned char *current_convertor_pBaseBuf; void *remote_gpu_address; int lindex; uint8_t remote_device; diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 51be3eafafa..3d8d01c90a1 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -887,12 +887,16 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, size_t max_data; struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; size_t pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; - if (convertor == NULL) { /* do not unpack */ + convertor->flags &= ~CONVERTOR_CUDA; + if (opal_convertor_need_buffers(convertor) == false) { /* do not unpack */ + convertor->flags |= CONVERTOR_CUDA; mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; - unsigned char *local_address = (unsigned char*)frag_recv->segment.seg_addr.pval; - opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld\n", local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address+seq*pipeline_size, packed_size); - mca_common_cuda_memp2pcpy(local_address + seq*pipeline_size, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); + unsigned char *local_address = my_cuda_dt_clone->current_convertor_pBaseBuf; + opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld\n", local_address, 
my_cuda_dt_clone->remote_gpu_address+seq*pipeline_size, packed_size); + mca_common_cuda_memp2pcpy(local_address, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); + my_cuda_dt_clone->current_convertor_pBaseBuf += packed_size; } else { /* unpack */ + convertor->flags |= CONVERTOR_CUDA; if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(packed_size, 0); (*opal_cuda_d2dcpy_async_p)(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); @@ -912,7 +916,6 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, } } send_msg.seq = seq; - send_msg.packed_size = packed_size; if (msg_type == CUDA_DDT_COMPLETE) { send_msg.msg_type = CUDA_DDT_COMPLETE_ACK; } else { @@ -934,7 +937,6 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, int seq = recv_msg.seq; int lindex = recv_msg.lindex; int msg_type = recv_msg.msg_type; - size_t packed_size = recv_msg.packed_size; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; cuda_ddt_clone_t *my_cuda_dt_clone; cuda_ddt_hdr_t send_msg; @@ -942,6 +944,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, uint32_t iov_count = 1; int rv_dt = 0; size_t max_data = 0; + size_t packed_size = 0; /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; @@ -962,8 +965,8 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } else if (msg_type == CUDA_DDT_PACK_TO_BLOCK) { if (convertor->bConverted < convertor->local_size) { struct iovec iov; - iov.iov_base = convertor->gpu_buffer_ptr + seq * mca_btl_smcuda_component.cuda_ddt_pipeline_size;; - iov.iov_len = packed_size; + iov.iov_base = convertor->gpu_buffer_ptr + seq * mca_btl_smcuda_component.cuda_ddt_pipeline_size; + iov.iov_len = 
mca_btl_smcuda_component.cuda_ddt_pipeline_size; rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); packed_size = max_data; send_msg.packed_size = packed_size; @@ -977,8 +980,8 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, } } else if (msg_type == CUDA_DDT_PACK_START) { struct iovec iov; - iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; iov.iov_base = convertor->gpu_buffer_ptr; + iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; seq = 0; while (rv_dt != 1 && convertor->gpu_buffer_size > 0) { rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); From 06a07a52810e49f7be70fbdbe004ca2e7d9962a3 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Tue, 27 Oct 2015 18:30:10 -0400 Subject: [PATCH 129/190] opal_datatype is chnaged, so we need more space --- ompi/datatype/ompi_datatype.h | 2 +- opal/mca/btl/smcuda/btl_smcuda.c | 29 ++++++++++++++--------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/ompi/datatype/ompi_datatype.h b/ompi/datatype/ompi_datatype.h index 17e1632e07d..9ff0719867c 100644 --- a/ompi/datatype/ompi_datatype.h +++ b/ompi/datatype/ompi_datatype.h @@ -94,7 +94,7 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_datatype_t); /* Using set constant for padding of the DATATYPE handles because the size of * base structure is very close to being the same no matter the bitness. 
*/ -#define PREDEFINED_DATATYPE_PAD (512) +#define PREDEFINED_DATATYPE_PAD (1024) struct ompi_predefined_datatype_t { struct ompi_datatype_t dt; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index e53449e82eb..be8df760f4f 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -72,9 +72,9 @@ #include "btl_smcuda_frag.h" #include "btl_smcuda_fifo.h" -#include "ompi/mca/pml/ob1/pml_ob1_recvreq.h" +#include "ompi/mca/bml/bml.h" #include "ompi/mca/pml/ob1/pml_ob1_rdmafrag.h" - +#include "ompi/mca/pml/base/pml_base_request.h" #if OPAL_CUDA_SUPPORT static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( @@ -1136,26 +1136,25 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, /* datatype RDMA */ mca_pml_ob1_rdma_frag_t *frag_ob1 = cbdata; - mca_pml_ob1_recv_request_t *recvreq = (mca_pml_ob1_recv_request_t *) frag_ob1->rdma_req; mca_bml_base_btl_t *bml_btl = frag_ob1->rdma_bml; - - if ((recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA) && + mca_pml_base_request_t *req = (mca_pml_base_request_t*) frag_ob1->rdma_req; + opal_convertor_t* convertor = &req->req_convertor; + + if ((convertor->flags & CONVERTOR_CUDA) && (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) { - recvreq->req_recv.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; + convertor->flags &= ~CONVERTOR_CUDA; uint8_t pack_required = remote_handle->reg_data.pack_required; uint32_t lindex = remote_handle->reg_data.lindex; uint8_t remote_device = remote_handle->reg_data.gpu_device; - uint8_t local_device = 0; + int32_t local_device = 0; rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); return rc; } - struct opal_convertor_t *convertor = NULL; - if(opal_convertor_need_buffers(&recvreq->req_recv.req_base.req_convertor) == true) { - recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; + 
if(opal_convertor_need_buffers(convertor) == true) { + convertor->flags |= CONVERTOR_CUDA; - convertor = &(recvreq->req_recv.req_base.req_convertor); printf("local addr %p, pbase %p\n", local_address, convertor->pBaseBuf); if (remote_device != local_device && !OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { @@ -1163,7 +1162,6 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, } else { convertor->gpu_buffer_ptr = remote_memory_address; } - cuda_ddt_hdr_t send_msg; if (pack_required) { mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); @@ -1187,9 +1185,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, done = 1; } } else { - recvreq->req_recv.req_base.req_convertor.flags |= CONVERTOR_CUDA; + convertor->flags |= CONVERTOR_CUDA; if (pack_required) { - convertor = &(recvreq->req_recv.req_base.req_convertor); if (remote_device == local_device || OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { /* now we are able to let sender pack directly to my memory */ mca_mpool_common_cuda_reg_t loc_reg; @@ -1396,8 +1393,10 @@ inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_ send_msg.packed_size = 0; send_msg.seq = 0; send_msg.msg_type = CUDA_DDT_PACK_START; - opal_output(0, "smcuda btl start pack, remote_gpu_address %p, frag %p, lindex %d, remote_device %d, local_device %d\n", remote_gpu_address, frag, lindex, remote_device, local_device); + opal_output(0, "smcuda btl start pack, remote_gpu_address %p, frag %p, lindex %d, remote_device %d, local_device %d\n", + (void*)remote_gpu_address, (void*)frag, lindex, remote_device, local_device); mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); + return OPAL_SUCCESS; } int mca_btl_smcuda_alloc_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint) From 986e5c95bf02020cd6a10be25ae8746bc477dc6a Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Tue, 27 Oct 2015 18:52:03 -0400 Subject: [PATCH 
130/190] reorder datatypes to cache boundaries --- opal/datatype/opal_datatype.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index c76df3bc373..beb5d0e0e20 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -107,33 +107,32 @@ struct opal_datatype_t { size_t size; /**< total size in bytes of the memory used by the data if the data is put on a contiguous buffer */ OPAL_PTRDIFF_TYPE true_lb; /**< the true lb of the data without user defined lb and ub */ + /* --- cacheline 1 boundary (64 bytes) --- */ OPAL_PTRDIFF_TYPE true_ub; /**< the true ub of the data without user defined lb and ub */ OPAL_PTRDIFF_TYPE lb; /**< lower bound in memory */ OPAL_PTRDIFF_TYPE ub; /**< upper bound in memory */ - /* --- cacheline 1 boundary (64 bytes) --- */ size_t nbElems; /**< total number of elements inside the datatype */ - uint32_t align; /**< data should be aligned to */ /* Attribute fields */ char name[OPAL_MAX_OBJECT_NAME]; /**< name of the datatype */ - /* --- cacheline 2 boundary (128 bytes) was 8-12 bytes ago --- */ + /* --- cacheline 2 boundary (128 bytes) was 40 bytes ago --- */ dt_type_desc_t desc; /**< the data description */ dt_type_desc_t opt_desc; /**< short description of the data used when conversion is useless or in the send case (without conversion) */ + uint32_t align; /**< data should be aligned to */ uint32_t btypes[OPAL_DATATYPE_MAX_SUPPORTED]; /**< basic elements count used to compute the size of the datatype for remote nodes. The length of the array is dependent on the maximum number of datatypes of all top layers. Reason being is that Fortran is not at the OPAL layer. 
*/ - /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */ - + /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */ struct iovec* iov; int iov_count; size_t max_data; - /* size: 372, cachelines: 6, members: 18 */ + /* size: 416, cachelines: 7, members: 18 */ - /* last cacheline: 28-32 bytes */ + /* last cacheline: 32 bytes */ }; typedef struct opal_datatype_t opal_datatype_t; From 08f69f6355c11263d978fc6ae90ed3cd9bad8c16 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 27 Oct 2015 22:01:02 -0400 Subject: [PATCH 131/190] slience warnings --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 17 ++++++++--------- opal/mca/btl/smcuda/btl_smcuda.c | 12 ++++++------ opal/mca/btl/smcuda/btl_smcuda.h | 8 ++++---- opal/mca/btl/smcuda/btl_smcuda_component.c | 14 ++++++++------ opal/mca/common/cuda/common_cuda.h | 4 +--- 5 files changed, 27 insertions(+), 28 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 3dcb0b9ad14..89a397d41f2 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -52,7 +52,7 @@ size_t mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - size_t pipeline_size, int lindex, uint8_t pack_required, uint8_t gpu_device); + int lindex, uint8_t pack_required, int32_t gpu_device); int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -67,7 +67,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_btl, size_t size) { int rc; - int local_device = 0; + int32_t local_device = 0; #if OPAL_CUDA_SUPPORT_41 #if OPAL_CUDA_GDR_SUPPORT /* With some BTLs, switch to RNDV from RGET at large messages */ @@ -91,10 +91,10 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { - opal_output_verbose(0, "Failed to get the GPU device ID, rc=%d", rc); + 
opal_output(0, "Failed to get the GPU device ID, rc= %d\n", rc); return rc; } - mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, -1, 0, local_device); + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, -1, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { @@ -137,10 +137,10 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, assert(lindex >= 0); rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { - opal_output_verbose(0, "Failed to get the GPU device ID, rc=%d", rc); + opal_output(0, "Failed to get the GPU device ID, rc=%d\n", rc); return rc; } - mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, 0, lindex, 1, local_device); + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, lindex, 1, local_device); mca_btl_smcuda_cuda_ddt_pack_clone( bml_btl->btl_endpoint, convertor, NULL, NULL, lindex, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, @@ -219,9 +219,9 @@ size_t mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - size_t pipeline_size, int lindex, uint8_t pack_required, uint8_t gpu_device) + int lindex, uint8_t pack_required, int32_t gpu_device) { - uint32_t i, j; + uint32_t i; for (i = 0; i < num_btls_used; i++) { mca_btl_base_registration_handle_t *handle = rdma_btls[i].btl_reg; mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t *) @@ -231,7 +231,6 @@ int mca_pml_ob1_rdma_cuda_btl_register_data( // mca_common_cuda_geteventhandle(&convertor->pipeline_event[j], j, (mca_mpool_base_registration_t *)cuda_reg); // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); // } - // cuda_reg->data.pipeline_size = pipeline_size; cuda_reg->data.lindex = lindex; 
cuda_reg->data.pack_required = pack_required; cuda_reg->data.gpu_device = gpu_device; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index be8df760f4f..f6e27a7c47c 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -89,7 +89,7 @@ inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_ struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, - int lindex, uint8_t remote_device, uint8_t local_device); + int lindex, int remote_device, int local_device); #endif mca_btl_smcuda_t mca_btl_smcuda = { @@ -1145,8 +1145,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, convertor->flags &= ~CONVERTOR_CUDA; uint8_t pack_required = remote_handle->reg_data.pack_required; uint32_t lindex = remote_handle->reg_data.lindex; - uint8_t remote_device = remote_handle->reg_data.gpu_device; - int32_t local_device = 0; + int remote_device = remote_handle->reg_data.gpu_device; + int local_device = 0; rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); @@ -1384,7 +1384,7 @@ inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_ struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, - int lindex, uint8_t remote_device, uint8_t local_device) + int lindex, int remote_device, int local_device) { cuda_ddt_hdr_t send_msg; mca_btl_smcuda_cuda_ddt_unpack_clone(endpoint, convertor, remote_gpu_address, (mca_btl_base_descriptor_t *)frag, @@ -1435,7 +1435,7 @@ void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, - int lindex, uint8_t remote_device, uint8_t local_device) + int lindex, int remote_device, int local_device) { endpoint->smcuda_ddt_pack_clone[lindex].convertor = convertor; 
endpoint->smcuda_ddt_pack_clone[lindex].current_convertor_pBaseBuf = convertor->pBaseBuf; @@ -1450,7 +1450,7 @@ void mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoi struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, - int lindex, uint8_t remote_device, uint8_t local_device) + int lindex, int remote_device, int local_device) { endpoint->smcuda_ddt_unpack_clone[lindex].convertor = convertor; endpoint->smcuda_ddt_unpack_clone[lindex].current_convertor_pBaseBuf = convertor->pBaseBuf; diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 26dbcb34b2d..ec5cbfa129c 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -541,8 +541,8 @@ typedef struct { unsigned char *current_convertor_pBaseBuf; void *remote_gpu_address; int lindex; - uint8_t remote_device; - uint8_t local_device; + int remote_device; + int local_device; mca_btl_base_descriptor_t *frag; } cuda_ddt_clone_t; @@ -559,12 +559,12 @@ void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, - int lindex, uint8_t remote_device, uint8_t local_device); + int lindex, int remote_device, int local_device); void mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, struct opal_convertor_t *convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, - int lindex, uint8_t remote_device, uint8_t local_device); + int lindex, int remote_device, int local_device); #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 3d8d01c90a1..183edb8b671 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -888,20 +888,22 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, struct opal_convertor_t 
*convertor = my_cuda_dt_clone->convertor; size_t pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; convertor->flags &= ~CONVERTOR_CUDA; + unsigned char *remote_address = NULL; if (opal_convertor_need_buffers(convertor) == false) { /* do not unpack */ convertor->flags |= CONVERTOR_CUDA; - mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; unsigned char *local_address = my_cuda_dt_clone->current_convertor_pBaseBuf; - opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld\n", local_address, my_cuda_dt_clone->remote_gpu_address+seq*pipeline_size, packed_size); - mca_common_cuda_memp2pcpy(local_address, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); + remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; + opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld\n", local_address, remote_address, packed_size); + mca_common_cuda_memp2pcpy(local_address, (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); my_cuda_dt_clone->current_convertor_pBaseBuf += packed_size; } else { /* unpack */ convertor->flags |= CONVERTOR_CUDA; if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(packed_size, 0); - (*opal_cuda_d2dcpy_async_p)(convertor->gpu_buffer_ptr, my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); + remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; + (*opal_cuda_d2dcpy_async_p)(convertor->gpu_buffer_ptr, remote_address, packed_size); iov.iov_base = convertor->gpu_buffer_ptr; - opal_output(0, "unpack, start D2D copy src %p, dst %p, size %lu\n", my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, convertor->gpu_buffer_ptr, packed_size); + opal_output(0, "unpack, start D2D copy src %p, dst %p, size %lu\n", 
remote_address, convertor->gpu_buffer_ptr, packed_size); } else { iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; } @@ -985,7 +987,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, seq = 0; while (rv_dt != 1 && convertor->gpu_buffer_size > 0) { rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); - iov.iov_base += mca_btl_smcuda_component.cuda_ddt_pipeline_size; + iov.iov_base = (void*)((unsigned char*)iov.iov_base + mca_btl_smcuda_component.cuda_ddt_pipeline_size); convertor->gpu_buffer_size -= mca_btl_smcuda_component.cuda_ddt_pipeline_size; send_msg.packed_size = max_data; send_msg.seq = seq; diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index d5220052d63..61256fa6809 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -38,11 +38,9 @@ struct mca_mpool_common_cuda_reg_data_t { uint64_t event; opal_ptr_t memh_seg_addr; size_t memh_seg_len; - // uint64_t pipeline_evtHandle[MAX_IPC_EVENT_HANDLE*EVTHANDLE_SIZE]; - size_t pipeline_size; uint32_t lindex; uint8_t pack_required; - uint8_t gpu_device; + int32_t gpu_device; }; typedef struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; From de1ef4e549647085c057f46d9fc28c757e4447be Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 28 Oct 2015 16:29:28 -0400 Subject: [PATCH 132/190] remove smcuda btl calls from pml ob1 --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 15 +- opal/mca/btl/smcuda/btl_smcuda.c | 155 +++++++++++---------- opal/mca/btl/smcuda/btl_smcuda.h | 29 ++-- opal/mca/btl/smcuda/btl_smcuda_component.c | 33 ++--- opal/mca/btl/smcuda/btl_smcuda_endpoint.h | 5 +- opal/mca/common/cuda/common_cuda.h | 2 +- 6 files changed, 115 insertions(+), 124 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 89a397d41f2..a8c507f35c6 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -52,7 +52,7 @@ size_t 
mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - int lindex, uint8_t pack_required, int32_t gpu_device); + struct opal_convertor_t *pack_convertor, uint8_t pack_required, int32_t gpu_device); int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -78,6 +78,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, #endif /* OPAL_CUDA_GDR_SUPPORT */ sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; + struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) { unsigned char *base; opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base ); @@ -94,7 +95,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, opal_output(0, "Failed to get the GPU device ID, rc= %d\n", rc); return rc; } - mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, -1, 0, local_device); + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { @@ -115,7 +116,6 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_endpoint_btl = mca_bml_base_btl_array_get_index(&(sendreq->req_endpoint->btl_send), 0); if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && CUDA_DDT_WITH_RDMA) { unsigned char *base; - struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); size_t buffer_size = 0; if (convertor->local_size > bml_btl->btl->btl_cuda_ddt_pipeline_size) { buffer_size = bml_btl->btl->btl_cuda_ddt_pipeline_size * bml_btl->btl->btl_cuda_ddt_pipeline_depth; @@ -133,15 +133,12 @@ int 
mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, sendreq->req_send.req_bytes_packed, sendreq->req_rdma))) { - int lindex = mca_btl_smcuda_alloc_cuda_ddt_pack_clone(bml_btl->btl_endpoint); - assert(lindex >= 0); rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { opal_output(0, "Failed to get the GPU device ID, rc=%d\n", rc); return rc; } - mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, lindex, 1, local_device); - mca_btl_smcuda_cuda_ddt_pack_clone( bml_btl->btl_endpoint, convertor, NULL, NULL, lindex, 0, local_device); + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, 1, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); @@ -219,7 +216,7 @@ size_t mca_pml_ob1_rdma_cuda_btls( int mca_pml_ob1_rdma_cuda_btl_register_data( mca_pml_ob1_com_btl_t* rdma_btls, uint32_t num_btls_used, - int lindex, uint8_t pack_required, int32_t gpu_device) + struct opal_convertor_t *pack_convertor, uint8_t pack_required, int32_t gpu_device) { uint32_t i; for (i = 0; i < num_btls_used; i++) { @@ -231,9 +228,9 @@ int mca_pml_ob1_rdma_cuda_btl_register_data( // mca_common_cuda_geteventhandle(&convertor->pipeline_event[j], j, (mca_mpool_base_registration_t *)cuda_reg); // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); // } - cuda_reg->data.lindex = lindex; cuda_reg->data.pack_required = pack_required; cuda_reg->data.gpu_device = gpu_device; + cuda_reg->data.pack_convertor = pack_convertor; } return 0; diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index f6e27a7c47c..2d015ad11fb 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -86,7 +86,8 @@ static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl, inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t 
*endpoint, - struct opal_convertor_t *convertor, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, int lindex, int remote_device, int local_device); @@ -500,9 +501,13 @@ create_sm_endpoint(int local_proc, struct opal_proc_t *proc) ep->mpool = mca_mpool_base_module_create("rgpusm", NULL, &resources); - for (int i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - ep->smcuda_ddt_pack_clone[i].lindex = -1; - ep->smcuda_ddt_unpack_clone[i].lindex = -1; + /* alloc array for pack/unpack use */ + ep->smcuda_ddt_clone = NULL; + ep->smcuda_ddt_clone = (cuda_ddt_clone_t *)malloc(sizeof(cuda_ddt_clone_t) * SMCUDA_DT_CLONE_SIZE); + ep->smcuda_ddt_clone_size = SMCUDA_DT_CLONE_SIZE; + ep->smcuda_ddt_clone_avail = SMCUDA_DT_CLONE_SIZE; + for (int i = 0; i < ep->smcuda_ddt_clone_size; i++) { + ep->smcuda_ddt_clone[i].lindex = -1; } } #endif /* OPAL_CUDA_SUPPORT */ @@ -709,6 +714,15 @@ int mca_btl_smcuda_del_procs( struct opal_proc_t **procs, struct mca_btl_base_endpoint_t **peers) { + int32_t proc; + struct mca_btl_base_endpoint_t * ep; + for (proc = 0; proc < (int32_t)nprocs; proc++) { + ep = peers[proc]; + if (ep->smcuda_ddt_clone != NULL) { + free(ep->smcuda_ddt_clone); + ep->smcuda_ddt_clone = NULL; + } + } return OPAL_SUCCESS; } @@ -1138,32 +1152,34 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, mca_pml_ob1_rdma_frag_t *frag_ob1 = cbdata; mca_bml_base_btl_t *bml_btl = frag_ob1->rdma_bml; mca_pml_base_request_t *req = (mca_pml_base_request_t*) frag_ob1->rdma_req; - opal_convertor_t* convertor = &req->req_convertor; + opal_convertor_t* unpack_convertor = &req->req_convertor; - if ((convertor->flags & CONVERTOR_CUDA) && + if ((unpack_convertor->flags & CONVERTOR_CUDA) && (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) { - convertor->flags &= ~CONVERTOR_CUDA; + unpack_convertor->flags &= ~CONVERTOR_CUDA; uint8_t pack_required = remote_handle->reg_data.pack_required; - uint32_t 
lindex = remote_handle->reg_data.lindex; + int lindex = -1; int remote_device = remote_handle->reg_data.gpu_device; + opal_convertor_t* pack_convertor = remote_handle->reg_data.pack_convertor; int local_device = 0; rc = mca_common_cuda_get_device(&local_device); if (rc != 0) { opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); return rc; } - if(opal_convertor_need_buffers(convertor) == true) { - convertor->flags |= CONVERTOR_CUDA; + if(opal_convertor_need_buffers(unpack_convertor) == true) { + unpack_convertor->flags |= CONVERTOR_CUDA; - printf("local addr %p, pbase %p\n", local_address, convertor->pBaseBuf); + printf("local addr %p, pbase %p\n", local_address, unpack_convertor->pBaseBuf); if (remote_device != local_device && !OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { - convertor->gpu_buffer_ptr = NULL; + unpack_convertor->gpu_buffer_ptr = NULL; } else { - convertor->gpu_buffer_ptr = remote_memory_address; + unpack_convertor->gpu_buffer_ptr = remote_memory_address; } if (pack_required) { - mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex = mca_btl_smcuda_alloc_cuda_ddt_clone(ep); + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); done = 0; } else { @@ -1171,40 +1187,42 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, uint32_t iov_count = 1; size_t max_data; if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && remote_device != local_device) { - convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(size, 0); - (*opal_cuda_d2dcpy_async_p)(convertor->gpu_buffer_ptr, remote_memory_address, size); - iov.iov_base = convertor->gpu_buffer_ptr; - opal_output(0, "start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, convertor->gpu_buffer_ptr, size); + unpack_convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(size, 0); + 
(*opal_cuda_d2dcpy_async_p)(unpack_convertor->gpu_buffer_ptr, remote_memory_address, size); + iov.iov_base = unpack_convertor->gpu_buffer_ptr; + opal_output(0, "start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, unpack_convertor->gpu_buffer_ptr, size); } else { - iov.iov_base = convertor->gpu_buffer_ptr; + iov.iov_base = unpack_convertor->gpu_buffer_ptr; } iov.iov_len = size; max_data = size; - opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); - opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + opal_convertor_unpack(unpack_convertor, &iov, &iov_count, &max_data ); + opal_cuda_free_gpu_buffer_p(unpack_convertor->gpu_buffer_ptr, 0); done = 1; } } else { - convertor->flags |= CONVERTOR_CUDA; + unpack_convertor->flags |= CONVERTOR_CUDA; if (pack_required) { + lindex = mca_btl_smcuda_alloc_cuda_ddt_clone(ep); if (remote_device == local_device || OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { /* now we are able to let sender pack directly to my memory */ mca_mpool_common_cuda_reg_t loc_reg; mca_mpool_common_cuda_reg_t *loc_reg_ptr = &loc_reg; cuda_ddt_put_hdr_t put_msg; if (OPAL_SUCCESS != cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL)) { - mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); } memcpy(put_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); put_msg.remote_address = local_address; put_msg.remote_base = loc_reg.base.base; put_msg.lindex = lindex; - mca_btl_smcuda_cuda_ddt_unpack_clone(ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + put_msg.pack_convertor = pack_convertor; + mca_btl_smcuda_cuda_ddt_clone(ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, 
lindex, 0, 0); mca_btl_smcuda_send_cuda_put_sig(btl, ep, &put_msg); } else { - mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); } done = 0; @@ -1381,84 +1399,67 @@ int mca_btl_smcuda_send_cuda_put_sig(struct mca_btl_base_module_t* btl, inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, void *remote_gpu_address, mca_btl_base_descriptor_t *frag, int lindex, int remote_device, int local_device) { cuda_ddt_hdr_t send_msg; - mca_btl_smcuda_cuda_ddt_unpack_clone(endpoint, convertor, remote_gpu_address, (mca_btl_base_descriptor_t *)frag, + mca_btl_smcuda_cuda_ddt_clone(endpoint, pack_convertor, unpack_convertor, remote_gpu_address, (mca_btl_base_descriptor_t *)frag, lindex, remote_device, local_device); send_msg.lindex = lindex; send_msg.packed_size = 0; send_msg.seq = 0; send_msg.msg_type = CUDA_DDT_PACK_START; + send_msg.pack_convertor = pack_convertor; opal_output(0, "smcuda btl start pack, remote_gpu_address %p, frag %p, lindex %d, remote_device %d, local_device %d\n", (void*)remote_gpu_address, (void*)frag, lindex, remote_device, local_device); mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); return OPAL_SUCCESS; } -int mca_btl_smcuda_alloc_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint) -{ - int i; - for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - if (endpoint->smcuda_ddt_pack_clone[i].lindex == -1) { - return i; - } - } - return -1; -} -int mca_btl_smcuda_alloc_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint) +int mca_btl_smcuda_alloc_cuda_ddt_clone(struct mca_btl_base_endpoint_t 
*endpoint) { int i; - for (i = 0; i < SMCUDA_DT_CLONE_SIZE; i++) { - if (endpoint->smcuda_ddt_unpack_clone[i].lindex == -1) { - return i; + if (endpoint->smcuda_ddt_clone_avail > 0) { + for (i = 0; i < endpoint->smcuda_ddt_clone_size; i++) { + if (endpoint->smcuda_ddt_clone[i].lindex == -1) { + endpoint->smcuda_ddt_clone_avail --; + opal_output(0, "Alloc cuda ddt clone array success, lindex %d\n",i); + return i; + } } + } else { + endpoint->smcuda_ddt_clone = realloc(endpoint->smcuda_ddt_clone, endpoint->smcuda_ddt_clone_size + SMCUDA_DT_CLONE_SIZE); + endpoint->smcuda_ddt_clone_avail = SMCUDA_DT_CLONE_SIZE - 1; + endpoint->smcuda_ddt_clone_size += SMCUDA_DT_CLONE_SIZE; + return endpoint->smcuda_ddt_clone_size - SMCUDA_DT_CLONE_SIZE; } - return -1; -} - -void mca_btl_smcuda_free_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) -{ - assert(endpoint->smcuda_ddt_pack_clone[lindex].lindex == lindex); - endpoint->smcuda_ddt_pack_clone[lindex].lindex = -1; -} -void mca_btl_smcuda_free_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) -{ - assert(endpoint->smcuda_ddt_unpack_clone[lindex].lindex == lindex); - endpoint->smcuda_ddt_unpack_clone[lindex].lindex = -1; } -void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - int lindex, int remote_device, int local_device) +void mca_btl_smcuda_free_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) { - endpoint->smcuda_ddt_pack_clone[lindex].convertor = convertor; - endpoint->smcuda_ddt_pack_clone[lindex].current_convertor_pBaseBuf = convertor->pBaseBuf; - endpoint->smcuda_ddt_pack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_ddt_pack_clone[lindex].lindex = lindex; - endpoint->smcuda_ddt_pack_clone[lindex].remote_device = remote_device; - endpoint->smcuda_ddt_pack_clone[lindex].local_device = 
local_device; - endpoint->smcuda_ddt_pack_clone[lindex].frag = frag; + assert(endpoint->smcuda_ddt_clone[lindex].lindex == lindex); + endpoint->smcuda_ddt_clone[lindex].lindex = -1; + endpoint->smcuda_ddt_clone_avail ++; } -void mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - int lindex, int remote_device, int local_device) +void mca_btl_smcuda_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, int remote_device, int local_device) { - endpoint->smcuda_ddt_unpack_clone[lindex].convertor = convertor; - endpoint->smcuda_ddt_unpack_clone[lindex].current_convertor_pBaseBuf = convertor->pBaseBuf; - endpoint->smcuda_ddt_unpack_clone[lindex].remote_gpu_address = remote_gpu_address; - endpoint->smcuda_ddt_unpack_clone[lindex].lindex = lindex; - endpoint->smcuda_ddt_unpack_clone[lindex].remote_device = remote_device; - endpoint->smcuda_ddt_unpack_clone[lindex].local_device = local_device; - endpoint->smcuda_ddt_unpack_clone[lindex].frag = frag; + endpoint->smcuda_ddt_clone[lindex].pack_convertor = pack_convertor; + endpoint->smcuda_ddt_clone[lindex].unpack_convertor = unpack_convertor; + endpoint->smcuda_ddt_clone[lindex].current_unpack_convertor_pBaseBuf = unpack_convertor->pBaseBuf; + endpoint->smcuda_ddt_clone[lindex].remote_gpu_address = remote_gpu_address; + endpoint->smcuda_ddt_clone[lindex].lindex = lindex; + endpoint->smcuda_ddt_clone[lindex].remote_device = remote_device; + endpoint->smcuda_ddt_clone[lindex].local_device = local_device; + endpoint->smcuda_ddt_clone[lindex].frag = frag; } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index ec5cbfa129c..8305029d79e 100644 --- 
a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -517,6 +517,7 @@ typedef struct { int seq; int msg_type; int packed_size; + struct opal_convertor_t *pack_convertor; } cuda_ddt_hdr_t; /* cuda datatype put message */ @@ -525,6 +526,7 @@ typedef struct { void *remote_address; void *remote_base; uint64_t mem_handle[8]; + struct opal_convertor_t *pack_convertor; } cuda_ddt_put_hdr_t; #define CUDA_DDT_UNPACK_FROM_BLOCK 0 @@ -537,8 +539,9 @@ typedef struct { /* package save pack/unpack convertor and cbfunc */ typedef struct { - struct opal_convertor_t *convertor; - unsigned char *current_convertor_pBaseBuf; + struct opal_convertor_t *pack_convertor; + struct opal_convertor_t *unpack_convertor; + unsigned char *current_unpack_convertor_pBaseBuf; void *remote_gpu_address; int lindex; int remote_device; @@ -551,20 +554,14 @@ typedef struct { int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_hdr_t *send_msg); int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_hdr_t *send_msg); int mca_btl_smcuda_send_cuda_put_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_put_hdr_t *put_msg); -int mca_btl_smcuda_alloc_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint); -int mca_btl_smcuda_alloc_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint); -void mca_btl_smcuda_free_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); -void mca_btl_smcuda_free_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); -void mca_btl_smcuda_cuda_ddt_pack_clone(struct mca_btl_base_endpoint_t *endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - int lindex, int remote_device, int local_device); -void mca_btl_smcuda_cuda_ddt_unpack_clone(struct mca_btl_base_endpoint_t 
*endpoint, - struct opal_convertor_t *convertor, - void *remote_gpu_address, - mca_btl_base_descriptor_t *frag, - int lindex, int remote_device, int local_device); +int mca_btl_smcuda_alloc_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint); +void mca_btl_smcuda_free_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); +void mca_btl_smcuda_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, int remote_device, int local_device); #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 183edb8b671..c7bdb40c028 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -856,7 +856,7 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata) { - struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_base_endpoint_t *endpoint = NULL; cuda_ddt_hdr_t recv_msg; mca_btl_base_segment_t* segments = des->des_segments; memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_hdr_t)); @@ -869,33 +869,34 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; - my_cuda_dt_clone = &endpoint->smcuda_ddt_unpack_clone[lindex]; + my_cuda_dt_clone = &endpoint->smcuda_ddt_clone[lindex]; assert(my_cuda_dt_clone->lindex == lindex); cuda_ddt_hdr_t send_msg; send_msg.lindex = lindex; + send_msg.pack_convertor = my_cuda_dt_clone->pack_convertor; if (msg_type == CUDA_DDT_CLEANUP) { mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; mca_btl_base_rdma_completion_fn_t cbfunc = 
(mca_btl_base_rdma_completion_fn_t) frag_recv->base.des_cbfunc; cbfunc (btl, endpoint, frag_recv->segment.seg_addr.pval, frag_recv->local_handle, frag_recv->base.des_context, frag_recv->base.des_cbdata, OPAL_SUCCESS); mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag_recv); - mca_btl_smcuda_free_cuda_ddt_unpack_clone(endpoint, lindex); + mca_btl_smcuda_free_cuda_ddt_clone(endpoint, lindex); } else if (msg_type == CUDA_DDT_UNPACK_FROM_BLOCK || msg_type == CUDA_DDT_COMPLETE){ struct iovec iov; uint32_t iov_count = 1; size_t max_data; - struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; + struct opal_convertor_t *convertor = my_cuda_dt_clone->unpack_convertor; size_t pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; convertor->flags &= ~CONVERTOR_CUDA; unsigned char *remote_address = NULL; if (opal_convertor_need_buffers(convertor) == false) { /* do not unpack */ convertor->flags |= CONVERTOR_CUDA; - unsigned char *local_address = my_cuda_dt_clone->current_convertor_pBaseBuf; + unsigned char *local_address = my_cuda_dt_clone->current_unpack_convertor_pBaseBuf; remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld\n", local_address, remote_address, packed_size); mca_common_cuda_memp2pcpy(local_address, (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); - my_cuda_dt_clone->current_convertor_pBaseBuf += packed_size; + my_cuda_dt_clone->current_unpack_convertor_pBaseBuf += packed_size; } else { /* unpack */ convertor->flags |= CONVERTOR_CUDA; if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { @@ -932,27 +933,25 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata) { - struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_base_endpoint_t 
*endpoint = NULL; cuda_ddt_hdr_t recv_msg; mca_btl_base_segment_t* segments = des->des_segments; memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_hdr_t)); int seq = recv_msg.seq; int lindex = recv_msg.lindex; int msg_type = recv_msg.msg_type; + struct opal_convertor_t *convertor = recv_msg.pack_convertor; mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; - cuda_ddt_clone_t *my_cuda_dt_clone; cuda_ddt_hdr_t send_msg; + /* We can find the endoint back from the rank embedded in the header */ + endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; + uint32_t iov_count = 1; int rv_dt = 0; size_t max_data = 0; size_t packed_size = 0; - /* We can find the endoint back from the rank embedded in the header */ - endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; - my_cuda_dt_clone = &endpoint->smcuda_ddt_pack_clone[lindex]; - - struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; send_msg.lindex = lindex; if (msg_type == CUDA_DDT_COMPLETE_ACK) { send_msg.packed_size = 0; @@ -963,7 +962,6 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); convertor->gpu_buffer_ptr = NULL; } - mca_btl_smcuda_free_cuda_ddt_pack_clone(endpoint, lindex); } else if (msg_type == CUDA_DDT_PACK_TO_BLOCK) { if (convertor->bConverted < convertor->local_size) { struct iovec iov; @@ -1009,21 +1007,19 @@ static void btl_smcuda_datatype_put(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata) { - struct mca_btl_base_endpoint_t *endpoint; + struct mca_btl_base_endpoint_t *endpoint = NULL; cuda_ddt_put_hdr_t recv_msg; mca_btl_base_segment_t* segments = des->des_segments; memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_put_hdr_t)); int lindex = recv_msg.lindex; void *remote_address = recv_msg.remote_address; void *remote_base = recv_msg.remote_base; + struct opal_convertor_t *convertor = recv_msg.pack_convertor; 
mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; - cuda_ddt_clone_t *my_cuda_dt_clone; cuda_ddt_hdr_t send_msg; /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; - my_cuda_dt_clone = &endpoint->smcuda_ddt_pack_clone[lindex]; - struct opal_convertor_t *convertor = my_cuda_dt_clone->convertor; opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; @@ -1051,7 +1047,6 @@ static void btl_smcuda_datatype_put(mca_btl_base_module_t* btl, send_msg.seq = -2; send_msg.msg_type = CUDA_DDT_CLEANUP; mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); - mca_btl_smcuda_free_cuda_ddt_pack_clone(endpoint, lindex); } #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h index f3b79866c14..20936dbeac1 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h +++ b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h @@ -49,8 +49,9 @@ struct mca_btl_base_endpoint_t { opal_proc_t *proc_opal; /**< Needed for adding CUDA IPC support dynamically */ enum ipcState ipcstate; /**< CUDA IPC connection status */ int ipctries; /**< Number of times CUDA IPC connect was sent */ - cuda_ddt_clone_t smcuda_ddt_pack_clone[SMCUDA_DT_CLONE_SIZE]; - cuda_ddt_clone_t smcuda_ddt_unpack_clone[SMCUDA_DT_CLONE_SIZE]; + cuda_ddt_clone_t *smcuda_ddt_clone; + int smcuda_ddt_clone_size; + int smcuda_ddt_clone_avail; #endif /* OPAL_CUDA_SUPPORT */ }; diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index 61256fa6809..9adda6dc82f 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -38,9 +38,9 @@ struct mca_mpool_common_cuda_reg_data_t { uint64_t event; opal_ptr_t memh_seg_addr; size_t memh_seg_len; - uint32_t lindex; uint8_t pack_required; int32_t gpu_device; + struct opal_convertor_t *pack_convertor; }; typedef 
struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; From b60bae51d3cfa0321d0dec4149d6d53bf573f4c4 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 28 Oct 2015 16:33:31 -0400 Subject: [PATCH 133/190] this file is not used anymore --- .../cuda/opal_datatype_orig_internal.h | 645 ------------------ 1 file changed, 645 deletions(-) delete mode 100644 opal/datatype/cuda/opal_datatype_orig_internal.h diff --git a/opal/datatype/cuda/opal_datatype_orig_internal.h b/opal/datatype/cuda/opal_datatype_orig_internal.h deleted file mode 100644 index 4dde12d235d..00000000000 --- a/opal/datatype/cuda/opal_datatype_orig_internal.h +++ /dev/null @@ -1,645 +0,0 @@ -#ifndef OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED -#define OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED - -#include - -#include "opal_config.h" - -#define OPAL_PTRDIFF_TYPE ptrdiff_t -#define DT_STATIC_STACK_SIZE 5 /**< This should be sufficient for most applications */ - -#if OPAL_ENABLE_DEBUG -/* Any kind of unique ID should do the job */ -#define OPAL_OBJ_MAGIC_ID ((0xdeafbeedULL << 32) + 0xdeafbeedULL) -#endif - -/* keep the last 16 bits free for data flags */ -#define CONVERTOR_DATATYPE_MASK 0x0000FFFF -#define CONVERTOR_SEND_CONVERSION 0x00010000 -#define CONVERTOR_RECV 0x00020000 -#define CONVERTOR_SEND 0x00040000 -#define CONVERTOR_HOMOGENEOUS 0x00080000 -#define CONVERTOR_NO_OP 0x00100000 -#define CONVERTOR_WITH_CHECKSUM 0x00200000 -#define CONVERTOR_CUDA 0x00400000 -#define CONVERTOR_CUDA_ASYNC 0x00800000 -#define CONVERTOR_TYPE_MASK 0x00FF0000 -#define CONVERTOR_STATE_START 0x01000000 -#define CONVERTOR_STATE_COMPLETE 0x02000000 -#define CONVERTOR_STATE_ALLOC 0x04000000 -#define CONVERTOR_COMPLETED 0x08000000 - -#define OPAL_DATATYPE_LOOP 0 -#define OPAL_DATATYPE_END_LOOP 1 -#define OPAL_DATATYPE_LB 2 -#define OPAL_DATATYPE_UB 3 -#define OPAL_DATATYPE_FIRST_TYPE 4 /* Number of first real type */ -#define OPAL_DATATYPE_INT1 4 -#define OPAL_DATATYPE_INT2 5 -#define 
OPAL_DATATYPE_INT4 6 -#define OPAL_DATATYPE_INT8 7 -#define OPAL_DATATYPE_INT16 8 -#define OPAL_DATATYPE_UINT1 9 -#define OPAL_DATATYPE_UINT2 10 -#define OPAL_DATATYPE_UINT4 11 -#define OPAL_DATATYPE_UINT8 12 -#define OPAL_DATATYPE_UINT16 13 -#define OPAL_DATATYPE_FLOAT2 14 -#define OPAL_DATATYPE_FLOAT4 15 -#define OPAL_DATATYPE_FLOAT8 16 -#define OPAL_DATATYPE_FLOAT12 17 -#define OPAL_DATATYPE_FLOAT16 18 -#define OPAL_DATATYPE_FLOAT_COMPLEX 19 -#define OPAL_DATATYPE_DOUBLE_COMPLEX 20 -#define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX 21 -#define OPAL_DATATYPE_BOOL 22 -#define OPAL_DATATYPE_WCHAR 23 -#define OPAL_DATATYPE_UNAVAILABLE 24 - -/* flags for the datatypes. */ -#define OPAL_DATATYPE_FLAG_UNAVAILABLE 0x0001 /**< datatypes unavailable on the build (OS or compiler dependant) */ -#define OPAL_DATATYPE_FLAG_PREDEFINED 0x0002 /**< cannot be removed: initial and predefined datatypes */ -#define OPAL_DATATYPE_FLAG_COMMITED 0x0004 /**< ready to be used for a send/recv operation */ -#define OPAL_DATATYPE_FLAG_OVERLAP 0x0008 /**< datatype is unpropper for a recv operation */ -#define OPAL_DATATYPE_FLAG_CONTIGUOUS 0x0010 /**< contiguous datatype */ -#define OPAL_DATATYPE_FLAG_NO_GAPS 0x0020 /**< no gaps around the datatype, aka OPAL_DATATYPE_FLAG_CONTIGUOUS and extent == size */ -#define OPAL_DATATYPE_FLAG_USER_LB 0x0040 /**< has a user defined LB */ -#define OPAL_DATATYPE_FLAG_USER_UB 0x0080 /**< has a user defined UB */ -#define OPAL_DATATYPE_FLAG_DATA 0x0100 /**< data or control structure */ -/* - * We should make the difference here between the predefined contiguous and non contiguous - * datatypes. The OPAL_DATATYPE_FLAG_BASIC is held by all predefined contiguous datatypes. 
- */ -#define OPAL_DATATYPE_FLAG_BASIC (OPAL_DATATYPE_FLAG_PREDEFINED | \ - OPAL_DATATYPE_FLAG_CONTIGUOUS | \ - OPAL_DATATYPE_FLAG_NO_GAPS | \ - OPAL_DATATYPE_FLAG_DATA | \ - OPAL_DATATYPE_FLAG_COMMITED) - -/* typedefs ***********************************************************/ - -typedef struct opal_object_t opal_object_t; -typedef struct opal_class_t opal_class_t; -typedef void (*opal_construct_t) (opal_object_t *); -typedef void (*opal_destruct_t) (opal_object_t *); - - -/* types **************************************************************/ - -/** -* Class descriptor. -* -* There should be a single instance of this descriptor for each class -* definition. -*/ -struct opal_class_t { - const char *cls_name; /**< symbolic name for class */ - opal_class_t *cls_parent; /**< parent class descriptor */ - opal_construct_t cls_construct; /**< class constructor */ - opal_destruct_t cls_destruct; /**< class destructor */ - int cls_initialized; /**< is class initialized */ - int cls_depth; /**< depth of class hierarchy tree */ - opal_construct_t *cls_construct_array; - /**< array of parent class constructors */ - opal_destruct_t *cls_destruct_array; - /**< array of parent class destructors */ - size_t cls_sizeof; /**< size of an object instance */ -}; - -/** - * Base object. - * - * This is special and does not follow the pattern for other classes. 
- */ -struct opal_object_t { -#if OPAL_ENABLE_DEBUG - /** Magic ID -- want this to be the very first item in the - struct's memory */ - uint64_t obj_magic_id; -#endif - opal_class_t *obj_class; /**< class descriptor */ - volatile int32_t obj_reference_count; /**< reference count */ -#if OPAL_ENABLE_DEBUG - const char* cls_init_file_name; /**< In debug mode store the file where the object get contructed */ - int cls_init_lineno; /**< In debug mode store the line number where the object get contructed */ -#endif /* OPAL_ENABLE_DEBUG */ -}; - -/** - * Declaration for class descriptor - * - * @param NAME Name of class - * - * Put this in NAME.h - */ -#define OBJ_CLASS_DECLARATION(NAME) \ - extern opal_class_t NAME ## _class - -/** - * Return a pointer to the class descriptor associated with a - * class type. - * - * @param NAME Name of class - * @return Pointer to class descriptor - */ -#define OBJ_CLASS(NAME) (&(NAME ## _class)) - -/** - * For static initializations of OBJects. - * - * @param NAME Name of the class to initialize - */ -#if OPAL_ENABLE_DEBUG -#define OPAL_OBJ_STATIC_INIT(BASE_CLASS) { OPAL_OBJ_MAGIC_ID, OBJ_CLASS(BASE_CLASS), 1, __FILE__, __LINE__ } -#else -#define OPAL_OBJ_STATIC_INIT(BASE_CLASS) { OBJ_CLASS(BASE_CLASS), 1 } -#endif - - - -struct ddt_elem_id_description { - uint16_t flags; /**< flags for the record */ - uint16_t type; /**< the basic data type id */ -}; -typedef struct ddt_elem_id_description ddt_elem_id_description; - -/* the basic element. A data description is composed - * by a set of basic elements. 
- */ -struct ddt_elem_desc { - ddt_elem_id_description common; /**< basic data description and flags */ - uint32_t count; /**< number of blocks */ - uint32_t blocklen; /**< number of elements on each block */ - OPAL_PTRDIFF_TYPE extent; /**< extent of each block (in bytes) */ - OPAL_PTRDIFF_TYPE disp; /**< displacement of the first block */ -}; -typedef struct ddt_elem_desc ddt_elem_desc_t; - -struct ddt_loop_desc { - ddt_elem_id_description common; /**< basic data description and flags */ - uint32_t loops; /**< number of elements */ - uint32_t items; /**< number of items in the loop */ - size_t unused; /**< not used right now */ - OPAL_PTRDIFF_TYPE extent; /**< extent of the whole loop */ -}; -typedef struct ddt_loop_desc ddt_loop_desc_t; - -struct ddt_endloop_desc { - ddt_elem_id_description common; /**< basic data description and flags */ - uint32_t items; /**< number of elements */ - uint32_t unused; /**< not used right now */ - size_t size; /**< real size of the data in the loop */ - OPAL_PTRDIFF_TYPE first_elem_disp; /**< the displacement of the first block in the loop */ -}; -typedef struct ddt_endloop_desc ddt_endloop_desc_t; - -union dt_elem_desc { - ddt_elem_desc_t elem; - ddt_loop_desc_t loop; - ddt_endloop_desc_t end_loop; -}; -typedef union dt_elem_desc dt_elem_desc_t; - -/* dt_type_description */ -typedef uint32_t opal_datatype_count_t; - -struct dt_type_desc_t { - opal_datatype_count_t length; /**< the maximum number of elements in the description array */ - opal_datatype_count_t used; /**< the number of used elements in the description array */ - dt_elem_desc_t* desc; -}; -typedef struct dt_type_desc_t dt_type_desc_t; - -/* - * The datatype description. - */ -#define OPAL_DATATYPE_MAX_PREDEFINED 25 -#define OPAL_DATATYPE_MAX_SUPPORTED 47 -#define OPAL_MAX_OBJECT_NAME 64 - -struct opal_datatype_t { - opal_object_t super; /**< basic superclass */ - uint16_t flags; /**< the flags */ - uint16_t id; /**< data id, normally the index in the data array. 
*/ - uint32_t bdt_used; /**< bitset of which basic datatypes are used in the data description */ - size_t size; /**< total size in bytes of the memory used by the data if - the data is put on a contiguous buffer */ - OPAL_PTRDIFF_TYPE true_lb; /**< the true lb of the data without user defined lb and ub */ - OPAL_PTRDIFF_TYPE true_ub; /**< the true ub of the data without user defined lb and ub */ - OPAL_PTRDIFF_TYPE lb; /**< lower bound in memory */ - OPAL_PTRDIFF_TYPE ub; /**< upper bound in memory */ - /* --- cacheline 1 boundary (64 bytes) --- */ - size_t nbElems; /**< total number of elements inside the datatype */ - uint32_t align; /**< data should be aligned to */ - - /* Attribute fields */ - char name[OPAL_MAX_OBJECT_NAME]; /**< name of the datatype */ - /* --- cacheline 2 boundary (128 bytes) was 8-12 bytes ago --- */ - dt_type_desc_t desc; /**< the data description */ - dt_type_desc_t opt_desc; /**< short description of the data used when conversion is useless - or in the send case (without conversion) */ - - uint32_t btypes[OPAL_DATATYPE_MAX_SUPPORTED]; - /**< basic elements count used to compute the size of the - datatype for remote nodes. The length of the array is dependent on - the maximum number of datatypes of all top layers. - Reason being is that Fortran is not at the OPAL layer. 
*/ - /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */ - - /* size: 352, cachelines: 6, members: 15 */ - /* last cacheline: 28-32 bytes */ -}; - -typedef struct opal_datatype_t opal_datatype_t; - -OPAL_DECLSPEC OBJ_CLASS_DECLARATION( opal_datatype_t ); - -/* convertor and stack */ -typedef struct opal_convertor_t opal_convertor_t; - -typedef int32_t (*convertor_advance_fct_t)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); -typedef void*(*memalloc_fct_t)( size_t* pLength, void* userdata ); -typedef void*(*memcpy_fct_t)( void* dest, const void* src, size_t n, opal_convertor_t* pConvertor ); - -/* The master convertor struct (defined in convertor_internal.h) */ -struct opal_convertor_master_t; - -struct dt_stack_t { - int32_t index; /**< index in the element description */ - int16_t type; /**< the type used for the last pack/unpack (original or OPAL_DATATYPE_UINT1) */ - size_t count; /**< number of times we still have to do it */ - OPAL_PTRDIFF_TYPE disp; /**< actual displacement depending on the count field */ -}; -typedef struct dt_stack_t dt_stack_t; - -typedef int32_t (*conversion_fct_t)( opal_convertor_t* pConvertor, uint32_t count, - const void* from, size_t from_len, OPAL_PTRDIFF_TYPE from_extent, - void* to, size_t to_length, OPAL_PTRDIFF_TYPE to_extent, - OPAL_PTRDIFF_TYPE *advance ); - -typedef struct opal_convertor_master_t { - struct opal_convertor_master_t* next; - uint32_t remote_arch; - uint32_t flags; - uint32_t hetero_mask; - const size_t remote_sizes[OPAL_DATATYPE_MAX_PREDEFINED]; - conversion_fct_t* pFunctions; /**< the convertor functions pointer */ -} opal_convertor_master_t; - -#define MAX_IPC_EVENT_HANDLE 10 - -struct opal_convertor_t { - opal_object_t super; /**< basic superclass */ - uint32_t remoteArch; /**< the remote architecture */ - uint32_t flags; /**< the properties of this convertor */ - size_t local_size; /**< overall length data on local machine, compared to bConverted 
*/ - size_t remote_size; /**< overall length data on remote machine, compared to bConverted */ - const opal_datatype_t* pDesc; /**< the datatype description associated with the convertor */ - const dt_type_desc_t* use_desc; /**< the version used by the convertor (normal or optimized) */ - opal_datatype_count_t count; /**< the total number of full datatype elements */ - uint32_t stack_size; /**< size of the allocated stack */ - /* --- cacheline 1 boundary (64 bytes) --- */ - unsigned char* pBaseBuf; /**< initial buffer as supplied by the user */ - dt_stack_t* pStack; /**< the local stack for the actual conversion */ - convertor_advance_fct_t fAdvance; /**< pointer to the pack/unpack functions */ - struct opal_convertor_master_t* master; /**< the master convertor */ - - /* All others fields get modified for every call to pack/unpack functions */ - uint32_t stack_pos; /**< the actual position on the stack */ - uint32_t partial_length; /**< amount of data left over from the last unpack */ - size_t bConverted; /**< # of bytes already converted */ - uint32_t checksum; /**< checksum computed by pack/unpack operation */ - uint32_t csum_ui1; /**< partial checksum computed by pack/unpack operation */ - size_t csum_ui2; /**< partial checksum computed by pack/unpack operation */ - /* --- cacheline 2 boundary (128 bytes) --- */ - dt_stack_t static_stack[DT_STATIC_STACK_SIZE]; /**< local stack for small datatypes */ - /* --- cacheline 3 boundary (192 bytes) was 56 bytes ago --- */ - -#if OPAL_CUDA_SUPPORT - memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ - void * stream; /**< CUstream for async copy */ - - unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ - uint64_t * pipeline_event[MAX_IPC_EVENT_HANDLE]; /**< cuda event for pipeline */ -#endif - /* size: 248, cachelines: 4, members: 20 */ - /* last cacheline: 56 bytes */ -}; - -struct iovec { - void *iov_base; /* Starting address */ - size_t iov_len; /* Length in bytes */ -}; - - -OPAL_DECLSPEC extern 
union dt_elem_desc opal_datatype_predefined_elem_desc[2 * OPAL_DATATYPE_MAX_PREDEFINED]; - -#define OPAL_DATATYPE_INIT_BTYPES_ARRAY_UNAVAILABLE { 0 } -#define OPAL_DATATYPE_INIT_BTYPES_ARRAY(NAME) { [OPAL_DATATYPE_ ## NAME] = 1 } - -#define OPAL_DATATYPE_INIT_NAME(NAME) "OPAL_" #NAME - -/* - * Macro to initialize the main description for basic types, setting the pointer - * into the array opal_datatype_predefined_type_desc, which is initialized at - * runtime in opal_datatype_init(). Each basic type has two desc-elements.... - */ -#define OPAL_DATATYPE_INIT_DESC_PREDEFINED(NAME) \ - { \ - .length = 1, .used = 1, \ - .desc = &(opal_datatype_predefined_elem_desc[2 * OPAL_DATATYPE_ ## NAME]) \ - } -#define OPAL_DATATYPE_INIT_DESC_NULL {.length = 0, .used = 0, .desc = NULL} - -#define OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( NAME, FLAGS ) \ - { \ - .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ - .flags = OPAL_DATATYPE_FLAG_UNAVAILABLE | OPAL_DATATYPE_FLAG_PREDEFINED | (FLAGS), \ - .id = OPAL_DATATYPE_ ## NAME, \ - .bdt_used = 0, \ - .size = 0, \ - .true_lb = 0, .true_ub = 0, .lb = 0, .ub = 0, \ - .align = 0, \ - .nbElems = 1, \ - .name = OPAL_DATATYPE_INIT_NAME(NAME), \ - .desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(UNAVAILABLE), \ - .opt_desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(UNAVAILABLE), \ - .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY_UNAVAILABLE \ - } - -#define OPAL_DATATYPE_INITIALIZER_EMPTY( FLAGS ) \ - { \ - .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ - .flags = OPAL_DATATYPE_FLAG_PREDEFINED | (FLAGS), \ - .id = 0, \ - .bdt_used = 0, \ - .size = 0, \ - .true_lb = 0, .true_ub = 0, .lb = 0, .ub = 0, \ - .align = 0, \ - .nbElems = 1, \ - .name = OPAL_DATATYPE_INIT_NAME(EMPTY), \ - .desc = OPAL_DATATYPE_INIT_DESC_NULL, \ - .opt_desc = OPAL_DATATYPE_INIT_DESC_NULL, \ - .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY_UNAVAILABLE \ - } - -#define OPAL_DATATYPE_INIT_BASIC_TYPE( TYPE, NAME, FLAGS ) \ - { \ - .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ - 
.flags = OPAL_DATATYPE_FLAG_PREDEFINED | (FLAGS), \ - .id = TYPE, \ - .bdt_used = (((uint32_t)1)<<(TYPE)), \ - .size = 0, \ - .true_lb = 0, .true_ub = 0, .lb = 0, .ub = 0, \ - .align = 0, \ - .nbElems = 1, \ - .name = OPAL_DATATYPE_INIT_NAME(NAME), \ - .desc = OPAL_DATATYPE_INIT_DESC_NULL, \ - .opt_desc = OPAL_DATATYPE_INIT_DESC_NULL, \ - .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY(NAME) \ - } - -#define OPAL_DATATYPE_INIT_BASIC_DATATYPE( TYPE, ALIGN, NAME, FLAGS ) \ - { \ - .super = OPAL_OBJ_STATIC_INIT(opal_datatype_t), \ - .flags = OPAL_DATATYPE_FLAG_BASIC | (FLAGS), \ - .id = OPAL_DATATYPE_ ## NAME, \ - .bdt_used = (((uint32_t)1)<<(OPAL_DATATYPE_ ## NAME)), \ - .size = sizeof(TYPE), \ - .true_lb = 0, .true_ub = sizeof(TYPE), .lb = 0, .ub = sizeof(TYPE), \ - .align = (ALIGN), \ - .nbElems = 1, \ - .name = OPAL_DATATYPE_INIT_NAME(NAME), \ - .desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(NAME), \ - .opt_desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(NAME), \ - .btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY(NAME) \ - } - -#define OPAL_DATATYPE_INITIALIZER_LOOP(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LOOP, LOOP, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_END_LOOP(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_END_LOOP, END_LOOP, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_LB(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_LB, LB, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_UB(FLAGS) OPAL_DATATYPE_INIT_BASIC_TYPE( OPAL_DATATYPE_UB, UB, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_INT1(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int8_t, OPAL_ALIGNMENT_INT8, INT1, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_INT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int16_t, OPAL_ALIGNMENT_INT16, INT2, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_INT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int32_t, OPAL_ALIGNMENT_INT32, INT4, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_INT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int64_t, OPAL_ALIGNMENT_INT64, INT8, FLAGS ) -#ifdef HAVE_INT128_T 
-#define OPAL_DATATYPE_INITIALIZER_INT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( int128_t, OPAL_ALIGNMENT_INT128, INT16, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_INT16(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( INT16, FLAGS ) -#endif -#define OPAL_DATATYPE_INITIALIZER_UINT1(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint8_t, OPAL_ALIGNMENT_INT8, UINT1, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_UINT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint16_t, OPAL_ALIGNMENT_INT16, UINT2, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_UINT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint32_t, OPAL_ALIGNMENT_INT32, UINT4, FLAGS ) -#define OPAL_DATATYPE_INITIALIZER_UINT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint64_t, OPAL_ALIGNMENT_INT64, UINT8, FLAGS ) -#ifdef HAVE_UINT128_T -#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( uint128_t, OPAL_ALIGNMENT_INT128, UINT16, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_UINT16(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( INT16, FLAGS ) -#endif - -#if SIZEOF_FLOAT == 2 -#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT2, FLAGS ) -#elif SIZEOF_DOUBLE == 2 -#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT2, FLAGS ) -#elif SIZEOF_LONG_DOUBLE == 2 -#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT2, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_FLOAT2(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT2, FLAGS ) -#endif - -#if SIZEOF_FLOAT == 4 -#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT4, FLAGS ) -#elif SIZEOF_DOUBLE == 4 -#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT4, FLAGS ) -#elif SIZEOF_LONG_DOUBLE 
== 4 -#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT4, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_FLOAT4(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT4, FLAGS ) -#endif - -#if SIZEOF_FLOAT == 8 -#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT8, FLAGS ) -#elif SIZEOF_DOUBLE == 8 -#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT8, FLAGS ) -#elif SIZEOF_LONG_DOUBLE == 8 -#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT8, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_FLOAT8(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT8, FLAGS ) -#endif - -#if SIZEOF_FLOAT == 12 -#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT12, FLAGS ) -#elif SIZEOF_DOUBLE == 12 -#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT12, FLAGS ) -#elif SIZEOF_LONG_DOUBLE == 12 -#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT12, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_FLOAT12(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT12, FLAGS ) -#endif - -#if SIZEOF_FLOAT == 16 -#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float, OPAL_ALIGNMENT_FLOAT, FLOAT16, FLAGS ) -#elif SIZEOF_DOUBLE == 16 -#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double, OPAL_ALIGNMENT_DOUBLE, FLOAT16, FLAGS ) -#elif SIZEOF_LONG_DOUBLE == 16 -#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double, OPAL_ALIGNMENT_LONG_DOUBLE, FLOAT16, FLAGS ) -#else 
-#define OPAL_DATATYPE_INITIALIZER_FLOAT16(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT16, FLAGS ) -#endif - -#if HAVE_FLOAT__COMPLEX -#define OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( float _Complex, OPAL_ALIGNMENT_FLOAT_COMPLEX, FLOAT_COMPLEX, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_FLOAT_COMPLEX(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( FLOAT_COMPLEX, FLAGS) -#endif - -#if HAVE_DOUBLE__COMPLEX -#define OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( double _Complex, OPAL_ALIGNMENT_DOUBLE_COMPLEX, DOUBLE_COMPLEX, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( DOUBLE_COMPLEX, FLAGS) -#endif - -#if HAVE_LONG_DOUBLE__COMPLEX -#define OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( long double _Complex, OPAL_ALIGNMENT_LONG_DOUBLE_COMPLEX, LONG_DOUBLE_COMPLEX, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_LONG_DOUBLE_COMPLEX(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( LONG_DOUBLE_COMPLEX, FLAGS) -#endif - -#define OPAL_DATATYPE_INITIALIZER_BOOL(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( _Bool, OPAL_ALIGNMENT_BOOL, BOOL, FLAGS ) - -#if OPAL_ALIGNMENT_WCHAR != 0 -#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS) OPAL_DATATYPE_INIT_BASIC_DATATYPE( wchar_t, OPAL_ALIGNMENT_WCHAR, WCHAR, FLAGS ) -#else -#define OPAL_DATATYPE_INITIALIZER_WCHAR(FLAGS) OPAL_DATATYPE_INITIALIZER_UNAVAILABLE_NAMED( WCHAR, FLAGS ) -#endif - -#define SAVE_STACK( PSTACK, INDEX, TYPE, COUNT, DISP) \ -do { \ - (PSTACK)->index = (INDEX); \ - (PSTACK)->type = (TYPE); \ - (PSTACK)->count = (COUNT); \ - (PSTACK)->disp = (DISP); \ -} while(0) - -#define PUSH_STACK( PSTACK, STACK_POS, INDEX, TYPE, COUNT, DISP) \ -do { \ - dt_stack_t* pTempStack = (PSTACK) + 1; \ - SAVE_STACK( pTempStack, (INDEX), (TYPE), (COUNT), (DISP) ); \ - (STACK_POS)++; \ - (PSTACK) = pTempStack; \ -} while(0) - 
-#define UPDATE_INTERNAL_COUNTERS( DESCRIPTION, POSITION, ELEMENT, COUNTER ) \ - do { \ - (ELEMENT) = &((DESCRIPTION)[(POSITION)]); \ - (COUNTER) = (ELEMENT)->elem.count; \ - } while (0) - -OPAL_DECLSPEC extern const size_t opal_datatype_basicDatatypesSize[OPAL_DATATYPE_MAX_PREDEFINED]; - -#define OPAL_DATATYPE_LOOP_SIZE 0 -#define OPAL_DATATYPE_END_LOOP_SIZE 0 -#define OPAL_DATATYPE_LB_SIZE 0 -#define OPAL_DATATYPE_UB_SIZE 0 -#define OPAL_DATATYPE_INT1_SIZE sizeof(int8_t) -#define OPAL_DATATYPE_INT2_SIZE sizeof(int16_t) -#define OPAL_DATATYPE_INT4_SIZE sizeof(int32_t) -#define OPAL_DATATYPE_INT8_SIZE sizeof(int64_t) -#ifdef HAVE_INT128_T -# define OPAL_DATATYPE_INT16_SIZE sizeof(int128_t) /* Yes, double-machine word integers are available */ -#else -# define OPAL_DATATYPE_INT16_SIZE 0 -#endif - -#define OPAL_DATATYPE_UINT1_SIZE sizeof(uint8_t) -#define OPAL_DATATYPE_UINT2_SIZE sizeof(uint16_t) -#define OPAL_DATATYPE_UINT4_SIZE sizeof(uint32_t) -#define OPAL_DATATYPE_UINT8_SIZE sizeof(uint64_t) -#ifdef HAVE_UINT128_T -# define OPAL_DATATYPE_UINT16_SIZE sizeof(uint128_t) /* Yes, double-machine word integers are available */ -#else -# define OPAL_DATATYPE_UINT16_SIZE 0 -#endif - -#if SIZEOF_FLOAT == 2 -# define OPAL_DATATYPE_FLOAT2_SIZE sizeof(float) -#elif SIZEOF_DOUBLE == 2 -# define OPAL_DATATYPE_FLOAT2_SIZE sizeof(double) -#elif SIZEOF_LONG_DOUBLE == 2 -# define OPAL_DATATYPE_FLOAT2_SIZE sizeof(long double) -#else -# define OPAL_DATATYPE_FLOAT2_SIZE 0 -#endif - -#if SIZEOF_FLOAT == 4 -# define OPAL_DATATYPE_FLOAT4_SIZE sizeof(float) -#elif SIZEOF_DOUBLE == 4 -# define OPAL_DATATYPE_FLOAT4_SIZE sizeof(double) -#elif SIZEOF_LONG_DOUBLE == 4 -# define OPAL_DATATYPE_FLOAT4_SIZE sizeof(long double) -#else -# define OPAL_DATATYPE_FLOAT4_SIZE 0 -#endif - -#if SIZEOF_FLOAT == 8 -# define OPAL_DATATYPE_FLOAT8_SIZE sizeof(float) -#elif SIZEOF_DOUBLE == 8 -# define OPAL_DATATYPE_FLOAT8_SIZE sizeof(double) -#elif SIZEOF_LONG_DOUBLE == 8 -# define OPAL_DATATYPE_FLOAT8_SIZE 
sizeof(long double) -#else -# define OPAL_DATATYPE_FLOAT8_SIZE 0 -#endif - -#if SIZEOF_FLOAT == 12 -# define OPAL_DATATYPE_FLOAT12_SIZE sizeof(float) -#elif SIZEOF_DOUBLE == 12 -# define OPAL_DATATYPE_FLOAT12_SIZE sizeof(double) -#elif SIZEOF_LONG_DOUBLE == 12 -# define OPAL_DATATYPE_FLOAT12_SIZE sizeof(long double) -#else -# define OPAL_DATATYPE_FLOAT12_SIZE 0 -#endif - -#if SIZEOF_FLOAT == 16 -# define OPAL_DATATYPE_FLOAT16_SIZE sizeof(float) -#elif SIZEOF_DOUBLE == 16 -# define OPAL_DATATYPE_FLOAT16_SIZE sizeof(double) -#elif SIZEOF_LONG_DOUBLE == 16 -# define OPAL_DATATYPE_FLOAT16_SIZE sizeof(long double) -#else -# define OPAL_DATATYPE_FLOAT16_SIZE 0 -#endif - -#if HAVE_FLOAT__COMPLEX -# define OPAL_DATATYPE_FLOAT_COMPLEX_SIZE sizeof(float _Complex) -#else -# define OPAL_DATATYPE_FLOAT_COMPLEX_SIZE 0 -#endif - -#if HAVE_DOUBLE__COMPLEX -# define OPAL_DATATYPE_DOUBLE_COMPLEX_SIZE sizeof(float _Complex) -#else -# define OPAL_DATATYPE_DOUBLE_COMPLEX_SIZE 0 -#endif - -#if HAVE_LONG_DOUBLE__COMPLEX -# define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX_SIZE sizeof(float _Complex) -#else -# define OPAL_DATATYPE_LONG_DOUBLE_COMPLEX_SIZE 0 -#endif - -#define OPAL_DATATYPE_BOOL_SIZE sizeof(_Bool) -#if OPAL_ALIGNMENT_WCHAR != 0 -# define OPAL_DATATYPE_WCHAR_SIZE sizeof(wchar_t) -#else -# define OPAL_DATATYPE_WCHAR_SIZE 0 -#endif - -#define OPAL_DATATYPE_UNAVAILABLE_SIZE 0 - -#endif /* OPAL_DATATYPE_ORIG_INTERNAL_H_HAS_BEEN_INCLUDED */ From 351bce9341f6768dbc2f6f15f9413432685d6a5b Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 29 Oct 2015 17:15:50 -0400 Subject: [PATCH 134/190] cuda ddt support is able to turn itself off. 
Make it support multi-GPU when ompi support multi-GPU in the future --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 2 +- opal/datatype/cuda/opal_datatype_cuda.cu | 100 ++++++++---------- opal/datatype/cuda/opal_datatype_cuda.cuh | 6 +- .../cuda/opal_datatype_cuda_internal.cuh | 10 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 23 ++-- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 13 ++- opal/datatype/opal_convertor.c | 18 +++- opal/datatype/opal_datatype_gpu.c | 20 ++-- opal/datatype/opal_datatype_gpu.h | 8 +- 9 files changed, 107 insertions(+), 93 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index a8c507f35c6..af06f6ffdef 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -114,7 +114,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, * memory into RNDV message is expensive. */ sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; mca_bml_base_btl_t* bml_endpoint_btl = mca_bml_base_btl_array_get_index(&(sendreq->req_endpoint->btl_send), 0); - if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && CUDA_DDT_WITH_RDMA) { + if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && (opal_datatype_cuda_kernel_support == 1)) { unsigned char *base; size_t buffer_size = 0; if (convertor->local_size > bml_btl->btl->btl_cuda_ddt_pipeline_size) { diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 18706fe0f78..e0ca2cd7ed3 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -11,12 +11,10 @@ ddt_cuda_list_t *cuda_free_list; -ddt_cuda_device_t *cuda_device; -ddt_cuda_stream_t* cuda_streams; +ddt_cuda_device_t *cuda_devices; +ddt_cuda_device_t *current_cuda_device; struct iovec cuda_iov[CUDA_NB_IOV]; uint32_t cuda_iov_count; -ddt_cuda_iov_dist_t* cuda_iov_dist_h[NB_STREAMS]; -ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; //uint8_t ALIGNMENT_DOUBLE, 
ALIGNMENT_FLOAT, ALIGNMENT_CHAR; @@ -177,90 +175,86 @@ void opal_cuda_output(int output_id, const char *format, ...) } } -void opal_datatype_cuda_init(void) +int32_t opal_datatype_cuda_init(void) { - uint32_t i; + uint32_t i, j; int device; cudaError res; res = cudaGetDevice(&device); if( cudaSuccess != res ) { opal_cuda_output(0, "Cannot retrieve the device being used. Drop CUDA support!\n"); - return; + return OPAL_ERROR; } cuda_free_list = init_cuda_free_list(); /* init device */ - cuda_device = (ddt_cuda_device_t *)malloc(sizeof(ddt_cuda_device_t)*1); - for (i = 0; i < 1; i++) { + cuda_devices = (ddt_cuda_device_t *)malloc(sizeof(ddt_cuda_device_t)*NB_GPUS); + for (i = 0; i < NB_GPUS; i++) { unsigned char *gpu_ptr = NULL; if (cudaMalloc((void **)(&gpu_ptr), sizeof(char)*DT_CUDA_BUFFER_SIZE) != cudaSuccess) { DT_CUDA_DEBUG( opal_cuda_output( 0, "cudaMalloc is failed in GPU %d\n", i); ); + return OPAL_ERROR; } DT_CUDA_DEBUG ( opal_cuda_output(2, "DDT engine cudaMalloc buffer %p in GPU %d\n", gpu_ptr, i);); cudaMemset(gpu_ptr, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); - cuda_device[i].gpu_buffer = gpu_ptr; + cuda_devices[i].gpu_buffer = gpu_ptr; - cuda_device[i].buffer_free_size = DT_CUDA_BUFFER_SIZE; + cuda_devices[i].buffer_free_size = DT_CUDA_BUFFER_SIZE; ddt_cuda_buffer_t *p = obj_ddt_cuda_buffer_new(); p->size = DT_CUDA_BUFFER_SIZE; p->gpu_addr = gpu_ptr; - cuda_device[i].buffer_free.head = p; - cuda_device[i].buffer_free.tail = cuda_device[i].buffer_free.head; - cuda_device[i].buffer_free.nb_elements = 1; + cuda_devices[i].buffer_free.head = p; + cuda_devices[i].buffer_free.tail = cuda_devices[i].buffer_free.head; + cuda_devices[i].buffer_free.nb_elements = 1; - cuda_device[i].buffer_used.head = NULL; - cuda_device[i].buffer_used.tail = NULL; - cuda_device[i].buffer_used_size = 0; - cuda_device[i].buffer_used.nb_elements = 0; - } + cuda_devices[i].buffer_used.head = NULL; + cuda_devices[i].buffer_used.tail = NULL; + cuda_devices[i].buffer_used_size = 0; + 
cuda_devices[i].buffer_used.nb_elements = 0; - - /* init cuda stream */ - cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamCreate(&(cuda_streams->opal_cuda_stream[i])); + /* init cuda stream */ + ddt_cuda_stream_t *cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); + for (j = 0; j < NB_STREAMS; j++) { + cudaStreamCreate(&(cuda_streams->opal_cuda_stream[j])); + cudaMallocHost((void **)(&(cuda_devices[i].cuda_iov_dist_h[j])), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_devices[i].cuda_iov_dist_d[j])), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + } + cuda_streams->current_stream_id = 0; + cuda_devices[i].cuda_streams = cuda_streams; } - cuda_streams->current_stream_id = 0; + current_cuda_device = &(cuda_devices[0]); /* init cuda_iov */ cuda_iov_count = CUDA_NB_IOV; - /* only for iov version */ - for (i = 0; i < NB_STREAMS; i++) { - cudaMallocHost((void **)(&cuda_iov_dist_h[i]), sizeof(ddt_cuda_iov_dist_t)*CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&cuda_iov_dist_d[i]), sizeof(ddt_cuda_iov_dist_t)*CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - // /* init size for double, float, char */ // ALIGNMENT_DOUBLE = sizeof(double); // ALIGNMENT_FLOAT = sizeof(float); // ALIGNMENT_CHAR = sizeof(char); cudaDeviceSynchronize(); + return OPAL_SUCCESS; } -void opal_datatype_cuda_fini(void) +int32_t opal_datatype_cuda_fini(void) { - uint32_t i; + uint32_t i, j; - /* destory cuda stream */ - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamDestroy(cuda_streams->opal_cuda_stream[i]); - } - free(cuda_streams); - - /* only for iov version */ - for (i = 0; i < NB_STREAMS; i++) { - cudaFreeHost(cuda_iov_dist_h[i]); - cudaFree(cuda_iov_dist_d[i]); + for (i = 0; i < NB_GPUS; i++) { + /* free gpu buffer */ + cudaFree(cuda_devices[i].gpu_buffer); + /* destory cuda 
stream and iov*/ + for (j = 0; j < NB_STREAMS; j++) { + cudaStreamDestroy(cuda_devices[i].cuda_streams->opal_cuda_stream[j]); + cudaFreeHost(cuda_devices[i].cuda_iov_dist_h[j]); + cudaFree(cuda_devices[i].cuda_iov_dist_d[j]); + } + free(cuda_devices[i].cuda_streams); } -} - -void opal_cuda_sync_device(void) -{ - cudaDeviceSynchronize(); + current_cuda_device = NULL; + return OPAL_SUCCESS; } int32_t opal_cuda_is_gpu_buffer(const void *ptr) @@ -283,7 +277,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) { int dev_id; cudaGetDevice(&dev_id); - ddt_cuda_device_t *device = &cuda_device[gpu_id]; + ddt_cuda_device_t *device = &cuda_devices[gpu_id]; if (device->buffer_free_size < size) { DT_CUDA_DEBUG( opal_cuda_output( 0, "No GPU buffer at dev_id %d.\n", dev_id); ); return NULL; @@ -320,7 +314,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) { - ddt_cuda_device_t *device = &cuda_device[gpu_id]; + ddt_cuda_device_t *device = &cuda_devices[gpu_id]; ddt_cuda_buffer_t *ptr = device->buffer_used.head; /* Find the holder of this GPU allocation */ @@ -352,13 +346,13 @@ void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) { - cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); + cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->opal_cuda_stream[0]); } void opal_cuda_d2dcpy(void* dst, const void* src, size_t count) { - cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(current_cuda_device->cuda_streams->opal_cuda_stream[0]); } void opal_dump_cuda_list(ddt_cuda_list_t *list) diff --git 
a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index d71d349d46b..5cc2a77c6ef 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -4,9 +4,9 @@ extern "C" { -void opal_datatype_cuda_init(void); +int32_t opal_datatype_cuda_init(void); -void opal_datatype_cuda_fini(void); +int32_t opal_datatype_cuda_fini(void); int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, @@ -83,8 +83,6 @@ void unpack_predefined_data_cuda( dt_elem_desc_t* ELEM, unsigned char** DESTINATION, size_t* SPACE ); -void opal_cuda_sync_device(void); - int32_t opal_cuda_is_gpu_buffer(const void *ptr); void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index fe49449f976..3977da4125b 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -19,6 +19,7 @@ #define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 +#define NB_GPUS 1 #define IOV_ARRAY_SIZE 1 #define DT_CUDA_BUFFER_SIZE 1024*1024*200 #define DT_CUDA_FREE_LIST_SIZE 50 @@ -72,15 +73,16 @@ typedef struct { ddt_cuda_list_t buffer_used; size_t buffer_free_size; size_t buffer_used_size; + ddt_cuda_stream_t *cuda_streams; + ddt_cuda_iov_dist_t* cuda_iov_dist_h[NB_STREAMS]; + ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; } ddt_cuda_device_t; extern ddt_cuda_list_t *cuda_free_list; -extern ddt_cuda_device_t *cuda_device; -extern ddt_cuda_stream_t* cuda_streams; +extern ddt_cuda_device_t *cuda_devices; +extern ddt_cuda_device_t *current_cuda_device; extern struct iovec cuda_iov[CUDA_NB_IOV]; extern uint32_t cuda_iov_count; -extern ddt_cuda_iov_dist_t* cuda_iov_dist_h[NB_STREAMS]; -extern ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; //extern uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; diff --git 
a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 0a51f66d877..2c674bbea6d 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -27,6 +27,8 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver uint8_t free_required; uint32_t count_desc_tmp; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; long total_time; @@ -227,6 +229,8 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert uint8_t transfer_required; uint8_t free_required; uint32_t count_desc_tmp; + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -478,6 +482,8 @@ void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, int i, pipeline_blocks; uint32_t _copy_loops_per_pipeline; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; long total_time; @@ -654,6 +660,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor uint8_t alignment, orig_alignment; // int32_t orig_stack_index; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; @@ -740,8 +747,8 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor while (cuda_iov_count > 0) { nb_blocks_used = 0; - cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; - cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; + cuda_iov_dist_h_current = current_cuda_device->cuda_iov_dist_h[cuda_streams->current_stream_id]; + 
cuda_iov_dist_d_current = current_cuda_device->cuda_iov_dist_d[cuda_streams->current_stream_id]; source_base = (unsigned char*)cuda_iov[0].iov_base; #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -769,8 +776,6 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } else { alignment = ALIGNMENT_CHAR; } - - // alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; @@ -866,13 +871,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor move_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); #endif - // float *vtmp = (float *)iov[0].iov_base; - // DT_CUDA_DEBUG ( opal_cuda_output(0, "packed iov buffer, total packed %d\n", total_packed); ); - // for (uint32_t i = 0; i < total_packed/sizeof(float); i++) { - // printf(" %1.f ", *vtmp); - // vtmp ++; - // } - // printf("\n"); + iov[0].iov_len = total_packed; *max_data = total_packed; *out_size = 1; @@ -908,6 +907,8 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, unsigned char* _source = (*SOURCE) + _elem->disp; uint32_t nb_blocks, tasks_per_block, thread_per_block; unsigned char* _destination = *(DESTINATION); + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; if( (_copy_count * _copy_blength) > *(SPACE) ) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 696a2c12694..f6251fd77f7 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -24,6 +24,8 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon uint32_t iov_count; uint8_t free_required; uint32_t count_desc_tmp; + + ddt_cuda_stream_t 
*cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end; @@ -197,6 +199,8 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv uint32_t iov_count; uint8_t free_required; uint32_t count_desc_tmp; + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end; @@ -370,6 +374,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert uint8_t alignment, orig_alignment; // int32_t orig_stack_index; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; @@ -447,8 +452,8 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert while (cuda_iov_count > 0) { nb_blocks_used = 0; - cuda_iov_dist_h_current = cuda_iov_dist_h[cuda_streams->current_stream_id]; - cuda_iov_dist_d_current = cuda_iov_dist_d[cuda_streams->current_stream_id]; + cuda_iov_dist_h_current = current_cuda_device->cuda_iov_dist_h[cuda_streams->current_stream_id]; + cuda_iov_dist_d_current = current_cuda_device->cuda_iov_dist_d[cuda_streams->current_stream_id]; destination_base = (unsigned char*)cuda_iov[0].iov_base; #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -736,7 +741,9 @@ void unpack_predefined_data_cuda( dt_elem_desc_t* ELEM, ddt_elem_desc_t* _elem = &((ELEM)->elem); unsigned char* _source = (*SOURCE); uint32_t nb_blocks, tasks_per_block, thread_per_block; - unsigned char* _destination = *(DESTINATION) + _elem->disp;; + unsigned char* _destination = *(DESTINATION) + _elem->disp; + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; if( (_copy_count * _copy_blength) > *(SPACE) ) { diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index d8f6fbe0687..82bbf241685 
100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -580,7 +580,12 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { convertor->fAdvance = opal_unpack_homogeneous_contig_checksum; } else { - convertor->fAdvance = opal_generic_simple_unpack_checksum; + if ((convertor->flags & CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { + convertor->fAdvance = opal_generic_simple_unpack_cuda_checksum; + convertor->gpu_buffer_ptr = NULL; + } else { + convertor->fAdvance = opal_generic_simple_unpack_checksum; + } } } else { #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT @@ -591,7 +596,7 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { convertor->fAdvance = opal_unpack_homogeneous_contig; } else { - if ((convertor->flags & CONVERTOR_CUDA) && OPAL_DATATYPE_CUDA_KERNEL) { + if ((convertor->flags & CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { convertor->fAdvance = opal_generic_simple_unpack_cuda; convertor->gpu_buffer_ptr = NULL; } else { @@ -628,7 +633,12 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, else convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps_checksum; } else { - convertor->fAdvance = opal_generic_simple_pack_checksum; + if ((convertor->flags & CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { + convertor->fAdvance = opal_generic_simple_pack_cuda_checksum; + convertor->gpu_buffer_ptr = NULL; + } else { + convertor->fAdvance = opal_generic_simple_pack_checksum; + } } } else { if( datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { @@ -638,7 +648,7 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, else convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps; } else { - if ((convertor->flags & CONVERTOR_CUDA) && OPAL_DATATYPE_CUDA_KERNEL ) { + if ((convertor->flags & 
CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { convertor->fAdvance = opal_generic_simple_pack_cuda; convertor->gpu_buffer_ptr = NULL; } else { diff --git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c index 4e516766737..f21b22c72d2 100644 --- a/opal/datatype/opal_datatype_gpu.c +++ b/opal/datatype/opal_datatype_gpu.c @@ -40,12 +40,14 @@ #include "opal/datatype/opal_datatype_gpu.h" +int32_t opal_datatype_cuda_kernel_support = 0; + static void *opal_datatype_cuda_handle = NULL; static char *opal_datatype_cuda_lib = NULL; -void (*opal_datatype_cuda_init_p)(void) = NULL; +int32_t (*opal_datatype_cuda_init_p)(void) = NULL; -void (*opal_datatype_cuda_fini_p)(void) = NULL; +int32_t (*opal_datatype_cuda_fini_p)(void) = NULL; int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, @@ -86,8 +88,6 @@ void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, unsigned char** DESTINATION, size_t* SPACE ) = NULL; -void (*opal_cuda_sync_device_p)(void) = NULL; - void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id) = NULL; void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id) = NULL; @@ -131,14 +131,16 @@ int32_t opal_datatype_gpu_init(void) OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, pack_contiguous_loop_cuda ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, unpack_contiguous_loop_cuda ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, pack_predefined_data_cuda ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_sync_device ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_free_gpu_buffer ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_malloc_gpu_buffer ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_d2dcpy_async ); OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( 
opal_datatype_cuda_handle, opal_cuda_d2dcpy ); - (*opal_datatype_cuda_init_p)(); - opal_output( 0, "cuda init done\n"); + if (OPAL_SUCCESS != (*opal_datatype_cuda_init_p)()) { + return OPAL_ERROR; + } + opal_datatype_cuda_kernel_support = 1; + opal_output( 0, "opal_datatype_cuda_kernel_support init done\n"); } return OPAL_SUCCESS; } @@ -157,7 +159,6 @@ int32_t opal_datatype_gpu_fini(void) pack_contiguous_loop_cuda_p = NULL; unpack_contiguous_loop_cuda_p = NULL; pack_predefined_data_cuda_p = NULL; - opal_cuda_sync_device_p = NULL; opal_cuda_free_gpu_buffer_p = NULL; opal_cuda_malloc_gpu_buffer_p = NULL; opal_cuda_d2dcpy_async_p = NULL; @@ -169,7 +170,8 @@ int32_t opal_datatype_gpu_fini(void) if( NULL != opal_datatype_cuda_lib ) free(opal_datatype_cuda_lib); opal_datatype_cuda_lib = NULL; - opal_output( 0, "cuda fini done\n"); + opal_datatype_cuda_kernel_support = 0; + opal_output( 0, "opal_datatype_cuda_kernel_support fini done\n"); } return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h index df42d68b6fc..340fbf24da7 100644 --- a/opal/datatype/opal_datatype_gpu.h +++ b/opal/datatype/opal_datatype_gpu.h @@ -3,12 +3,14 @@ #define OPAL_DATATYPE_CUDA_KERNEL 1 +extern int32_t opal_datatype_cuda_kernel_support; + int32_t opal_datatype_gpu_init(void); int32_t opal_datatype_gpu_fini(void); -extern void (*opal_datatype_cuda_init_p)(void); +extern int32_t (*opal_datatype_cuda_init_p)(void); -extern void (*opal_datatype_cuda_fini_p)(void); +extern int32_t (*opal_datatype_cuda_fini_p)(void); extern int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, @@ -47,8 +49,6 @@ extern void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, unsigned char** SOURCE, unsigned char** DESTINATION, size_t* SPACE ); - -extern void (*opal_cuda_sync_device_p)(void); extern void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); From aa571161535c2abe0d6fcfa0aef3cc2534cb3e13 Mon Sep 17 
00:00:00 2001 From: eddy16112 Date: Fri, 30 Oct 2015 18:42:09 -0400 Subject: [PATCH 135/190] fix a cuda stream bug for iov, remove some stream syncs --- opal/datatype/cuda/opal_datatype_cuda.cu | 35 ++++++-- opal/datatype/cuda/opal_datatype_cuda.cuh | 4 +- .../cuda/opal_datatype_cuda_internal.cuh | 20 ++++- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 85 ++++++++++--------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 76 +++++++++++------ test/datatype/ddt_benchmark.c | 6 +- 6 files changed, 146 insertions(+), 80 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index e0ca2cd7ed3..3c5208d7122 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -215,14 +215,21 @@ int32_t opal_datatype_cuda_init(void) cuda_devices[i].buffer_used.nb_elements = 0; /* init cuda stream */ - ddt_cuda_stream_t *cuda_streams = (ddt_cuda_stream_t*)malloc(sizeof(ddt_cuda_stream_t)); + ddt_cuda_stream_t *cuda_streams = (ddt_cuda_stream_t *)malloc(sizeof(ddt_cuda_stream_t)); + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; for (j = 0; j < NB_STREAMS; j++) { cudaStreamCreate(&(cuda_streams->opal_cuda_stream[j])); - cudaMallocHost((void **)(&(cuda_devices[i].cuda_iov_dist_h[j])), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_devices[i].cuda_iov_dist_d[j])), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_h)), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_d)), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cuda_iov_pipeline_block->cuda_stream = 
&(cuda_streams->opal_cuda_stream[0]); + cuda_iov_pipeline_block->cuda_stream_id = 0; + cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); + cuda_devices[i].cuda_iov_pipeline_block[j] = cuda_iov_pipeline_block; } cuda_streams->current_stream_id = 0; cuda_devices[i].cuda_streams = cuda_streams; + cudaEventCreate(&(cuda_devices[i].memcpy_event), cudaEventDisableTiming); } current_cuda_device = &(cuda_devices[0]); @@ -246,12 +253,23 @@ int32_t opal_datatype_cuda_fini(void) /* free gpu buffer */ cudaFree(cuda_devices[i].gpu_buffer); /* destory cuda stream and iov*/ + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; for (j = 0; j < NB_STREAMS; j++) { cudaStreamDestroy(cuda_devices[i].cuda_streams->opal_cuda_stream[j]); - cudaFreeHost(cuda_devices[i].cuda_iov_dist_h[j]); - cudaFree(cuda_devices[i].cuda_iov_dist_d[j]); + cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block[j]; + if (cuda_iov_pipeline_block != NULL) { + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_h); + cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_d); + cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); + cuda_iov_pipeline_block->cuda_stream = NULL; + cuda_iov_pipeline_block->cuda_stream_id = -1; + free(cuda_iov_pipeline_block); + cuda_iov_pipeline_block = NULL; + } } free(cuda_devices[i].cuda_streams); + cuda_devices[i].cuda_streams = NULL; + cudaEventDestroy(cuda_devices[i].memcpy_event); } current_cuda_device = NULL; return OPAL_SUCCESS; @@ -344,6 +362,13 @@ void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) DT_CUDA_DEBUG( opal_cuda_output( 2, "Free GPU buffer %p.\n", addr); ); } +void opal_cuda_check_error(cudaError_t err) +{ + if (err != cudaSuccess) { + DT_CUDA_DEBUG( opal_cuda_output(0, "CUDA calls error %s\n", cudaGetErrorString(err)); ); + } +} + void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) { cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, 
current_cuda_device->cuda_streams->opal_cuda_stream[0]); diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 5cc2a77c6ef..8c228fc3404 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -35,7 +35,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, unsigned char** DESTINATION, size_t* SPACE ); -void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, +void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, @@ -59,7 +59,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, unsigned char** DESTINATION, size_t* SPACE ); -void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, +void unpack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 3977da4125b..506a5fe22cd 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -13,10 +13,12 @@ #define OPAL_DATATYPE_CUDA_DEBUG 1 //#define OPAL_DATATYPE_CUDA_KERNEL_TIME #define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 2 -//#define OPAL_DATATYPE_CUDA_TIMING -#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D 0 +#define OPAL_DATATYPE_CUDA_TIMING +#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H 0 #define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 #define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 +#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL 1 + #define NB_GPUS 1 @@ -53,6 +55,14 @@ typedef struct { uint8_t element_alignment; } ddt_cuda_iov_dist_t; +typedef struct { + ddt_cuda_iov_dist_t* cuda_iov_dist_h; + ddt_cuda_iov_dist_t* cuda_iov_dist_d; + cudaStream_t *cuda_stream; + int32_t cuda_stream_id; + cudaEvent_t cuda_event; +} ddt_cuda_iov_pipeline_block_t; + typedef struct 
ddt_cuda_buffer{ unsigned char* gpu_addr; size_t size; @@ -74,8 +84,8 @@ typedef struct { size_t buffer_free_size; size_t buffer_used_size; ddt_cuda_stream_t *cuda_streams; - ddt_cuda_iov_dist_t* cuda_iov_dist_h[NB_STREAMS]; - ddt_cuda_iov_dist_t* cuda_iov_dist_d[NB_STREAMS]; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block[NB_STREAMS]; + cudaEvent_t memcpy_event; } ddt_cuda_device_t; extern ddt_cuda_list_t *cuda_free_list; @@ -120,6 +130,8 @@ __global__ void opal_empty_kernel_noargs(); void opal_cuda_output(int output_id, const char *format, ...); +void opal_cuda_check_error(cudaError_t err); + #if defined (OPAL_DATATYPE_CUDA_DEBUG) #define DT_CUDA_DEBUG( INST ) if (OPAL_DATATYPE_CUDA_DEBUG) { INST } #else diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 2c674bbea6d..dccf2803c6a 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -77,7 +77,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver } transfer_required = 0; } else { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { pConvertor->gpu_buffer_ptr = NULL; transfer_required = 0; free_required = 0; @@ -148,8 +148,8 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { - pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H) { + pack_contiguous_loop_cuda_memcpy2d_d2h(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { 
pack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE) { @@ -280,7 +280,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } transfer_required = 0; } else { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { pConvertor->gpu_buffer_ptr = NULL; transfer_required = 0; free_required = 0; @@ -304,7 +304,6 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert iov_ptr = pConvertor->gpu_buffer_ptr; } } - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ @@ -350,8 +349,8 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { - pack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H) { + pack_contiguous_loop_cuda_memcpy2d_d2h(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { pack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE) { @@ -425,6 +424,8 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t num_blocks, tasks_per_block; unsigned char* _destination = *(DESTINATION); + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; long total_time; @@ -442,13 +443,11 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, 
// tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; // printf("extent %ld, size %ld, count %ld\n", _loop->extent, _end_loop->size, _copy_loops); - cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); -// pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); -// int i; -// for (i = 0; i < 4; i++) { -// opal_empty_kernel<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); - // pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); -// } +#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL + cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); +#else + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -457,7 +456,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaDeviceSynchronize(); + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -466,6 +465,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif } +/* this function will not be used */ void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, @@ -537,9 +537,9 @@ void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG( 
opal_cuda_output( 2, "[Timing]: vector packing in %ld microsec\n", total_time ); ); #endif -} +} -void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, +void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, unsigned char** DESTINATION, @@ -551,6 +551,7 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, uint32_t _copy_loops = *(COUNT); uint32_t num_blocks, tasks_per_block; unsigned char* _destination = *(DESTINATION); + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -566,7 +567,7 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, GET_TIME(start); #endif - cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost); + cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->opal_cuda_stream[0]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -575,7 +576,7 @@ void pack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif -// cudaDeviceSynchronize(); + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -597,6 +598,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, uint32_t num_blocks, tasks_per_block; unsigned char* _destination = *(DESTINATION); unsigned char* _destination_dev; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -612,16 +614,17 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - // tasks_per_block = THREAD_PER_BLOCK * 
TASK_PER_THREAD; - // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; - // cudaHostRegister(_destination, _copy_loops*_end_loop->size, cudaHostRegisterMapped); + cudaError_t reg_rv = cudaHostGetDevicePointer((void **)&_destination_dev, (void *) _destination, 0); if (reg_rv != cudaSuccess) { const char *cuda_err = cudaGetErrorString(reg_rv); printf("can not get dev mem, %s\n", cuda_err); } - //cudaMemcpy2D(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); +#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL + cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); +#else + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); +#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -630,8 +633,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaDeviceSynchronize(); - // cudaHostUnregister(_destination); + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -659,10 +661,13 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor // dt_stack_t* pStack; uint8_t alignment, orig_alignment; // int32_t orig_stack_index; - + cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t 
*cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -721,8 +726,6 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; destination_base = destination; - - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start_total); @@ -747,8 +750,12 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor while (cuda_iov_count > 0) { nb_blocks_used = 0; - cuda_iov_dist_h_current = current_cuda_device->cuda_iov_dist_h[cuda_streams->current_stream_id]; - cuda_iov_dist_d_current = current_cuda_device->cuda_iov_dist_d[cuda_streams->current_stream_id]; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); source_base = (unsigned char*)cuda_iov[0].iov_base; #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -786,9 +793,9 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block;// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_elements = count_desc - j*thread_per_block; //(thread_per_block - ((j+1)*thread_per_block 
- count_desc));// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = count_desc - j*thread_per_block; } #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); @@ -824,13 +831,15 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); - opal_generic_simple_pack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); - cuda_streams->current_stream_id ++; - cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_pack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; /* buffer is full */ if (buffer_isfull) { diff --git 
a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index f6251fd77f7..a8ba035ef78 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -62,7 +62,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon iov_ptr = (unsigned char*)iov[iov_count].iov_base; free_required = 0; } else { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { iov_ptr = (unsigned char*)iov[iov_count].iov_base; pConvertor->gpu_buffer_ptr = NULL; free_required = 0; @@ -81,6 +81,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); #endif iov_len_local = iov[iov_count].iov_len; + cudaDeviceSynchronize(); if( 0 != pConvertor->partial_length ) { /* not support yet */ } @@ -134,8 +135,8 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { - unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H) { + unpack_contiguous_loop_cuda_memcpy2d_d2h(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { unpack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); } else { @@ -237,7 +238,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv iov_ptr = (unsigned char*)iov[iov_count].iov_base; free_required = 0; } else { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D || 
OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { iov_ptr = (unsigned char*)iov[iov_count].iov_base; pConvertor->gpu_buffer_ptr = NULL; free_required = 0; @@ -255,7 +256,6 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); ); #endif - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); iov_len_local = iov[iov_count].iov_len; if( 0 != pConvertor->partial_length ) { /* not support yet */ @@ -304,8 +304,8 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D) { - unpack_contiguous_loop_cuda_memcpy2d(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H) { + unpack_contiguous_loop_cuda_memcpy2d_d2h(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { unpack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); } else { @@ -373,17 +373,18 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert // dt_stack_t* pStack; uint8_t alignment, orig_alignment; // int32_t orig_stack_index; - + cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; long total_time, 
move_time; #endif - - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start_total); @@ -423,6 +424,9 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert move_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); #endif + +// cuda_err = cudaEventRecord(current_cuda_device->memcpy_event, current_cuda_device->cuda_streams->opal_cuda_stream[0]); +// opal_cuda_check_error(cuda_err); #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -452,8 +456,12 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert while (cuda_iov_count > 0) { nb_blocks_used = 0; - cuda_iov_dist_h_current = current_cuda_device->cuda_iov_dist_h[cuda_streams->current_stream_id]; - cuda_iov_dist_d_current = current_cuda_device->cuda_iov_dist_d[cuda_streams->current_stream_id]; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); destination_base = (unsigned char*)cuda_iov[0].iov_base; #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -529,14 +537,16 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d\n", source_base, total_time, cuda_streams->current_stream_id); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted 
to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); - opal_generic_simple_unpack_cuda_iov_kernel<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); - cuda_streams->current_stream_id ++; - cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; - + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + /* buffer is full */ if (buffer_isfull) { size_t total_converted_tmp = total_converted; @@ -560,7 +570,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #endif } - // cudaDeviceSynchronize(); + for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } @@ -599,6 +609,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t _copy_loops = *(COUNT); uint32_t num_blocks, tasks_per_block; unsigned char* _source = *(SOURCE); + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -615,8 +626,11 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #endif // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; 
-// unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); - cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); +#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL + cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); +#else + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -625,7 +639,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaDeviceSynchronize(); + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -645,6 +659,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, uint32_t _copy_loops = *(COUNT); uint32_t num_blocks, tasks_per_block; unsigned char* _source = *(SOURCE); + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -659,7 +674,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - cudaMemcpy2D(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice); + cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[0]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) 
*(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -668,7 +683,8 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -689,6 +705,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, uint32_t num_blocks, tasks_per_block; unsigned char* _source = *(SOURCE); unsigned char* _source_dev; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -705,14 +722,17 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, #endif // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; -// cudaHostRegister(_source, _copy_loops*_end_loop->size, cudaHostRegisterMapped); + cudaError_t reg_rv = cudaHostGetDevicePointer((void **)&_source_dev, (void *) _source, 0); if (reg_rv != cudaSuccess) { const char *cuda_err = cudaGetErrorString(reg_rv); printf("can not get dev mem, %s\n", cuda_err); } - //cudaMemcpy2D(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); +#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL + cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); +#else + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); +#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if 
!defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -721,7 +741,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaDeviceSynchronize(); + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); // cudaHostUnregister(_source); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 92bdf644d4d..45440dc2c04 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1211,11 +1211,11 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 500; mat_size <= 500; mat_size +=500) { + for (mat_size = 6000; mat_size <= 6000; mat_size +=500) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 1; i <= 1; i++) { + for (i = 1; i <= 2; i++) { local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); } } @@ -1312,7 +1312,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 4; i++) { - vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } From 5eb7bf1f88926fc6d309dc141911f28be70e6956 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Wed, 4 Nov 2015 12:05:59 -0800 Subject: [PATCH 136/190] in openib, disable rdma for non-contiguous gpu data --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 53 ++++++++++++++++++---- opal/mca/btl/btl.h | 3 +- opal/mca/btl/openib/btl_openib_mca.c | 1 + opal/mca/btl/smcuda/btl_smcuda_component.c | 1 + 4 files changed, 48 insertions(+), 10 deletions(-) diff --git 
a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index af06f6ffdef..ec11680486f 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -54,6 +54,8 @@ int mca_pml_ob1_rdma_cuda_btl_register_data( uint32_t num_btls_used, struct opal_convertor_t *pack_convertor, uint8_t pack_required, int32_t gpu_device); +size_t mca_pml_ob1_rdma_cuda_avail(mca_bml_base_endpoint_t* bml_endpoint); + int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -69,17 +71,17 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, int rc; int32_t local_device = 0; #if OPAL_CUDA_SUPPORT_41 -#if OPAL_CUDA_GDR_SUPPORT - /* With some BTLs, switch to RNDV from RGET at large messages */ - if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) && - (sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) { - return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); - } -#endif /* OPAL_CUDA_GDR_SUPPORT */ sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) { +#if OPAL_CUDA_GDR_SUPPORT + /* With some BTLs, switch to RNDV from RGET at large messages */ + if ((sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) { + sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; + return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); + } +#endif /* OPAL_CUDA_GDR_SUPPORT */ unsigned char *base; opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base ); /* Set flag back */ @@ -113,8 +115,9 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, /* Do not send anything with first rendezvous message as copying GPU * memory into RNDV 
message is expensive. */ sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; - mca_bml_base_btl_t* bml_endpoint_btl = mca_bml_base_btl_array_get_index(&(sendreq->req_endpoint->btl_send), 0); - if ((bml_endpoint_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) && (opal_datatype_cuda_kernel_support == 1)) { + if ((mca_pml_ob1_rdma_cuda_avail(sendreq->req_endpoint) != 0) && + (opal_datatype_cuda_kernel_support == 1) && + (bml_btl->btl->btl_cuda_ddt_allow_rdma == 1)) { unsigned char *base; size_t buffer_size = 0; if (convertor->local_size > bml_btl->btl->btl_cuda_ddt_pipeline_size) { @@ -236,6 +239,38 @@ int mca_pml_ob1_rdma_cuda_btl_register_data( return 0; } +size_t mca_pml_ob1_rdma_cuda_avail(mca_bml_base_endpoint_t* bml_endpoint) +{ + int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); + double weight_total = 0; + int num_btls_used = 0, n; + + /* shortcut when there are no rdma capable btls */ + if(num_btls == 0) { + return 0; + } + + /* check to see if memory is registered */ + for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; + n++) { + mca_bml_base_btl_t* bml_btl = + mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n); + + if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) { + weight_total += bml_btl->btl_weight; + num_btls_used++; + } + } + + /* if we don't use leave_pinned and all BTLs that already have this memory + * * registered amount to less then half of available bandwidth - fall back to + * * pipeline protocol */ + if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5)) + return 0; + + return num_btls_used; +} + int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl) { diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 1a38ec4c331..7e693c62b84 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -1182,8 +1182,9 @@ struct mca_btl_base_module_t { #endif /* OPAL_CUDA_GDR_SUPPORT */ #if OPAL_CUDA_SUPPORT size_t btl_cuda_max_send_size; /**< set 
if CUDA max send_size is different from host max send size */ + int32_t btl_cuda_ddt_allow_rdma; size_t btl_cuda_ddt_pipeline_size; - int btl_cuda_ddt_pipeline_depth; + int32_t btl_cuda_ddt_pipeline_depth; #endif /* OPAL_CUDA_SUPPORT */ }; typedef struct mca_btl_base_module_t mca_btl_base_module_t; diff --git a/opal/mca/btl/openib/btl_openib_mca.c b/opal/mca/btl/openib/btl_openib_mca.c index 07dcdd07c76..6a0d4ef25cf 100644 --- a/opal/mca/btl/openib/btl_openib_mca.c +++ b/opal/mca/btl/openib/btl_openib_mca.c @@ -648,6 +648,7 @@ int btl_openib_register_mca_params(void) mca_btl_openib_module.super.btl_cuda_rdma_limit = 0; /* Unused */ } #endif /* OPAL_CUDA_GDR_SUPPORT */ + mca_btl_openib_module.super.btl_cuda_ddt_allow_rdma = 0; #endif /* OPAL_CUDA_SUPPORT */ CHECK(mca_btl_base_param_register( &mca_btl_openib_component.super.btl_version, diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index c7bdb40c028..9c1f5235d1e 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -176,6 +176,7 @@ static int smcuda_register(void) mca_btl_smcuda.super.btl_cuda_ddt_pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; printf("pipeline size %lu\n", mca_btl_smcuda.super.btl_cuda_ddt_pipeline_size); mca_btl_smcuda.super.btl_cuda_ddt_pipeline_depth = 4; + mca_btl_smcuda.super.btl_cuda_ddt_allow_rdma = 1; mca_btl_smcuda.super.btl_eager_limit = 4*1024; mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024; mca_btl_smcuda.super.btl_max_send_size = 32*1024; From aa24f4c4aaa6903642a43d9818866f346b74ec7e Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 4 Nov 2015 17:26:59 -0500 Subject: [PATCH 137/190] move ddt kernel support function pointer into opal_datatype_cuda.c --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 4 +- opal/datatype/Makefile.am | 4 +- opal/datatype/opal_convertor.c | 11 -- opal/datatype/opal_datatype_cuda.c | 167 +++++++++++++++++++ opal/datatype/opal_datatype_cuda.h | 
26 +++ opal/datatype/opal_datatype_gpu.c | 177 --------------------- opal/datatype/opal_datatype_gpu.h | 60 ------- opal/datatype/opal_datatype_module.c | 10 +- opal/datatype/opal_datatype_pack.c | 21 +-- opal/datatype/opal_datatype_unpack.c | 14 +- opal/mca/btl/smcuda/btl_smcuda.c | 9 +- opal/mca/btl/smcuda/btl_smcuda_component.c | 12 +- opal/mca/common/cuda/common_cuda.c | 13 +- opal/mca/common/cuda/common_cuda.h | 1 - 14 files changed, 224 insertions(+), 305 deletions(-) delete mode 100644 opal/datatype/opal_datatype_gpu.c delete mode 100644 opal/datatype/opal_datatype_gpu.h diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index ec11680486f..68e97d77c4e 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -37,7 +37,7 @@ #include "ompi/mca/bml/base/base.h" #include "ompi/memchecker.h" -#include "opal/datatype/opal_datatype_gpu.h" +#include "opal/datatype/opal_datatype_cuda.h" #include "opal/mca/common/cuda/common_cuda.h" #include "opal/mca/btl/smcuda/btl_smcuda.h" @@ -125,7 +125,7 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, } else { buffer_size = convertor->local_size; } - base = opal_cuda_malloc_gpu_buffer_p(buffer_size, 0); + base = opal_cuda_malloc_gpu_buffer(buffer_size, 0); convertor->gpu_buffer_ptr = base; convertor->gpu_buffer_size = buffer_size; sendreq->req_send.req_bytes_packed = convertor->local_size; diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am index 7683c2e8786..ca64cf29237 100644 --- a/opal/datatype/Makefile.am +++ b/opal/datatype/Makefile.am @@ -32,8 +32,7 @@ headers = \ opal_datatype_memcpy.h \ opal_datatype_pack.h \ opal_datatype_prototypes.h \ - opal_datatype_unpack.h \ - opal_datatype_gpu.h + opal_datatype_unpack.h noinst_LTLIBRARIES = \ @@ -61,7 +60,6 @@ libdatatype_la_SOURCES = \ opal_datatype_get_count.c \ opal_datatype_module.c \ opal_datatype_optimize.c \ - opal_datatype_gpu.c \ opal_datatype_pack.c \ 
opal_datatype_position.c \ opal_datatype_resize.c \ diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 82bbf241685..7fed801766d 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -39,7 +39,6 @@ #include "opal/datatype/opal_convertor_internal.h" #if OPAL_CUDA_SUPPORT #include "opal/datatype/opal_datatype_cuda.h" -#include "opal/datatype/opal_datatype_gpu.h" #define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \ CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) ) #endif @@ -562,11 +561,6 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, convertor->flags |= CONVERTOR_RECV; #if OPAL_CUDA_SUPPORT mca_cuda_convertor_init(convertor, pUserBuf); -#if OPAL_DATATYPE_CUDA_KERNEL - if (opal_datatype_gpu_init() != OPAL_SUCCESS) { - opal_datatype_gpu_fini(); - } -#endif /* OPAL_DATATYPE_CUDA_KERNEL */ #endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -616,11 +610,6 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, convertor->flags |= CONVERTOR_SEND; #if OPAL_CUDA_SUPPORT mca_cuda_convertor_init(convertor, pUserBuf); -#if OPAL_DATATYPE_CUDA_KERNEL - if (opal_datatype_gpu_init() != OPAL_SUCCESS) { - opal_datatype_gpu_fini(); - } -#endif /* OPAL_DATATYPE_CUDA_KERNEL */ #endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index e09618e747b..23cdb47acd6 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -12,11 +12,13 @@ #include #include #include +#include #include "opal/align.h" #include "opal/util/output.h" #include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/installdirs/installdirs.h" static bool initialized = false; int opal_cuda_verbose = 0; @@ -26,6 +28,24 @@ static void 
opal_cuda_support_init(void); static int (*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL; static opal_common_cuda_function_table_t ftable; +/* folowing variables are used for cuda ddt kernel support */ +static opal_datatype_cuda_kernel_function_table_t cuda_kernel_table; +static void *opal_datatype_cuda_kernel_handle = NULL; +static char *opal_datatype_cuda_kernel_lib = NULL; +int32_t opal_datatype_cuda_kernel_support = 0; + +#define OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN(handle, fname) \ + do { \ + char* _error; \ + *(void **)(&(cuda_kernel_table.fname ## _p)) = dlsym((handle), # fname); \ + if(NULL != (_error = dlerror()) ) { \ + opal_output(0, "Finding %s error: %s\n", # fname, _error); \ + cuda_kernel_table.fname ## _p = NULL; \ + return OPAL_ERROR; \ + } \ + } while (0) + + /* This function allows the common cuda code to register an * initialization function that gets called the first time an attempt * is made to send or receive a GPU pointer. 
This allows us to delay @@ -60,6 +80,10 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) { convertor->flags |= CONVERTOR_CUDA; } + + if (OPAL_SUCCESS != opal_datatype_cuda_kernel_support_init()) { + opal_datatype_cuda_kernel_support_fini(); + } } /* Checks the type of pointer @@ -189,3 +213,146 @@ void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream convertor->flags |= CONVERTOR_CUDA_ASYNC; convertor->stream = stream; } + +/* following functions are used for cuda ddt kernel support */ +int32_t opal_datatype_cuda_kernel_support_init(void) +{ + if (opal_datatype_cuda_kernel_handle == NULL) { + + /* If the library name was initialized but the load failed, we have another chance to change it */ + if( NULL != opal_datatype_cuda_kernel_lib ) + free(opal_datatype_cuda_kernel_lib); + asprintf(&opal_datatype_cuda_kernel_lib, "%s/%s", opal_install_dirs.libdir, "opal_datatype_cuda.so"); + + opal_datatype_cuda_kernel_handle = dlopen(opal_datatype_cuda_kernel_lib , RTLD_LAZY); + if (!opal_datatype_cuda_kernel_handle) { + opal_output( 0, "Failed to load %s library: error %s\n", opal_datatype_cuda_kernel_lib, dlerror()); + opal_datatype_cuda_kernel_handle = NULL; + return OPAL_ERROR; + } + + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_init ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_fini ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_generic_simple_pack_function_cuda_iov ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_generic_simple_unpack_function_cuda_iov ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_generic_simple_pack_function_cuda_vector ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( 
opal_datatype_cuda_kernel_handle, opal_generic_simple_unpack_function_cuda_vector ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_free_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_malloc_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_d2dcpy_async ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_d2dcpy ); + + if (OPAL_SUCCESS != cuda_kernel_table.opal_datatype_cuda_init_p()) { + return OPAL_ERROR; + } + opal_datatype_cuda_kernel_support = 1; + opal_output( 0, "opal_datatype_cuda_kernel_support_init done\n"); + } + return OPAL_SUCCESS; +} + +int32_t opal_datatype_cuda_kernel_support_fini(void) +{ + if (opal_datatype_cuda_kernel_handle != NULL) { + cuda_kernel_table.opal_datatype_cuda_fini_p(); + /* Reset all functions to NULL */ + cuda_kernel_table.opal_datatype_cuda_init_p = NULL; + cuda_kernel_table.opal_datatype_cuda_fini_p = NULL; + cuda_kernel_table.opal_generic_simple_pack_function_cuda_iov_p = NULL; + cuda_kernel_table.opal_generic_simple_unpack_function_cuda_iov_p = NULL; + cuda_kernel_table.opal_generic_simple_pack_function_cuda_vector_p = NULL; + cuda_kernel_table.opal_generic_simple_unpack_function_cuda_vector_p = NULL; + cuda_kernel_table.opal_cuda_free_gpu_buffer_p = NULL; + cuda_kernel_table.opal_cuda_malloc_gpu_buffer_p = NULL; + cuda_kernel_table.opal_cuda_d2dcpy_async_p = NULL; + cuda_kernel_table.opal_cuda_d2dcpy_p = NULL; + + dlclose(opal_datatype_cuda_kernel_handle); + opal_datatype_cuda_kernel_handle = NULL; + + if( NULL != opal_datatype_cuda_kernel_lib ) + free(opal_datatype_cuda_kernel_lib); + opal_datatype_cuda_kernel_lib = NULL; + opal_datatype_cuda_kernel_support = 0; + opal_output( 0, "opal_datatype_cuda_kernel_support_fini done\n"); + } + return OPAL_SUCCESS; +} + +int32_t 
opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + if (cuda_kernel_table.opal_generic_simple_pack_function_cuda_iov_p != NULL) { + return cuda_kernel_table.opal_generic_simple_pack_function_cuda_iov_p(pConvertor, iov, out_size, max_data); + } else { + opal_output(0, "opal_generic_simple_pack_function_cuda_iov function pointer is NULL\n"); + return -1; + } +} + +int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + if (cuda_kernel_table.opal_generic_simple_unpack_function_cuda_iov_p != NULL) { + return cuda_kernel_table.opal_generic_simple_unpack_function_cuda_iov_p(pConvertor, iov, out_size, max_data); + } else { + opal_output(0, "opal_generic_simple_unpack_function_cuda_iov function pointer is NULL\n"); + return -1; + } +} + +int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + if (cuda_kernel_table.opal_generic_simple_pack_function_cuda_vector_p != NULL) { + return cuda_kernel_table.opal_generic_simple_pack_function_cuda_vector_p(pConvertor, iov, out_size, max_data); + } else { + opal_output(0, "opal_generic_simple_pack_function_cuda_vector function pointer is NULL\n"); + return -1; + } +} + +int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + if (cuda_kernel_table.opal_generic_simple_unpack_function_cuda_vector_p != NULL) { + return cuda_kernel_table.opal_generic_simple_unpack_function_cuda_vector_p(pConvertor, iov, out_size, max_data); + } else { + opal_output(0, "opal_generic_simple_unpack_function_cuda_vector function pointer is NULL\n"); + return -1; + } +} + +void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) +{ + if (cuda_kernel_table.opal_cuda_malloc_gpu_buffer_p != NULL) { + return 
cuda_kernel_table.opal_cuda_malloc_gpu_buffer_p(size, gpu_id); + } else { + opal_output(0, "opal_cuda_malloc_gpu_buffer function pointer is NULL\n"); + return NULL; + } +} + +void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) +{ + if (cuda_kernel_table.opal_cuda_free_gpu_buffer_p != NULL) { + cuda_kernel_table.opal_cuda_free_gpu_buffer_p(addr, gpu_id); + } else { + opal_output(0, "opal_cuda_free_gpu_buffer function pointer is NULL\n"); + } +} + +void opal_cuda_d2dcpy(void* dst, const void* src, size_t count) +{ + if (cuda_kernel_table.opal_cuda_d2dcpy_p != NULL) { + cuda_kernel_table.opal_cuda_d2dcpy_p(dst, src, count); + } else { + opal_output(0, "opal_cuda_d2dcpy function pointer is NULL\n"); + } +} + +void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) +{ + if (cuda_kernel_table.opal_cuda_d2dcpy_async_p != NULL) { + cuda_kernel_table.opal_cuda_d2dcpy_async_p(dst, src, count); + } else { + opal_output(0, "opal_cuda_d2dcpy_async function pointer is NULL\n"); + } +} + diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index 676af80273b..a5a68074034 100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -21,6 +21,21 @@ struct opal_common_cuda_function_table { }; typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t; +struct opal_datatype_cuda_kernel_function_table { + int32_t (*opal_datatype_cuda_init_p)(void); + int32_t (*opal_datatype_cuda_fini_p)(void); + void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); + void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); + void (*opal_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); + void (*opal_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); + int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( 
opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +}; +typedef struct opal_datatype_cuda_kernel_function_table opal_datatype_cuda_kernel_function_table_t; +extern int32_t opal_datatype_cuda_kernel_support; + void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf); bool opal_cuda_check_bufs(char *dest, char *src); void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor); @@ -29,4 +44,15 @@ void* opal_cuda_memmove(void * dest, void * src, size_t size); void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)); void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream); +int32_t opal_datatype_cuda_kernel_support_init(void); +int32_t opal_datatype_cuda_kernel_support_fini(void); +int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); +void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); +void opal_cuda_d2dcpy(void* dst, const void* src, size_t count); +void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count); + #endif diff 
--git a/opal/datatype/opal_datatype_gpu.c b/opal/datatype/opal_datatype_gpu.c deleted file mode 100644 index f21b22c72d2..00000000000 --- a/opal/datatype/opal_datatype_gpu.c +++ /dev/null @@ -1,177 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* - * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2015 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2006 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013 Cisco Systems, Inc. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include -#include -#include -#include - -#include "opal/mca/installdirs/installdirs.h" -#include "opal/datatype/opal_convertor_internal.h" -#include "opal/datatype/opal_datatype_internal.h" - -#if OPAL_ENABLE_DEBUG -#include "opal/util/output.h" - -#define DO_DEBUG(INST) if( opal_pack_debug ) { INST } -#else -#define DO_DEBUG(INST) -#endif /* OPAL_ENABLE_DEBUG */ - -#include "opal/datatype/opal_datatype_gpu.h" - -int32_t opal_datatype_cuda_kernel_support = 0; - -static void *opal_datatype_cuda_handle = NULL; -static char *opal_datatype_cuda_lib = NULL; - -int32_t (*opal_datatype_cuda_init_p)(void) = NULL; - -int32_t (*opal_datatype_cuda_fini_p)(void) = NULL; - - -int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) = NULL; - -int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) = 
NULL; - -int32_t (*opal_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) = NULL; - -int32_t (*opal_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) = NULL; - -void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) = NULL; - -void (*unpack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) = NULL; - -void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ) = NULL; - -void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id) = NULL; - -void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id) = NULL; - -void (*opal_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count) = NULL; - -void (*opal_cuda_d2dcpy_p)(void* dst, const void* src, size_t count) = NULL; - -#define OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN(handle, fname) \ - do { \ - char* _error; \ - *(void **)(&(fname ## _p)) = dlsym((handle), # fname); \ - if(NULL != (_error = dlerror()) ) { \ - opal_output(0, "Finding %s error: %s\n", # fname, _error); \ - fname ## _p = NULL; \ - return OPAL_ERROR; \ - } \ - } while (0) - -int32_t opal_datatype_gpu_init(void) -{ - if (opal_datatype_cuda_handle == NULL) { - - /* If the library name was initialized but the load failed, we have another chance to change it */ - if( NULL != opal_datatype_cuda_lib ) - free(opal_datatype_cuda_lib); - asprintf(&opal_datatype_cuda_lib, "%s/%s", opal_install_dirs.libdir, "opal_datatype_cuda.so"); - - opal_datatype_cuda_handle = dlopen(opal_datatype_cuda_lib , RTLD_LAZY); - if (!opal_datatype_cuda_handle) { - opal_output( 0, "Failed to load %s library: error %s\n", 
opal_datatype_cuda_lib, dlerror()); - opal_datatype_cuda_handle = NULL; - return OPAL_ERROR; - } - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_datatype_cuda_init ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_datatype_cuda_fini ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda_iov ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_unpack_function_cuda_iov ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_pack_function_cuda_vector ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_generic_simple_unpack_function_cuda_vector ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, pack_contiguous_loop_cuda ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, unpack_contiguous_loop_cuda ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, pack_predefined_data_cuda ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_free_gpu_buffer ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_malloc_gpu_buffer ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_d2dcpy_async ); - OPAL_DATATYPE_FIND_CUDA_FUNCTION_OR_RETURN( opal_datatype_cuda_handle, opal_cuda_d2dcpy ); - - if (OPAL_SUCCESS != (*opal_datatype_cuda_init_p)()) { - return OPAL_ERROR; - } - opal_datatype_cuda_kernel_support = 1; - opal_output( 0, "opal_datatype_cuda_kernel_support init done\n"); - } - return OPAL_SUCCESS; -} - -int32_t opal_datatype_gpu_fini(void) -{ - if (opal_datatype_cuda_handle != NULL) { - (*opal_datatype_cuda_fini_p)(); - /* Reset all functions to NULL */ - opal_datatype_cuda_init_p = NULL; - opal_datatype_cuda_fini_p = NULL; - opal_generic_simple_pack_function_cuda_iov_p = NULL; - 
opal_generic_simple_unpack_function_cuda_iov_p = NULL; - opal_generic_simple_pack_function_cuda_vector_p = NULL; - opal_generic_simple_unpack_function_cuda_vector_p = NULL; - pack_contiguous_loop_cuda_p = NULL; - unpack_contiguous_loop_cuda_p = NULL; - pack_predefined_data_cuda_p = NULL; - opal_cuda_free_gpu_buffer_p = NULL; - opal_cuda_malloc_gpu_buffer_p = NULL; - opal_cuda_d2dcpy_async_p = NULL; - opal_cuda_d2dcpy_p = NULL; - - dlclose(opal_datatype_cuda_handle); - opal_datatype_cuda_handle = NULL; - - if( NULL != opal_datatype_cuda_lib ) - free(opal_datatype_cuda_lib); - opal_datatype_cuda_lib = NULL; - opal_datatype_cuda_kernel_support = 0; - opal_output( 0, "opal_datatype_cuda_kernel_support fini done\n"); - } - return OPAL_SUCCESS; -} diff --git a/opal/datatype/opal_datatype_gpu.h b/opal/datatype/opal_datatype_gpu.h deleted file mode 100644 index 340fbf24da7..00000000000 --- a/opal/datatype/opal_datatype_gpu.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED -#define OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED - -#define OPAL_DATATYPE_CUDA_KERNEL 1 - -extern int32_t opal_datatype_cuda_kernel_support; - -int32_t opal_datatype_gpu_init(void); -int32_t opal_datatype_gpu_fini(void); - -extern int32_t (*opal_datatype_cuda_init_p)(void); - -extern int32_t (*opal_datatype_cuda_fini_p)(void); - -extern int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - -extern int32_t (*opal_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - -extern int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - -extern int32_t (*opal_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - 
-extern void (*pack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ); - -extern void (*unpack_contiguous_loop_cuda_p)( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ); - -extern void (*pack_predefined_data_cuda_p)( dt_elem_desc_t* ELEM, - uint32_t* COUNT, - unsigned char** SOURCE, - unsigned char** DESTINATION, - size_t* SPACE ); - -extern void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); - -extern void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); - -extern void (*opal_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); - -extern void (*opal_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); -#endif /* OPAL_DATATYPE_GPU_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 09940374ab3..92a3fe40174 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -33,7 +33,9 @@ #include "opal/datatype/opal_datatype.h" #include "opal/datatype/opal_convertor_internal.h" #include "opal/mca/base/mca_base_var.h" -#include "opal/datatype/opal_datatype_gpu.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ /* by default the debuging is turned off */ int opal_datatype_dfd = -1; @@ -249,9 +251,9 @@ int32_t opal_datatype_finalize( void ) /* clear all master convertors */ opal_convertor_destroy_masters(); -#if OPAL_DATATYPE_CUDA_KERNEL - opal_datatype_gpu_fini(); -#endif /* defined OPAL_DATATYPE_CUDA_KERNEL */ +#if OPAL_CUDA_SUPPORT + opal_datatype_cuda_kernel_support_fini(); +#endif /* OPAL_CUDA_SUPPORT */ return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 5a5a2470cb1..0bb29e2f3fc 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c 
@@ -37,7 +37,9 @@ #include "opal/datatype/opal_datatype_checksum.h" #include "opal/datatype/opal_datatype_pack.h" #include "opal/datatype/opal_datatype_prototypes.h" -#include "opal/datatype/opal_datatype_gpu.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ #if defined(CHECKSUM) #define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig_checksum @@ -316,7 +318,6 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, while( 1 ) { while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ -// (*pack_predefined_data_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, conv_ptr, iov_ptr, iov_len_local ); if( 0 == count_desc ) { /* completed */ @@ -361,7 +362,6 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - //(*pack_contiguous_loop_cuda_p)(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); PACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, conv_ptr, iov_ptr, iov_len_local ); if( 0 == count_desc ) { /* completed */ @@ -391,12 +391,6 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; opal_output(0, "total packed %lu\n", pConvertor->bConverted); - // double *vtmp = (double *)iov[0].iov_base; - // for (uint32_t i = 0; i < total_packed/8; i++) { - // printf(" %1.f ", *vtmp); - // vtmp ++; - // } - // printf("\n"); return 1; } /* Save the global position for the next round */ @@ -424,14 +418,9 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, // return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, 
max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { - if (opal_generic_simple_pack_function_cuda_vector_p != NULL) { - return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); - // return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); - } + return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { - if (opal_generic_simple_pack_function_cuda_iov_p != NULL) { - return (*opal_generic_simple_pack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); - } + return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); } return 0; } diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index d9d69683174..6a2fbd70a6c 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -27,7 +27,6 @@ #include "opal/datatype/opal_convertor_internal.h" #include "opal/datatype/opal_datatype_internal.h" -#include "opal/datatype/opal_datatype_gpu.h" #if OPAL_ENABLE_DEBUG #include "opal/util/output.h" @@ -40,6 +39,9 @@ #include "opal/datatype/opal_datatype_checksum.h" #include "opal/datatype/opal_datatype_unpack.h" #include "opal/datatype/opal_datatype_prototypes.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ #if defined(CHECKSUM) #define opal_unpack_general_function opal_unpack_general_checksum @@ -385,7 +387,6 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { UNPACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, iov_ptr, conv_ptr, iov_len_local ); - // (*unpack_contiguous_loop_cuda_p)(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; @@ -611,14 +612,9 @@ opal_generic_simple_unpack_cuda_function( 
opal_convertor_t* pConvertor, // return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { - if (opal_generic_simple_unpack_function_cuda_vector_p != NULL) { - return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); - // return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); - } + return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { - if (opal_generic_simple_unpack_function_cuda_iov_p != NULL) { - return (*opal_generic_simple_unpack_function_cuda_iov_p)( pConvertor, iov, out_size, max_data); - } + return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); } return 0; } diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index 2d015ad11fb..eeafea57fb6 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -55,7 +55,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" -#include "opal/datatype/opal_datatype_gpu.h" +#include "opal/datatype/opal_datatype_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ #include "opal/mca/mpool/base/base.h" #include "opal/mca/mpool/sm/mpool_sm.h" @@ -1187,8 +1187,8 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, uint32_t iov_count = 1; size_t max_data; if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && remote_device != local_device) { - unpack_convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(size, 0); - (*opal_cuda_d2dcpy_async_p)(unpack_convertor->gpu_buffer_ptr, remote_memory_address, size); + unpack_convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(size, 0); + opal_cuda_d2dcpy_async(unpack_convertor->gpu_buffer_ptr, remote_memory_address, size); iov.iov_base = unpack_convertor->gpu_buffer_ptr; opal_output(0, "start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, 
unpack_convertor->gpu_buffer_ptr, size); } else { @@ -1197,7 +1197,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, iov.iov_len = size; max_data = size; opal_convertor_unpack(unpack_convertor, &iov, &iov_count, &max_data ); - opal_cuda_free_gpu_buffer_p(unpack_convertor->gpu_buffer_ptr, 0); + opal_cuda_free_gpu_buffer(unpack_convertor->gpu_buffer_ptr, 0); done = 1; } } else { @@ -1436,6 +1436,7 @@ int mca_btl_smcuda_alloc_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint endpoint->smcuda_ddt_clone_size += SMCUDA_DT_CLONE_SIZE; return endpoint->smcuda_ddt_clone_size - SMCUDA_DT_CLONE_SIZE; } + return -1; } void mca_btl_smcuda_free_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index 9c1f5235d1e..c4a299ef84a 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -54,7 +54,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" -#include "opal/datatype/opal_datatype_gpu.h" +#include "opal/datatype/opal_datatype_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ #if OPAL_ENABLE_FT_CR == 1 #include "opal/runtime/opal_cr.h" @@ -901,9 +901,9 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, } else { /* unpack */ convertor->flags |= CONVERTOR_CUDA; if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { - convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer_p(packed_size, 0); + convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(packed_size, 0); remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; - (*opal_cuda_d2dcpy_async_p)(convertor->gpu_buffer_ptr, remote_address, packed_size); + opal_cuda_d2dcpy_async(convertor->gpu_buffer_ptr, remote_address, packed_size); iov.iov_base = convertor->gpu_buffer_ptr; opal_output(0, "unpack, start D2D copy src %p, 
dst %p, size %lu\n", remote_address, convertor->gpu_buffer_ptr, packed_size); } else { @@ -914,7 +914,7 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { if (convertor->gpu_buffer_ptr != NULL) { - opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); convertor->gpu_buffer_ptr = NULL; } } @@ -960,7 +960,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, send_msg.msg_type = CUDA_DDT_CLEANUP; mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); if (convertor->gpu_buffer_ptr != NULL) { - opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); convertor->gpu_buffer_ptr = NULL; } } else if (msg_type == CUDA_DDT_PACK_TO_BLOCK) { @@ -1022,7 +1022,7 @@ static void btl_smcuda_datatype_put(mca_btl_base_module_t* btl, /* We can find the endoint back from the rank embedded in the header */ endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; - opal_cuda_free_gpu_buffer_p(convertor->gpu_buffer_ptr, 0); + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; mca_mpool_common_cuda_reg_t rget_reg; rget_reg_ptr= &rget_reg; diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 6bcb031003d..38b35a44b09 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -33,7 +33,6 @@ #include "opal/align.h" #include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_datatype_cuda.h" -#include "opal/datatype/opal_datatype_gpu.h" #include "opal/util/output.h" #include "opal/util/show_help.h" #include "opal/util/proc.h" @@ -1639,16 +1638,6 @@ int progress_one_cuda_htod_event(struct 
mca_btl_base_descriptor_t **frag) { return 0; } -int mca_common_cuda_geteventhandle(uint64_t **event, int n, mca_mpool_base_registration_t *newreg) -{ - CUipcEventHandle evtHandle; - mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t*)newreg; - // mca_common_cuda_construct_event_and_handle(event, (void**)&evtHandle); -// printf("0 %p, 1 %p\n",&cuda_reg->data.pipeline_evtHandle[0], &cuda_reg->data.pipeline_evtHandle[EVTHANDLE_SIZE]); - // memcpy(&cuda_reg->data.pipeline_evtHandle[n*EVTHANDLE_SIZE], &evtHandle, sizeof(evtHandle)); - return OPAL_SUCCESS; -} - int mca_common_cuda_create_event(uint64_t **event) { CUresult result; @@ -1912,7 +1901,7 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t if (0 != mca_common_cuda_stage_three_init()) { opal_cuda_support = 0; } else { - opal_datatype_gpu_init(); + opal_datatype_cuda_kernel_support_init(); } } diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index 9adda6dc82f..e0b511fa48b 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -93,7 +93,6 @@ OPAL_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, OPAL_DECLSPEC int mca_common_cuda_stage_one_init(void); OPAL_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base); OPAL_DECLSPEC void mca_common_cuda_fini(void); -OPAL_DECLSPEC int mca_common_cuda_geteventhandle(uint64_t **event, int n, mca_mpool_base_registration_t *newreg); OPAL_DECLSPEC int mca_common_cuda_create_event(uint64_t **event); OPAL_DECLSPEC int mca_common_cuda_record_event(uint64_t *event); OPAL_DECLSPEC int mca_common_cuda_query_event(uint64_t *event); From 1b1d827489420cfc6e02bd1583e83e9eaa980756 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 5 Nov 2015 16:25:41 -0500 Subject: [PATCH 138/190] rename some functions --- opal/datatype/cuda/Makefile.in | 4 +- opal/datatype/cuda/opal_datatype_cuda.cu | 14 +-- 
opal/datatype/cuda/opal_datatype_cuda.cuh | 46 ++++---- .../cuda/opal_datatype_cuda_internal.cuh | 8 -- .../cuda/opal_datatype_pack_cuda_kernel.cu | 43 +------ .../cuda/opal_datatype_pack_cuda_wrapper.cu | 34 +++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 30 ++--- opal/datatype/opal_convertor.h | 1 - opal/datatype/opal_datatype_cuda.c | 106 +++++++++--------- opal/datatype/opal_datatype_cuda.h | 24 ++-- opal/datatype/opal_datatype_module.c | 2 +- opal/datatype/opal_datatype_pack.c | 2 +- opal/datatype/opal_datatype_unpack.c | 2 +- opal/mca/common/cuda/common_cuda.c | 2 +- 14 files changed, 134 insertions(+), 184 deletions(-) diff --git a/opal/datatype/cuda/Makefile.in b/opal/datatype/cuda/Makefile.in index ded04f1ed3c..ea0af09c6d0 100644 --- a/opal/datatype/cuda/Makefile.in +++ b/opal/datatype/cuda/Makefile.in @@ -9,8 +9,8 @@ VPATH = @srcdir@ NVCC = nvcc ARCH = @AR@ ARCHFLAGS = cr -STLIB ?= opal_datatype_cuda.a -DYLIB ?= opal_datatype_cuda.so +STLIB ?= opal_datatype_cuda_kernel.a +DYLIB ?= opal_datatype_cuda_kernel.so EXTLIB = -L$(top_builddir)/opal/datatype/.libs -ldatatype -L$(top_builddir)/opal/.libs -lopen-pal -L/usr/local/cuda/lib -lcuda subdir = opal/datatype/cuda diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 3c5208d7122..e07adb33c5e 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -175,7 +175,7 @@ void opal_cuda_output(int output_id, const char *format, ...) 
} } -int32_t opal_datatype_cuda_init(void) +int32_t opal_ddt_cuda_kernel_init(void) { uint32_t i, j; int device; @@ -245,7 +245,7 @@ int32_t opal_datatype_cuda_init(void) return OPAL_SUCCESS; } -int32_t opal_datatype_cuda_fini(void) +int32_t opal_ddt_cuda_kernel_fini(void) { uint32_t i, j; @@ -275,7 +275,7 @@ int32_t opal_datatype_cuda_fini(void) return OPAL_SUCCESS; } -int32_t opal_cuda_is_gpu_buffer(const void *ptr) +int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr) { int res; CUmemorytype memType; @@ -291,7 +291,7 @@ int32_t opal_cuda_is_gpu_buffer(const void *ptr) return (memType == CU_MEMORYTYPE_DEVICE) ? 1 : 0; } -void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) +void* opal_ddt_cuda_malloc_gpu_buffer(size_t size, int gpu_id) { int dev_id; cudaGetDevice(&dev_id); @@ -330,7 +330,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) return NULL; } -void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) +void opal_ddt_cuda_free_gpu_buffer(void *addr, int gpu_id) { ddt_cuda_device_t *device = &cuda_devices[gpu_id]; ddt_cuda_buffer_t *ptr = device->buffer_used.head; @@ -369,12 +369,12 @@ void opal_cuda_check_error(cudaError_t err) } } -void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) +void opal_ddt_cuda_d2dcpy_async(void* dst, const void* src, size_t count) { cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->opal_cuda_stream[0]); } -void opal_cuda_d2dcpy(void* dst, const void* src, size_t count) +void opal_ddt_cuda_d2dcpy(void* dst, const void* src, size_t count) { cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->opal_cuda_stream[0]); cudaStreamSynchronize(current_cuda_device->cuda_streams->opal_cuda_stream[0]); diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 8c228fc3404..53f548c6d34 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ 
b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -4,30 +4,30 @@ extern "C" { -int32_t opal_datatype_cuda_init(void); +int32_t opal_ddt_cuda_kernel_init(void); -int32_t opal_datatype_cuda_fini(void); +int32_t opal_ddt_cuda_kernel_fini(void); -int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); -int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); -int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); -int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -83,15 +83,15 @@ void unpack_predefined_data_cuda( dt_elem_desc_t* ELEM, unsigned char** DESTINATION, size_t* SPACE ); -int32_t opal_cuda_is_gpu_buffer(const void *ptr); +int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr); -void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); +void* opal_ddt_cuda_malloc_gpu_buffer(size_t size, int gpu_id); -void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); +void opal_ddt_cuda_free_gpu_buffer(void *addr, int 
gpu_id); -void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count); +void opal_ddt_cuda_d2dcpy_async(void* dst, const void* src, size_t count); -void opal_cuda_d2dcpy(void* dst, const void* src, size_t count); +void opal_ddt_cuda_d2dcpy(void* dst, const void* src, size_t count); void opal_dump_cuda_list(ddt_cuda_list_t *list); diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 506a5fe22cd..7648eed3b3e 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -120,14 +120,6 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c __global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); -__global__ void opal_empty_kernel(uint32_t copy_loops, - size_t size, - OPAL_PTRDIFF_TYPE extent, - unsigned char* source, - unsigned char* destination); - -__global__ void opal_empty_kernel_noargs(); - void opal_cuda_output(int output_id, const char *format, ...); void opal_cuda_check_error(cudaError_t err); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index dd9af2a5a7e..6b0e18b1078 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -43,33 +43,6 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, } } -// __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_description_dist_t* desc_dist_d, -// dt_elem_desc_t* desc_d, -// uint32_t required_blocks, struct iovec* iov, unsigned char* pBaseBuf) -// { -// uint32_t i; -// dt_elem_desc_t* pElem; -// unsigned char *conv_ptr, *iov_ptr; -// uint32_t local_index, dst_offset, pos_desc, count_desc; -// size_t iov_len_local; -// -// iov_ptr = (unsigned char *) 
iov[0].iov_base; -// iov_len_local = iov[0].iov_len; -// conv_ptr = pBaseBuf; -// for (i = 0; i < desc_dist_d[blockIdx.x].description_used; i++) { -// pos_desc = desc_dist_d[blockIdx.x].description_index[i]; -// local_index = desc_dist_d[blockIdx.x].description_local_index[i]; -// dst_offset = desc_dist_d[blockIdx.x].dst_offset[i]; -// pElem = &(desc_d[pos_desc]); -// count_desc = pElem->elem.count; -// -// // if ( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { -// pack_predefined_data_cuda_kernel_v2(pElem, &count_desc, conv_ptr, iov_ptr, &iov_len_local, local_index, dst_offset); -// // } -// } -// -// } - __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, _copy_count; @@ -113,18 +86,4 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c #endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ } } -} - -__global__ void opal_empty_kernel(uint32_t copy_loops, - size_t size, - OPAL_PTRDIFF_TYPE extent, - unsigned char* source, - unsigned char* destination) -{ - -} - -__global__ void opal_empty_kernel_noargs() -{ - -} +} \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index dccf2803c6a..97481755209 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -8,7 +8,7 @@ #include -int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -59,7 +59,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { - if ((iov[iov_count].iov_base == NULL) || 
opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + if ((iov[iov_count].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { if (iov[iov_count].iov_len == 0) { iov_len_local = DT_CUDA_BUFFER_SIZE; } else { @@ -67,7 +67,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver } if (iov[iov_count].iov_base == NULL) { - iov[iov_count].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + iov[iov_count].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); iov_ptr = (unsigned char *)iov[iov_count].iov_base; pConvertor->gpu_buffer_ptr = iov_ptr; free_required = 1; @@ -86,7 +86,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE){ iov_len_local = iov[iov_count].iov_len; if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); } transfer_required = 0; free_required = 1; @@ -94,7 +94,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver } else { iov_len_local = iov[iov_count].iov_len; if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); } transfer_required = 1; free_required = 1; @@ -198,7 +198,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { printf("free\n"); - opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } return 1; 
@@ -211,7 +211,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConver return 0; } -int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -262,7 +262,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { - if ((iov[iov_count].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + if ((iov[iov_count].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { if (iov[iov_count].iov_len == 0) { iov_len_local = DT_CUDA_BUFFER_SIZE; } else { @@ -270,7 +270,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } if (iov[iov_count].iov_base == NULL) { - iov[iov_count].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + iov[iov_count].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); iov_ptr = (unsigned char *)iov[iov_count].iov_base; pConvertor->gpu_buffer_ptr = iov_ptr; free_required = 1; @@ -289,7 +289,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE){ iov_len_local = iov[iov_count].iov_len; if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); } transfer_required = 0; free_required = 1; @@ -297,7 +297,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert } else { iov_len_local = iov[iov_count].iov_len; if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov_len_local, 0); + 
pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); } transfer_required = 1; free_required = 1; @@ -398,7 +398,7 @@ int32_t opal_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvert pConvertor->flags |= CONVERTOR_COMPLETED; if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { printf("free\n"); - opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } return 1; @@ -642,7 +642,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, #endif } -int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -683,7 +683,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor // assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); - if ((iov[0].iov_base == NULL) || opal_cuda_is_gpu_buffer(iov[0].iov_base)) { + if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { if (iov[0].iov_len == 0) { buffer_size = DT_CUDA_BUFFER_SIZE; } else { @@ -691,7 +691,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor } if (iov[0].iov_base == NULL) { - iov[0].iov_base = (unsigned char *)opal_cuda_malloc_gpu_buffer(buffer_size, 0); + iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); destination = (unsigned char *)iov[0].iov_base; pConvertor->gpu_buffer_ptr = destination; free_required = 1; @@ -709,7 +709,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); } else { if (pConvertor->gpu_buffer_ptr == NULL) { - 
pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(buffer_size, 0); + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); } transfer_required = 1; free_required = 1; @@ -895,7 +895,7 @@ int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } return 1; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index a8ba035ef78..9d0e02067d1 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -8,7 +8,7 @@ #include -int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { @@ -58,7 +58,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + if (opal_ddt_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { iov_ptr = (unsigned char*)iov[iov_count].iov_base; free_required = 0; } else { @@ -68,7 +68,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon free_required = 0; } else { if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); } iov_ptr = pConvertor->gpu_buffer_ptr; 
cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); @@ -171,7 +171,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon pConvertor->flags |= CONVERTOR_COMPLETED; DT_CUDA_DEBUG( opal_cuda_output( 0, "Total unpacked %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { - opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } return 1; @@ -184,7 +184,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pCon return 0; } -int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { @@ -234,7 +234,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - if (opal_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + if (opal_ddt_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { iov_ptr = (unsigned char*)iov[iov_count].iov_base; free_required = 0; } else { @@ -244,7 +244,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv free_required = 0; } else { if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); } iov_ptr = pConvertor->gpu_buffer_ptr; cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); @@ -340,7 +340,7 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; if 
(pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { - opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } return 1; @@ -353,10 +353,10 @@ int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConv return 0; } -int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, dst_offset, residue_desc; @@ -399,7 +399,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - if (opal_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { source = (unsigned char*)iov[0].iov_base; free_required = 0; } else { @@ -409,7 +409,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert free_required = 0; } else { if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); } source = pConvertor->gpu_buffer_ptr; cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); @@ -589,7 +589,7 @@ int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvert if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } return 1; diff --git 
a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 6b4746eaa9a..af74ee1221c 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -114,7 +114,6 @@ struct opal_convertor_t { unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ size_t gpu_buffer_size; - uint64_t * pipeline_event[MAX_IPC_EVENT_HANDLE]; /**< cuda event for pipeline */ #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 23cdb47acd6..96c3221b94c 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -81,8 +81,8 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) convertor->flags |= CONVERTOR_CUDA; } - if (OPAL_SUCCESS != opal_datatype_cuda_kernel_support_init()) { - opal_datatype_cuda_kernel_support_fini(); + if (OPAL_SUCCESS != opal_cuda_kernel_support_init()) { + opal_cuda_kernel_support_fini(); } } @@ -215,14 +215,14 @@ void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream } /* following functions are used for cuda ddt kernel support */ -int32_t opal_datatype_cuda_kernel_support_init(void) +int32_t opal_cuda_kernel_support_init(void) { if (opal_datatype_cuda_kernel_handle == NULL) { /* If the library name was initialized but the load failed, we have another chance to change it */ if( NULL != opal_datatype_cuda_kernel_lib ) free(opal_datatype_cuda_kernel_lib); - asprintf(&opal_datatype_cuda_kernel_lib, "%s/%s", opal_install_dirs.libdir, "opal_datatype_cuda.so"); + asprintf(&opal_datatype_cuda_kernel_lib, "%s/%s", opal_install_dirs.libdir, "opal_datatype_cuda_kernel.so"); opal_datatype_cuda_kernel_handle = dlopen(opal_datatype_cuda_kernel_lib , RTLD_LAZY); if (!opal_datatype_cuda_kernel_handle) { @@ -231,41 +231,41 @@ int32_t opal_datatype_cuda_kernel_support_init(void) return OPAL_ERROR; } - 
OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_init ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_datatype_cuda_fini ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_generic_simple_pack_function_cuda_iov ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_generic_simple_unpack_function_cuda_iov ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_generic_simple_pack_function_cuda_vector ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_generic_simple_unpack_function_cuda_vector ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_free_gpu_buffer ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_malloc_gpu_buffer ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_d2dcpy_async ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_cuda_d2dcpy ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_kernel_init ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_kernel_fini ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_generic_simple_pack_function_cuda_iov ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_generic_simple_unpack_function_cuda_iov ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_generic_simple_pack_function_cuda_vector ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, 
opal_ddt_generic_simple_unpack_function_cuda_vector ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_free_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_malloc_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy_async ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy ); - if (OPAL_SUCCESS != cuda_kernel_table.opal_datatype_cuda_init_p()) { + if (OPAL_SUCCESS != cuda_kernel_table.opal_ddt_cuda_kernel_init_p()) { return OPAL_ERROR; } opal_datatype_cuda_kernel_support = 1; - opal_output( 0, "opal_datatype_cuda_kernel_support_init done\n"); + opal_output( 0, "opal_cuda_kernel_support_init done\n"); } return OPAL_SUCCESS; } -int32_t opal_datatype_cuda_kernel_support_fini(void) +int32_t opal_cuda_kernel_support_fini(void) { if (opal_datatype_cuda_kernel_handle != NULL) { - cuda_kernel_table.opal_datatype_cuda_fini_p(); + cuda_kernel_table.opal_ddt_cuda_kernel_fini_p(); /* Reset all functions to NULL */ - cuda_kernel_table.opal_datatype_cuda_init_p = NULL; - cuda_kernel_table.opal_datatype_cuda_fini_p = NULL; - cuda_kernel_table.opal_generic_simple_pack_function_cuda_iov_p = NULL; - cuda_kernel_table.opal_generic_simple_unpack_function_cuda_iov_p = NULL; - cuda_kernel_table.opal_generic_simple_pack_function_cuda_vector_p = NULL; - cuda_kernel_table.opal_generic_simple_unpack_function_cuda_vector_p = NULL; - cuda_kernel_table.opal_cuda_free_gpu_buffer_p = NULL; - cuda_kernel_table.opal_cuda_malloc_gpu_buffer_p = NULL; - cuda_kernel_table.opal_cuda_d2dcpy_async_p = NULL; - cuda_kernel_table.opal_cuda_d2dcpy_p = NULL; + cuda_kernel_table.opal_ddt_cuda_kernel_init_p = NULL; + cuda_kernel_table.opal_ddt_cuda_kernel_fini_p = NULL; + cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_iov_p = NULL; + 
cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_iov_p = NULL; + cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_vector_p = NULL; + cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_vector_p = NULL; + cuda_kernel_table.opal_ddt_cuda_free_gpu_buffer_p = NULL; + cuda_kernel_table.opal_ddt_cuda_malloc_gpu_buffer_p = NULL; + cuda_kernel_table.opal_ddt_cuda_d2dcpy_async_p = NULL; + cuda_kernel_table.opal_ddt_cuda_d2dcpy_p = NULL; dlclose(opal_datatype_cuda_kernel_handle); opal_datatype_cuda_kernel_handle = NULL; @@ -274,85 +274,85 @@ int32_t opal_datatype_cuda_kernel_support_fini(void) free(opal_datatype_cuda_kernel_lib); opal_datatype_cuda_kernel_lib = NULL; opal_datatype_cuda_kernel_support = 0; - opal_output( 0, "opal_datatype_cuda_kernel_support_fini done\n"); + opal_output( 0, "opal_cuda_kernel_support_fini done\n"); } return OPAL_SUCCESS; } int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - if (cuda_kernel_table.opal_generic_simple_pack_function_cuda_iov_p != NULL) { - return cuda_kernel_table.opal_generic_simple_pack_function_cuda_iov_p(pConvertor, iov, out_size, max_data); + if (cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_iov_p != NULL) { + return cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_iov_p(pConvertor, iov, out_size, max_data); } else { - opal_output(0, "opal_generic_simple_pack_function_cuda_iov function pointer is NULL\n"); + opal_output(0, "opal_ddt_generic_simple_pack_function_cuda_iov function pointer is NULL\n"); return -1; } } int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - if (cuda_kernel_table.opal_generic_simple_unpack_function_cuda_iov_p != NULL) { - return cuda_kernel_table.opal_generic_simple_unpack_function_cuda_iov_p(pConvertor, iov, out_size, max_data); + if 
(cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_iov_p != NULL) { + return cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_iov_p(pConvertor, iov, out_size, max_data); } else { - opal_output(0, "opal_generic_simple_unpack_function_cuda_iov function pointer is NULL\n"); + opal_output(0, "opal_ddt_generic_simple_unpack_function_cuda_iov function pointer is NULL\n"); return -1; } } int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - if (cuda_kernel_table.opal_generic_simple_pack_function_cuda_vector_p != NULL) { - return cuda_kernel_table.opal_generic_simple_pack_function_cuda_vector_p(pConvertor, iov, out_size, max_data); + if (cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_vector_p != NULL) { + return cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_vector_p(pConvertor, iov, out_size, max_data); } else { - opal_output(0, "opal_generic_simple_pack_function_cuda_vector function pointer is NULL\n"); + opal_output(0, "opal_ddt_generic_simple_pack_function_cuda_vector function pointer is NULL\n"); return -1; } } int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - if (cuda_kernel_table.opal_generic_simple_unpack_function_cuda_vector_p != NULL) { - return cuda_kernel_table.opal_generic_simple_unpack_function_cuda_vector_p(pConvertor, iov, out_size, max_data); + if (cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_vector_p != NULL) { + return cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_vector_p(pConvertor, iov, out_size, max_data); } else { - opal_output(0, "opal_generic_simple_unpack_function_cuda_vector function pointer is NULL\n"); + opal_output(0, "opal_ddt_generic_simple_unpack_function_cuda_vector function pointer is NULL\n"); return -1; } } void* opal_cuda_malloc_gpu_buffer(size_t size, int 
gpu_id) { - if (cuda_kernel_table.opal_cuda_malloc_gpu_buffer_p != NULL) { - return cuda_kernel_table.opal_cuda_malloc_gpu_buffer_p(size, gpu_id); + if (cuda_kernel_table.opal_ddt_cuda_malloc_gpu_buffer_p != NULL) { + return cuda_kernel_table.opal_ddt_cuda_malloc_gpu_buffer_p(size, gpu_id); } else { - opal_output(0, "opal_cuda_malloc_gpu_buffer function pointer is NULL\n"); + opal_output(0, "opal_ddt_cuda_malloc_gpu_buffer function pointer is NULL\n"); return NULL; } } void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) { - if (cuda_kernel_table.opal_cuda_free_gpu_buffer_p != NULL) { - cuda_kernel_table.opal_cuda_free_gpu_buffer_p(addr, gpu_id); + if (cuda_kernel_table.opal_ddt_cuda_free_gpu_buffer_p != NULL) { + cuda_kernel_table.opal_ddt_cuda_free_gpu_buffer_p(addr, gpu_id); } else { - opal_output(0, "opal_cuda_free_gpu_buffer function pointer is NULL\n"); + opal_output(0, "opal_ddt_cuda_free_gpu_buffer function pointer is NULL\n"); } } void opal_cuda_d2dcpy(void* dst, const void* src, size_t count) { - if (cuda_kernel_table.opal_cuda_d2dcpy_p != NULL) { - cuda_kernel_table.opal_cuda_d2dcpy_p(dst, src, count); + if (cuda_kernel_table.opal_ddt_cuda_d2dcpy_p != NULL) { + cuda_kernel_table.opal_ddt_cuda_d2dcpy_p(dst, src, count); } else { - opal_output(0, "opal_cuda_d2dcpy function pointer is NULL\n"); + opal_output(0, "opal_ddt_cuda_d2dcpy function pointer is NULL\n"); } } void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) { - if (cuda_kernel_table.opal_cuda_d2dcpy_async_p != NULL) { - cuda_kernel_table.opal_cuda_d2dcpy_async_p(dst, src, count); + if (cuda_kernel_table.opal_ddt_cuda_d2dcpy_async_p != NULL) { + cuda_kernel_table.opal_ddt_cuda_d2dcpy_async_p(dst, src, count); } else { - opal_output(0, "opal_cuda_d2dcpy_async function pointer is NULL\n"); + opal_output(0, "opal_ddt_cuda_d2dcpy_async function pointer is NULL\n"); } } diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index a5a68074034..8b6f996e422 
100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -22,16 +22,16 @@ struct opal_common_cuda_function_table { typedef struct opal_common_cuda_function_table opal_common_cuda_function_table_t; struct opal_datatype_cuda_kernel_function_table { - int32_t (*opal_datatype_cuda_init_p)(void); - int32_t (*opal_datatype_cuda_fini_p)(void); - void (*opal_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); - void* (*opal_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); - void (*opal_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); - void (*opal_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); - int32_t (*opal_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); - int32_t (*opal_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); - int32_t (*opal_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); - int32_t (*opal_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_ddt_cuda_kernel_init_p)(void); + int32_t (*opal_ddt_cuda_kernel_fini_p)(void); + void (*opal_ddt_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); + void* (*opal_ddt_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); + void (*opal_ddt_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); + void (*opal_ddt_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); + int32_t (*opal_ddt_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_ddt_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t 
(*opal_ddt_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_ddt_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); }; typedef struct opal_datatype_cuda_kernel_function_table opal_datatype_cuda_kernel_function_table_t; extern int32_t opal_datatype_cuda_kernel_support; @@ -44,8 +44,8 @@ void* opal_cuda_memmove(void * dest, void * src, size_t size); void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)); void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream); -int32_t opal_datatype_cuda_kernel_support_init(void); -int32_t opal_datatype_cuda_kernel_support_fini(void); +int32_t opal_cuda_kernel_support_init(void); +int32_t opal_cuda_kernel_support_fini(void); int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 92a3fe40174..77d6bfa62ac 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -252,7 +252,7 @@ int32_t opal_datatype_finalize( void ) opal_convertor_destroy_masters(); #if OPAL_CUDA_SUPPORT - opal_datatype_cuda_kernel_support_fini(); + opal_cuda_kernel_support_fini(); #endif /* OPAL_CUDA_SUPPORT */ return OPAL_SUCCESS; diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 0bb29e2f3fc..0573db427df 100644 --- a/opal/datatype/opal_datatype_pack.c +++ 
b/opal/datatype/opal_datatype_pack.c @@ -416,7 +416,7 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); - // return (*opal_generic_simple_pack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); +// return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index 6a2fbd70a6c..5f0ac368f68 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -610,7 +610,7 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); -// return (*opal_generic_simple_unpack_function_cuda_vector_p)( pConvertor, iov, out_size, max_data); +// return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 38b35a44b09..c358bcb7a57 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -1901,7 +1901,7 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t if (0 != mca_common_cuda_stage_three_init()) { opal_cuda_support = 0; } else { - opal_datatype_cuda_kernel_support_init(); + opal_cuda_kernel_support_init(); } } From 323566376331b867d09097c4884d38e1f809753d Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 6 Nov 2015 18:43:31 -0500 Subject: [PATCH 139/190] check point --- opal/datatype/cuda/opal_datatype_cuda.cu | 29 ++++++++ opal/datatype/cuda/opal_datatype_cuda.cuh | 6 ++ 
.../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 70 +++++++++++++++---- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 39 ++++++++--- opal/datatype/opal_convertor.c | 4 +- opal/datatype/opal_convertor.h | 1 + opal/datatype/opal_datatype.h | 6 +- opal/datatype/opal_datatype_cuda.c | 35 +++++++++- opal/datatype/opal_datatype_cuda.h | 6 +- opal/datatype/opal_datatype_destroy.c | 13 ++++ opal/datatype/opal_datatype_optimize.c | 7 ++ opal/datatype/opal_datatype_pack.c | 2 +- opal/datatype/opal_datatype_unpack.c | 2 +- test/datatype/ddt_benchmark.c | 22 +++--- 15 files changed, 204 insertions(+), 42 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index e07adb33c5e..6a6e06ff28d 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -275,6 +275,35 @@ int32_t opal_ddt_cuda_kernel_fini(void) return OPAL_SUCCESS; } +void* opal_ddt_cuda_iov_dist_init(void) +{ +#if OPAL_DATATYPE_CUDA_IOV_CACHE + ddt_cuda_iov_dist_t *p = NULL; + cudaMalloc((void **)(&p), sizeof(ddt_cuda_iov_dist_t) * NUM_CUDA_IOV_PER_DDT); + if (p != NULL) { + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist for ddt is successed %p.\n", p); ); + return p; + } else { + DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist for ddt is failed.\n"); ); + return NULL; + } +#else + DT_CUDA_DEBUG( opal_cuda_output( 2, "cuda iov cache is not enabled.\n"); ); + return (void *)0xDEADBEEF; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ +} + +void opal_ddt_cuda_iov_dist_fini(void* cuda_iov_dist) +{ +#if OPAL_DATATYPE_CUDA_IOV_CACHE + ddt_cuda_iov_dist_t *p = (ddt_cuda_iov_dist_t *) cuda_iov_dist; + if (p != NULL) { + cudaFree(p); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Free cuda_iov_dist for ddt is successed %p.\n", p); ); + } +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ +} + int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr) { int res; diff --git 
a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 53f548c6d34..ea3631af67f 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -95,6 +95,12 @@ void opal_ddt_cuda_d2dcpy(void* dst, const void* src, size_t count); void opal_dump_cuda_list(ddt_cuda_list_t *list); +void* opal_ddt_cuda_iov_dist_init(void); + +void opal_ddt_cuda_iov_dist_fini(void *cuda_iov_dist); + +void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 7648eed3b3e..ca630fc1b93 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -18,6 +18,7 @@ #define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 #define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 #define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL 1 +#define OPAL_DATATYPE_CUDA_IOV_CACHE 1 @@ -36,7 +37,8 @@ #define CUDA_IOV_MAX_TASK_PER_BLOCK 400 #define ALIGNMENT_DOUBLE 8 #define ALIGNMENT_FLOAT 4 -#define ALIGNMENT_CHAR 1 +#define ALIGNMENT_CHAR 18 +#define NUM_CUDA_IOV_PER_DDT 100000 #define TIMER_DATA_TYPE struct timeval #define GET_TIME(TV) gettimeofday( &(TV), NULL ) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 97481755209..b82888a3f96 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -8,7 +8,7 @@ #include -int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -28,13 +28,15 @@ int32_t 
opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo uint32_t count_desc_tmp; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + int contiguous_loop_flag = 0; + int i; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", (void*)pConvertor, (void*)pConvertor->pBaseBuf, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ); @@ -52,7 +54,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo pConvertor->stack_pos--; pElem = &(description[pos_desc]); - DT_CUDA_DEBUG( opal_cuda_output( 1, "pack start pos_desc %d count_desc %d disp %ld\n" + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack start pos_desc %d count_desc %d disp %ld\n" "stack_pos %d pos_desc %d count_desc %d disp %ld\n", pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); @@ -112,10 +114,17 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } + if (contiguous_loop_flag) { + pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; + } goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - DT_CUDA_DEBUG( opal_cuda_output( 2, "pack end_loop count %d stack_pos %d" + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack end_loop count %d stack_pos %d" " pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); @@ -141,7 +150,7 @@ int32_t 
opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo } conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - DT_CUDA_DEBUG( opal_cuda_output( 2, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); } @@ -160,6 +169,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; + } else { + contiguous_loop_flag = 1; } /* Save the stack with the correct last_count value. */ } @@ -168,7 +179,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo pStack->disp + local_disp); pos_desc++; update_loop_description: /* update the current state */ - conv_ptr = pConvertor->pBaseBuf + pStack->disp; + if (contiguous_loop_flag) { + count_desc_tmp = count_desc; + } else { + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + } UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } @@ -177,6 +192,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ total_packed += iov[iov_count].iov_len; // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -186,16 +204,15 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( 
"[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); ); #endif } - cudaDeviceSynchronize(); *max_data = total_packed; pConvertor->bConverted += total_packed; /* update the already converted bytes */ *out_size = iov_count; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Total packed %lu\n", pConvertor->bConverted); ); + DT_CUDA_DEBUG( opal_cuda_output( 0, "Pack total packed %lu\n", pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { printf("free\n"); opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); @@ -206,12 +223,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pCo /* Save the global position for the next round */ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, conv_ptr - pConvertor->pBaseBuf ); - DT_CUDA_DEBUG( opal_cuda_output( 2, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } -int32_t opal_ddt_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -369,7 +386,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector(opal_convertor_t* pCon pStack->disp + local_disp); pos_desc++; update_loop_description: /* update the current state */ - // conv_ptr = pConvertor->pBaseBuf + pStack->disp; + // conv_ptr = pConvertor->pBaseBuf + pStack->disp; count_desc_tmp = count_desc; 
UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; @@ -674,6 +691,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve long total_time, move_time; #endif +#if OPAL_DATATYPE_CUDA_IOV_CACHE + opal_datatype_t *pDesc = (opal_datatype_t *)pConvertor->pDesc; + ddt_cuda_iov_dist_t *cuda_iov_dist_cache = (ddt_cuda_iov_dist_t *)pDesc->cuda_iov_dist; + cuda_iov_dist_cache += pDesc->cuda_iov_count; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + /*description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); @@ -717,6 +740,13 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve } } +#if OPAL_DATATYPE_CUDA_IOV_CACHE + /* cuda iov is cached */ + if (pDesc->cuda_iov_is_cached == 2) { + pack_iov_cached(pConvertor, destination); + } +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); cuda_iov_count = 1000;//CUDA_NB_IOV; @@ -835,6 +865,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve #endif cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); +#if OPAL_DATATYPE_CUDA_IOV_CACHE + cudaMemcpyAsync(cuda_iov_dist_cache, cuda_iov_dist_d_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyDeviceToDevice, *cuda_stream_iov); + pDesc->cuda_iov_count += nb_blocks_used; + cuda_iov_dist_cache += nb_blocks_used; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ opal_generic_simple_pack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); @@ -898,11 +933,22 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( 
opal_convertor_t* pConve opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } +#if OPAL_DATATYPE_CUDA_IOV_CACHE + pDesc->cuda_iov_is_cached = 2; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ return 1; } return 0; } +#if OPAL_DATATYPE_CUDA_IOV_CACHE +void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination) +{ + const opal_datatype_t *datatype = pConvertor->pDesc; + DT_CUDA_DEBUG ( opal_cuda_output(2, "cuda iov cached %p, count %ld\n", datatype->cuda_iov_dist, datatype->cuda_iov_count ); ); +} +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 9d0e02067d1..f483d230934 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -8,7 +8,7 @@ #include -int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { @@ -26,13 +26,15 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* uint32_t count_desc_tmp; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + int contiguous_loop_flag = 0; + int i; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end; long total_time; #endif - DT_CUDA_DEBUG( opal_cuda_output( 1, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", + DT_CUDA_DEBUG( opal_cuda_output( 2, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ) description = pConvertor->use_desc->desc; @@ -49,7 +51,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( 
opal_convertor_t* pConvertor->stack_pos--; pElem = &(description[pos_desc]); - DT_CUDA_DEBUG( opal_cuda_output( 1, "unpack start pos_desc %d count_desc %d disp %ld\n" + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack start pos_desc %d count_desc %d disp %ld\n" "stack_pos %d pos_desc %d count_desc %d disp %ld\n", pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); @@ -78,7 +80,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - printf( "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); ); #endif iov_len_local = iov[iov_count].iov_len; cudaDeviceSynchronize(); @@ -96,6 +98,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } + if (contiguous_loop_flag) { + pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; + } assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); if( 0 != iov_len_local ) { assert(0); @@ -103,7 +112,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* goto complete_loop; } if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ - DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); if( --(pStack->count) == 0 ) { /* end of loop */ @@ -128,7 
+137,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* } conv_ptr = pConvertor->pBaseBuf + pStack->disp; UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); - DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", (int)pStack->count, pConvertor->stack_pos, pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); } @@ -145,6 +154,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* if( 0 == count_desc ) { /* completed */ pos_desc += pElem->loop.items + 1; goto update_loop_description; + } else { + contiguous_loop_flag = 1; } /* Save the stack with the correct last_count value. */ } @@ -153,7 +164,11 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pStack->disp + local_disp); pos_desc++; update_loop_description: /* update the current state */ - conv_ptr = pConvertor->pBaseBuf + pStack->disp; + if (contiguous_loop_flag) { + count_desc_tmp = count_desc; + } else { + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + } UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); continue; } @@ -163,13 +178,15 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* total_unpacked += iov[iov_count].iov_len; } complete_conversion: - cudaDeviceSynchronize(); + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; - DT_CUDA_DEBUG( opal_cuda_output( 0, "Total unpacked %lu\n", pConvertor->bConverted); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack total unpacked %lu\n", 
pConvertor->bConverted); ); if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; @@ -179,12 +196,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* /* Save the global position for the next round */ PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, conv_ptr - pConvertor->pBaseBuf ); - DT_CUDA_DEBUG( opal_cuda_output( 2, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } -int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index 7fed801766d..329ff4e62c1 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -560,7 +560,7 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, convertor->flags |= CONVERTOR_RECV; #if OPAL_CUDA_SUPPORT - mca_cuda_convertor_init(convertor, pUserBuf); + mca_cuda_convertor_init(convertor, pUserBuf, datatype); #endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -609,7 +609,7 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, { convertor->flags |= CONVERTOR_SEND; #if OPAL_CUDA_SUPPORT - mca_cuda_convertor_init(convertor, pUserBuf); + mca_cuda_convertor_init(convertor, pUserBuf, datatype); #endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 
af74ee1221c..822a91e85e0 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -114,6 +114,7 @@ struct opal_convertor_t { unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ size_t gpu_buffer_size; + size_t current_cuda_iov_count; #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index beb5d0e0e20..5fed516df4b 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -131,7 +131,11 @@ struct opal_datatype_t { int iov_count; size_t max_data; /* size: 416, cachelines: 7, members: 18 */ - +#if OPAL_CUDA_SUPPORT + void * cuda_iov_dist; + size_t cuda_iov_count; + int8_t cuda_iov_is_cached; +#endif /* OPAL_CUDA_SUPPORT */ /* last cacheline: 32 bytes */ }; diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 96c3221b94c..729e460de1a 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -61,7 +61,7 @@ void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function * is enabled or not. If CUDA is not enabled, then short circuit out * for all future calls. 
*/ -void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) +void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, const struct opal_datatype_t* datatype) { /* Only do the initialization on the first GPU access */ if (!initialized) { @@ -84,6 +84,18 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) if (OPAL_SUCCESS != opal_cuda_kernel_support_init()) { opal_cuda_kernel_support_fini(); } + if (opal_datatype_cuda_kernel_support == 1 && datatype->cuda_iov_is_cached == 0) { + struct opal_datatype_t* datatype_tmp = (opal_datatype_t *)datatype; + datatype_tmp->cuda_iov_dist = opal_cuda_iov_dist_init(); + if (datatype_tmp->cuda_iov_dist == (void*)0xDEADBEEF || datatype_tmp->cuda_iov_dist == NULL) { + /* either cuda iov cache is not enabled or cuda_iov_cache malloc is failed, then we do not cache cuda iov */ + datatype_tmp->cuda_iov_is_cached = -1; + } else { + /* cuda iov buffer is ready , the value will be marked to 2 when caching is finished*/ + datatype_tmp->cuda_iov_is_cached = 1; + } + } + } /* Checks the type of pointer @@ -241,6 +253,8 @@ int32_t opal_cuda_kernel_support_init(void) OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_malloc_gpu_buffer ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy_async ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_iov_dist_init ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_iov_dist_fini ); if (OPAL_SUCCESS != cuda_kernel_table.opal_ddt_cuda_kernel_init_p()) { return OPAL_ERROR; @@ -356,3 +370,22 @@ void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) } } +void* opal_cuda_iov_dist_init(void) +{ + if 
(cuda_kernel_table.opal_ddt_cuda_iov_dist_init_p != NULL) { + return cuda_kernel_table.opal_ddt_cuda_iov_dist_init_p(); + } else { + opal_output(0, "opal_ddt_cuda_iov_dist_init function pointer is NULL\n"); + return NULL; + } +} + +void opal_cuda_iov_dist_fini(void *cuda_iov_dist) +{ + if (cuda_kernel_table.opal_ddt_cuda_iov_dist_fini_p != NULL) { + cuda_kernel_table.opal_ddt_cuda_iov_dist_fini_p(cuda_iov_dist); + } else { + opal_output(0, "opal_ddt_cuda_iov_dist_fini function pointer is NULL\n"); + } +} + diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index 8b6f996e422..24e85f649b9 100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -28,6 +28,8 @@ struct opal_datatype_cuda_kernel_function_table { void* (*opal_ddt_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); void (*opal_ddt_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); void (*opal_ddt_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); + void* (*opal_ddt_cuda_iov_dist_init_p)(void); + void (*opal_ddt_cuda_iov_dist_fini_p)(void *cuda_iov_dist); int32_t (*opal_ddt_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); @@ -36,7 +38,7 @@ struct opal_datatype_cuda_kernel_function_table { typedef struct opal_datatype_cuda_kernel_function_table opal_datatype_cuda_kernel_function_table_t; extern int32_t opal_datatype_cuda_kernel_support; -void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf); +void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, const struct opal_datatype_t* datatype); bool 
opal_cuda_check_bufs(char *dest, char *src); void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor); void* opal_cuda_memcpy_sync(void * dest, const void * src, size_t size); @@ -54,5 +56,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); void opal_cuda_d2dcpy(void* dst, const void* src, size_t count); void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count); +void* opal_cuda_iov_dist_init(void); +void opal_cuda_iov_dist_fini(void *cuda_iov_dist); #endif diff --git a/opal/datatype/opal_datatype_destroy.c b/opal/datatype/opal_datatype_destroy.c index d468cd07e8c..8c225e698c0 100644 --- a/opal/datatype/opal_datatype_destroy.c +++ b/opal/datatype/opal_datatype_destroy.c @@ -22,10 +22,23 @@ #include "opal/constants.h" #include "opal/datatype/opal_datatype.h" #include "opal/datatype/opal_datatype_internal.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_convertor.h" +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ int32_t opal_datatype_destroy( opal_datatype_t** dt ) { opal_datatype_t* pData = *dt; + +#if OPAL_CUDA_SUPPORT + /* free cuda iov */ + if (opal_datatype_cuda_kernel_support== 1 && pData->cuda_iov_dist != NULL && pData->cuda_iov_dist != (void*)0xDEADBEEF) { + opal_cuda_iov_dist_fini(pData->cuda_iov_dist); + pData->cuda_iov_dist = NULL; + pData->cuda_iov_count = 0; + } +#endif /* OPAL_CUDA_SUPPORT */ if( (pData->flags & OPAL_DATATYPE_FLAG_PREDEFINED) && (pData->super.obj_reference_count <= 1) ) diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index e8b8d9794bd..b33b7347fd8 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -304,6 +304,13 @@ int32_t opal_datatype_commit( opal_datatype_t * pData ) pLast->size = pData->size; } +#if OPAL_CUDA_SUPPORT + /* cuda iov for caching, it will be malloced latter when init 
convertor */ + pData->cuda_iov_dist = NULL; + pData->cuda_iov_is_cached = 0; + pData->cuda_iov_count = 0; +#endif /* OPAL_CUDA_SUPPORT */ + /* save a compressed datatype description as a iovec list */ // opal_convertor_t* conv = opal_convertor_create( opal_local_arch, 0 /* unused */); // opal_convertor_prepare_for_send( conv, pData, 1, (void*)0 ); diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 0573db427df..9812a371a85 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -416,7 +416,7 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); -// return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); +// return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index 5f0ac368f68..f5e1e76588f 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -610,7 +610,7 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); -// return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); +// return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 45440dc2c04..1bb91f663c8 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1178,7 +1178,7 @@ int main( int argc, char* 
argv[] ) #endif opal_init_util(&argc, &argv); #if defined (DDT_TEST_CUDA) - // mca_common_cuda_stage_one_init(); + mca_common_cuda_stage_one_init(); #endif ompi_datatype_init(); @@ -1216,18 +1216,18 @@ int main( int argc, char* argv[] ) printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 2; i++) { - local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); + // local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); } ompi_datatype_t *column, *matt; - mat_size = 4000; -// ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); -// ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); -// ompi_datatype_commit( &matt ); -// local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); + mat_size = 1000; + ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); + ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); + ompi_datatype_commit( &matt ); + local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); int packed_size = 256; @@ -1279,13 +1279,13 @@ int main( int argc, char* argv[] ) } - for (blk_len = 64; blk_len <= 64; blk_len += 2) { + for (blk_len = 1000; blk_len <= 1000; blk_len += 2) { printf( ">>--------------------------------------------<<\n" ); printf( "Vector data-type (1024 times %d double stride 512)\n", blk_len ); - pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len+128); + pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 0; i < 4; i++) { - // vector_ddt( pdt, 1, pdt, 1, 1024*10240 , 1000, blk_len, blk_len+128); + for (i = 0; i < 1; i++) { + vector_ddt( pdt, 1, pdt, 1, 2000000 , 1000, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } From f3d37d8aabd3d2a124647eabce5314954dd385c0 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 6 
Nov 2015 20:40:13 -0500 Subject: [PATCH 140/190] Add support for caching the unpacked datatype description via the opal_convertor_raw_cached function. --- opal/datatype/opal_convertor.h | 13 ++++++++++++- opal/datatype/opal_convertor_raw.c | 27 ++++++++++++++++++++++++++- opal/datatype/opal_datatype.h | 3 +++ opal/datatype/opal_datatype_create.c | 10 +++++++++- 4 files changed, 50 insertions(+), 3 deletions(-) diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 822a91e85e0..fb8b4d630a4 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -289,6 +289,17 @@ opal_convertor_to_iov(struct opal_convertor_t *convertor, struct iovec **iov, uint32_t *iov_count, size_t *max_data); + +/** + * A straighforward description of the datatype in terms of a NULL + * based iovec (so basically displacements from the begining of a pointer, + * will be generated and stored in the datatype itself. This description + * can be used to pack/unpack the data manually. + */ +OPAL_DECLSPEC int +opal_convertor_raw_cached(struct opal_convertor_t *convertor, + const struct iovec **iov, + uint32_t* iov_count); /* * Upper level does not need to call the _nocheck function directly. 
*/ diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index 441ee9ee0fc..bf46a7a9d5a 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. @@ -240,3 +240,28 @@ opal_convertor_to_iov(struct opal_convertor_t *convertor, iovec = &((*iov)[*iov_count]); } } + +int opal_convertor_raw_cached(struct opal_convertor_t *convertor, + const struct iovec **iov, + uint32_t* iov_count) +{ + if( NULL == convertor->pDesc->cached_iovec ) { + struct opal_convertor_t conv; + size_t max_data; + + OBJ_CONSTRUCT(&conv, opal_convertor_t); + conv.remoteArch = convertor->remoteArch; + conv.stack_pos = 0; + conv.flags = convertor->flags; + conv.master = convertor->master; + opal_convertor_prepare_for_send(&conv, convertor->pDesc, 1, NULL); + opal_convertor_get_packed_size(&conv, &max_data); + opal_convertor_to_iov(&conv, (struct iovec **)&convertor->pDesc->cached_iovec, + (uint32_t *)&convertor->pDesc->cached_iovec_count, &max_data); + OBJ_DESTRUCT(&conv); + } + *iov = convertor->pDesc->cached_iovec; + *iov_count = convertor->pDesc->cached_iovec_count; + + return OPAL_SUCCESS; +} diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index 5fed516df4b..b15a8d845ee 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -137,6 +137,9 @@ struct opal_datatype_t { int8_t cuda_iov_is_cached; #endif /* OPAL_CUDA_SUPPORT */ /* last cacheline: 32 bytes */ + + struct iovec* cached_iovec; + uint32_t cached_iovec_count; }; typedef struct opal_datatype_t opal_datatype_t; diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c 
index e64e1f04190..b97a84f5174 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -53,6 +53,9 @@ static void opal_datatype_construct( opal_datatype_t* pData ) pData->opt_desc.length = 0; pData->opt_desc.used = 0; + pData->cached_iovec = NULL; + pData->cached_iovec_count = 0; + for( i = 0; i < OPAL_DATATYPE_MAX_SUPPORTED; i++ ) pData->btypes[i] = 0; } @@ -82,6 +85,11 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) /* make sure the name is set to empty */ datatype->name[0] = '\0'; + + if( NULL != datatype->cached_iovec ) { + free(datatype->cached_iovec); + datatype->cached_iovec = NULL; + } } OBJ_CLASS_INSTANCE(opal_datatype_t, opal_object_t, opal_datatype_construct, opal_datatype_destruct); From e4e11bc04ae95a167a2361a71b4f6c27f29ea3d3 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 6 Nov 2015 23:23:33 -0500 Subject: [PATCH 141/190] check point use raw_cached, but cuda iov caching is not enabled --- .../cuda/opal_datatype_cuda_internal.cuh | 8 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 293 +++++++++++++++++- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 254 ++++++++++++++- opal/datatype/opal_convertor.h | 2 + opal/datatype/opal_datatype_cuda.c | 3 + test/datatype/ddt_benchmark.c | 16 +- 6 files changed, 562 insertions(+), 14 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index ca630fc1b93..eff247a15d2 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ 
b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -18,7 +18,7 @@ #define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 #define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 #define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL 1 -#define OPAL_DATATYPE_CUDA_IOV_CACHE 1 +#define OPAL_DATATYPE_CUDA_IOV_CACHE 0 @@ -37,7 +37,7 @@ #define CUDA_IOV_MAX_TASK_PER_BLOCK 400 #define ALIGNMENT_DOUBLE 8 #define ALIGNMENT_FLOAT 4 -#define ALIGNMENT_CHAR 18 +#define ALIGNMENT_CHAR 1 #define NUM_CUDA_IOV_PER_DDT 100000 #define TIMER_DATA_TYPE struct timeval @@ -139,6 +139,10 @@ int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor, size_t int32_t opal_convertor_raw( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* iov_count, size_t* length ); + +int opal_convertor_raw_cached(struct opal_convertor_t *convertor, + const struct iovec **iov, + uint32_t* iov_count); } #endif /* OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index b82888a3f96..b2366b211f4 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -659,7 +659,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, #endif } -int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -776,6 +776,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve dst_offset = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; + source_base = (unsigned char*)pConvertor->pBaseBuf; while (cuda_iov_count > 0) { @@ -786,7 +787,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = 
cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); - source_base = (unsigned char*)cuda_iov[0].iov_base; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -941,6 +941,295 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve return 0; } +int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + size_t length, buffer_size, length_per_iovec; + unsigned char *destination, *destination_base, *source_base, *source; + size_t total_packed, total_converted; + int32_t complete_flag = 0; + uint8_t buffer_isfull = 0, transfer_required, free_required; + uint32_t convertor_flags; +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// dt_stack_t* pStack; + uint8_t alignment, orig_alignment; +// int32_t orig_stack_index; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count; + size_t iov_len; + int iov_start_pos, iov_end_pos; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + +#if OPAL_DATATYPE_CUDA_IOV_CACHE + opal_datatype_t *pDesc = (opal_datatype_t *)pConvertor->pDesc; + ddt_cuda_iov_dist_t *cuda_iov_dist_cache = (ddt_cuda_iov_dist_t *)pDesc->cuda_iov_dist; + cuda_iov_dist_cache += pDesc->cuda_iov_count; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + + /*description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + 
pConvertor->stack_pos; + pElem = &(description[pStack->index]); + printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); + */ + +// assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); + + // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (iov[0].iov_len == 0) { + buffer_size = DT_CUDA_BUFFER_SIZE; + } else { + buffer_size = iov[0].iov_len; + } + + if (iov[0].iov_base == NULL) { + iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char *)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; + free_required = 1; + } else { + destination = (unsigned char *)iov[0].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + } + transfer_required = 1; + free_required = 1; + destination = pConvertor->gpu_buffer_ptr; + } + } + +#if OPAL_DATATYPE_CUDA_IOV_CACHE + /* cuda iov is cached */ + if (pDesc->cuda_iov_is_cached == 2) { + pack_iov_cached(pConvertor, destination); + } +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + + cuda_iov_count = 4000;//CUDA_NB_IOV; + total_packed = 0; + total_converted = pConvertor->bConverted; + cuda_streams->current_stream_id = 0; + // orig_stack_index = pStack->index; + destination_base = destination; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); 
+#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + assert(ddt_iov != NULL); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); +#endif + + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + + iov_start_pos = pConvertor->current_iov_pos; + iov_end_pos = iov_start_pos + 1000; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + source_base = (unsigned char*)pConvertor->pBaseBuf; + + while (iov_start_pos < iov_end_pos && !buffer_isfull) { + + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + for (i = iov_start_pos; i < iov_end_pos; i++) { + if (pConvertor->current_iov_partial_length > 0) { + iov_len = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + iov_len = ddt_iov[i].iov_len; + } + if (buffer_size >= iov_len) { + length_per_iovec = iov_len; + } else { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + pConvertor->current_iov_partial_length = iov_len - length_per_iovec; + 
pConvertor->current_iov_pos = i; + } + buffer_size -= length_per_iovec; + total_packed += length_per_iovec; + source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + source_base; + + /* check alignment */ + if ((uintptr_t)(source) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)destination % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(source) % ALIGNMENT_FLOAT == 0 && (uintptr_t)destination % ALIGNMENT_FLOAT == 0 && length_per_iovec >= ALIGNMENT_FLOAT) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + j * thread_per_block * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = count_desc - j*thread_per_block; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, 
cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + if (buffer_isfull) { + break; + } + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), 
cudaMemcpyHostToDevice, *cuda_stream_iov); +#if OPAL_DATATYPE_CUDA_IOV_CACHE + cudaMemcpyAsync(cuda_iov_dist_cache, cuda_iov_dist_d_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyDeviceToDevice, *cuda_stream_iov); + pDesc->cuda_iov_count += nb_blocks_used; + cuda_iov_dist_cache += nb_blocks_used; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_pack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + +// orig_stack_index = pStack->index; + iov_start_pos = iov_end_pos; + iov_end_pos = iov_start_pos + 1000; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); + } + + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); +#endif + + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + pConvertor->bConverted += total_packed; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( 
end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } +#if OPAL_DATATYPE_CUDA_IOV_CACHE + pDesc->cuda_iov_is_cached = 2; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + return 1; + } + return 0; +} + + #if OPAL_DATATYPE_CUDA_IOV_CACHE void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index f483d230934..4b4438fa8e4 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -370,7 +370,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* return 0; } -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) @@ -469,6 +469,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon dst_offset = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; + destination_base = (unsigned char*)pConvertor->pBaseBuf; while (cuda_iov_count > 0) { @@ -479,7 +480,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); - destination_base = (unsigned char*)cuda_iov[0].iov_base; + #if 
defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -614,6 +615,255 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon return 0; } +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + size_t length, buffer_size, length_per_iovec; + unsigned char *source, *source_base, *destination_base, *destination; + size_t total_unpacked, total_converted; + int32_t complete_flag = 0; + uint8_t buffer_isfull = 0; + uint8_t free_required = 0; + uint32_t convertor_flags; +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// dt_stack_t* pStack; + uint8_t alignment, orig_alignment; +// int32_t orig_stack_index; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count; + size_t iov_len; + int iov_start_pos, iov_end_pos; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +/* description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pElem = &(description[pStack->index]); + printf("size elem %d, size %lu\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); +*/ + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + source = (unsigned char*)iov[0].iov_base; + free_required = 0; + } 
else { + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + } + source = pConvertor->gpu_buffer_ptr; + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, iov[0].iov_len); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); +#endif + +// cuda_err = cudaEventRecord(current_cuda_device->memcpy_event, current_cuda_device->cuda_streams->opal_cuda_stream[0]); +// opal_cuda_check_error(cuda_err); + + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + buffer_size = iov[0].iov_len; + cuda_iov_count = 1000; + total_unpacked = 0; + total_converted = pConvertor->bConverted; + cuda_streams->current_stream_id = 0; + convertor_flags = pConvertor->flags; +// orig_stack_index = pStack->index; + source_base = source; + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + assert(ddt_iov != NULL); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); +#endif + + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + + iov_start_pos = pConvertor->current_iov_pos; + iov_end_pos = iov_start_pos + 1000; + if 
(iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + destination_base = (unsigned char*)pConvertor->pBaseBuf; + + while (iov_start_pos < iov_end_pos && !buffer_isfull) { + + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + for (i = iov_start_pos; i < iov_end_pos; i++) { + if (pConvertor->current_iov_partial_length > 0) { + iov_len = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + iov_len = ddt_iov[i].iov_len; + } + if (buffer_size >= iov_len) { + length_per_iovec = iov_len; + } else { + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + pConvertor->current_iov_partial_length = iov_len - length_per_iovec; + pConvertor->current_iov_pos = i; + } + buffer_size -= length_per_iovec; + total_unpacked += length_per_iovec; + destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; + + /* check alignment */ + if ((uintptr_t)(destination) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)source % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(destination) % ALIGNMENT_FLOAT == 0 && (uintptr_t)source % ALIGNMENT_FLOAT == 0 && length_per_iovec >= ALIGNMENT_FLOAT) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + + //alignment = ALIGNMENT_DOUBLE; + + count_desc = 
length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + j * thread_per_block * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block;// * sizeof(double); + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; + } + + /* handle residue */ + if (residue_desc != 0) { + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; + 
cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; + } + + if (buffer_isfull) { + break; + } + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + + iov_start_pos = iov_end_pos; + iov_end_pos = iov_start_pos + 1000; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); + + } + + for (i = 0; i < NB_STREAMS; i++) { + 
cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + iov[0].iov_len = total_unpacked; + *max_data = total_unpacked; + *out_size = 1; + pConvertor->bConverted += total_unpacked; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; +} + void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index fb8b4d630a4..1ab600cc49b 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -115,6 +115,8 @@ struct opal_convertor_t { unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ size_t gpu_buffer_size; size_t current_cuda_iov_count; + size_t current_iov_pos; + size_t current_iov_partial_length; #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 729e460de1a..3b3fc556ef9 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -95,6 +95,9 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, datatype_tmp->cuda_iov_is_cached = 1; } } + convertor->current_cuda_iov_count = 0; + convertor->current_iov_pos = 0; + convertor->current_iov_partial_length = 0; } diff --git a/test/datatype/ddt_benchmark.c 
b/test/datatype/ddt_benchmark.c index 1bb91f663c8..50f62ec5839 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1211,12 +1211,12 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 6000; mat_size <= 6000; mat_size +=500) { + for (mat_size = 2000; mat_size <= 2000; mat_size +=500) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 1; i <= 2; i++) { - // local_copy_with_convertor(pdt, 1, 1024*1024*200, mat_size); + for (i = 1; i <= 1; i++) { + local_copy_with_convertor(pdt, 1, 4000001, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -1224,10 +1224,10 @@ int main( int argc, char* argv[] ) ompi_datatype_t *column, *matt; mat_size = 1000; - ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); - ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); - ompi_datatype_commit( &matt ); - local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); + // ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); + // ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); + // ompi_datatype_commit( &matt ); + // local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); int packed_size = 256; @@ -1285,7 +1285,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 1; i++) { - vector_ddt( pdt, 1, pdt, 1, 2000000 , 1000, blk_len, blk_len*2); + // vector_ddt( pdt, 1, pdt, 1, 2000000 , 1000, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } From bbd221f5721886328ce14a2a334e8cd837c55a2d Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Sun, 8 Nov 2015 15:23:02 -0500 Subject: [PATCH 142/190] check point, split iov into two version, 
non-cached and cached --- opal/datatype/cuda/opal_datatype_cuda.cu | 12 +- opal/datatype/cuda/opal_datatype_cuda.cuh | 36 +++-- .../cuda/opal_datatype_cuda_internal.cuh | 23 +++- .../cuda/opal_datatype_pack_cuda_kernel.cu | 70 +++++++++- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 125 ++++++------------ .../cuda/opal_datatype_unpack_cuda_kernel.cu | 43 +++++- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 63 +++++---- opal/datatype/opal_datatype_pack.c | 2 +- opal/datatype/opal_datatype_unpack.c | 2 +- test/datatype/ddt_benchmark.c | 2 +- 10 files changed, 238 insertions(+), 140 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 6a6e06ff28d..fbafc2bfbe2 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -220,8 +220,10 @@ int32_t opal_ddt_cuda_kernel_init(void) for (j = 0; j < NB_STREAMS; j++) { cudaStreamCreate(&(cuda_streams->opal_cuda_stream[j])); cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_h)), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_d)), sizeof(ddt_cuda_iov_dist_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_d)), 
sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); @@ -258,8 +260,10 @@ int32_t opal_ddt_cuda_kernel_fini(void) cudaStreamDestroy(cuda_devices[i].cuda_streams->opal_cuda_stream[j]); cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block[j]; if (cuda_iov_pipeline_block != NULL) { - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_h); - cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_d); + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); + cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_cached_d); cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index ea3631af67f..73a740a2822 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -12,22 +12,42 @@ int32_t opal_ddt_cuda_kernel_fini(void); int32_t opal_ddt_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, - size_t* max_data ); + size_t* max_data ); +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); - + int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, - size_t* max_data ); - -int32_t 
opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); + size_t* max_data ); + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index eff247a15d2..a91dd8e4f1b 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -50,16 +50,25 @@ typedef struct { uint32_t current_stream_id; } ddt_cuda_stream_t; +typedef struct { + unsigned char* src; + unsigned char* dst; + uint32_t nb_elements; + uint8_t element_alignment; +} ddt_cuda_iov_dist_non_cached_t; + typedef struct { size_t src_offset; size_t dst_offset; uint32_t nb_elements; uint8_t element_alignment; -} ddt_cuda_iov_dist_t; +} ddt_cuda_iov_dist_cached_t; typedef struct { - ddt_cuda_iov_dist_t* cuda_iov_dist_h; - ddt_cuda_iov_dist_t* cuda_iov_dist_d; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_h; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_d; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_d; cudaStream_t *cuda_stream; int32_t cuda_stream_id; cudaEvent_t cuda_event; @@ -118,9 +127,13 @@ __global__ 
void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, unsigned char* destination ); -__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); + +__global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); + +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); -__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 6b0e18b1078..ccf7d923af7 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -43,10 +43,10 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, } } -__global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +__global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used) { uint32_t i, _copy_count; - size_t src_offset, dst_offset; + unsigned char *src, *dst; uint8_t alignment; unsigned char 
*_source_tmp, *_destination_tmp; @@ -63,8 +63,8 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; + src = cuda_iov_dist[blockIdx.x + i * gridDim.x].src; + dst = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst; _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; @@ -73,8 +73,8 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c // } if (threadIdx.x < _copy_count) { - _source_tmp = source_base + src_offset + threadIdx.x * alignment; - _destination_tmp = destination_base + dst_offset + threadIdx.x * alignment; + _source_tmp = src + threadIdx.x * alignment; + _destination_tmp = dst + threadIdx.x * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); @@ -86,4 +86,62 @@ __global__ void opal_generic_simple_pack_cuda_iov_kernel( ddt_cuda_iov_dist_t* c #endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ } } +} + +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +{ + uint32_t i, _copy_count; + size_t src_offset, dst_offset; + uint8_t alignment; + unsigned char *_source_tmp, *_destination_tmp; + + __shared__ uint32_t nb_tasks; + __shared__ uint8_t my_alignment; + + if (threadIdx.x == 0) { + //printf("iov pack kernel \n"); + nb_tasks = nb_blocks_used / gridDim.x; + if (blockIdx.x < (nb_blocks_used % gridDim.x)) { + nb_tasks ++; + } + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); + } + __syncthreads(); + + for (i = 0; i < nb_tasks; i++) { + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; + _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; + alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; + + if (threadIdx.x == 0) { + _source_tmp = source_base + src_offset; + _destination_tmp = destination_base + dst_offset; + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0) { + my_alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0) { + my_alignment = ALIGNMENT_FLOAT; + } else { + my_alignment = ALIGNMENT_CHAR; + } + if (my_alignment != alignment) { + printf("my align %d, align %d\n", my_alignment, alignment); + } + } + __syncthreads(); + + if (threadIdx.x < _copy_count) { + _source_tmp = source_base + src_offset + threadIdx.x * alignment; + _destination_tmp = destination_base + dst_offset + threadIdx.x * alignment; +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + if (my_alignment == ALIGNMENT_DOUBLE) { + *((long *)_destination_tmp) = *((long *)_source_tmp); + } else if 
(my_alignment == ALIGNMENT_FLOAT) { + *((int *)_destination_tmp) = *((int *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } +#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + } + } } \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index b2366b211f4..b8dda932626 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -659,16 +659,24 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, #endif } -int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + return opal_ddt_generic_simple_pack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); +} + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, residue_desc; uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec, dst_offset; - unsigned char *destination, *destination_base, *source_base; + unsigned char *destination, *destination_base; size_t total_packed, total_converted; int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; @@ -680,8 +688,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_h_current; + 
ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; @@ -691,12 +699,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv long total_time, move_time; #endif -#if OPAL_DATATYPE_CUDA_IOV_CACHE - opal_datatype_t *pDesc = (opal_datatype_t *)pConvertor->pDesc; - ddt_cuda_iov_dist_t *cuda_iov_dist_cache = (ddt_cuda_iov_dist_t *)pDesc->cuda_iov_dist; - cuda_iov_dist_cache += pDesc->cuda_iov_count; -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ - /*description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); @@ -738,16 +740,10 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv free_required = 1; destination = pConvertor->gpu_buffer_ptr; } - } + } -#if OPAL_DATATYPE_CUDA_IOV_CACHE - /* cuda iov is cached */ - if (pDesc->cuda_iov_is_cached == 2) { - pack_iov_cached(pConvertor, destination); - } -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ - - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + destination_base = destination; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV non cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); cuda_iov_count = 1000;//CUDA_NB_IOV; total_packed = 0; @@ -755,7 +751,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; - destination_base = destination; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start_total); @@ -776,14 +771,13 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv dst_offset = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - source_base = (unsigned char*)pConvertor->pBaseBuf; 
while (cuda_iov_count > 0) { nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); @@ -819,8 +813,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].src_offset = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; + cuda_iov_dist_h_current[nb_blocks_used].dst = destination; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block; @@ -831,7 +825,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src 
%p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -840,15 +834,15 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv if (residue_desc != 0) { /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].dst = destination; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, 
cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -864,13 +858,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); -#if OPAL_DATATYPE_CUDA_IOV_CACHE - cudaMemcpyAsync(cuda_iov_dist_cache, cuda_iov_dist_d_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyDeviceToDevice, *cuda_stream_iov); - pDesc->cuda_iov_count += nb_blocks_used; - cuda_iov_dist_cache += nb_blocks_used; -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ - opal_generic_simple_pack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_non_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_pack_cuda_iov_non_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -933,18 +922,15 @@ int32_t 
opal_ddt_generic_simple_pack_function_cuda_iov2( opal_convertor_t* pConv opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } -#if OPAL_DATATYPE_CUDA_IOV_CACHE - pDesc->cuda_iov_is_cached = 2; -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ return 1; } return 0; } -int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, residue_desc; @@ -962,8 +948,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; @@ -977,12 +963,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve long total_time, move_time; #endif -#if OPAL_DATATYPE_CUDA_IOV_CACHE - opal_datatype_t *pDesc = (opal_datatype_t *)pConvertor->pDesc; - ddt_cuda_iov_dist_t *cuda_iov_dist_cache = (ddt_cuda_iov_dist_t *)pDesc->cuda_iov_dist; - cuda_iov_dist_cache += pDesc->cuda_iov_count; -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ - /*description = pConvertor->use_desc->desc; pStack = pConvertor->pStack + pConvertor->stack_pos; pElem = &(description[pStack->index]); @@ -1024,16 +1004,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve free_required = 1; destination = pConvertor->gpu_buffer_ptr; } - } - -#if 
OPAL_DATATYPE_CUDA_IOV_CACHE - /* cuda iov is cached */ - if (pDesc->cuda_iov_is_cached == 2) { - pack_iov_cached(pConvertor, destination); - } -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ + } - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); cuda_iov_count = 4000;//CUDA_NB_IOV; total_packed = 0; @@ -1074,8 +1047,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_d; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); @@ -1164,14 +1137,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); -#if OPAL_DATATYPE_CUDA_IOV_CACHE - cudaMemcpyAsync(cuda_iov_dist_cache, cuda_iov_dist_d_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyDeviceToDevice, *cuda_stream_iov); - pDesc->cuda_iov_count += nb_blocks_used; - cuda_iov_dist_cache += nb_blocks_used; -#endif /* 
OPAL_DATATYPE_CUDA_IOV_CACHE */ + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -1221,24 +1189,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); pConvertor->gpu_buffer_ptr = NULL; } -#if OPAL_DATATYPE_CUDA_IOV_CACHE - pDesc->cuda_iov_is_cached = 2; -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ return 1; } return 0; } - -#if OPAL_DATATYPE_CUDA_IOV_CACHE -void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination) -{ - const opal_datatype_t *datatype = pConvertor->pDesc; - DT_CUDA_DEBUG ( opal_cuda_output(2, "cuda iov cached %p, count %ld\n", datatype->cuda_iov_dist, datatype->cuda_iov_count ); ); -} -#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ - - void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, unsigned char** SOURCE, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index a23aff7710c..37527bd2071 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -6,7 +6,47 @@ #include -__global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* 
destination_base) +__global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used) +{ + uint32_t i, _copy_count; + unsigned char *src, *dst; + uint8_t alignment; + unsigned char *_source_tmp, *_destination_tmp; + + __shared__ uint32_t nb_tasks; + + if (threadIdx.x == 0) { + nb_tasks = nb_blocks_used / gridDim.x; + if (blockIdx.x < nb_blocks_used % gridDim.x) { + nb_tasks ++; + } + } + __syncthreads(); + + for (i = 0; i < nb_tasks; i++) { + src = cuda_iov_dist[blockIdx.x + i * gridDim.x].src; + dst = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst; + _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; + alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; + + if (threadIdx.x < _copy_count) { + _source_tmp = src + threadIdx.x * alignment; + _destination_tmp = dst + threadIdx.x * alignment; +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + if (alignment == ALIGNMENT_DOUBLE) { + *((long *)_destination_tmp) = *((long *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((int *)_destination_tmp) = *((int *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } + // printf("src %p, %1.f | dst %p, %1.f\n", _source_tmp, *_source_tmp, _destination_tmp, *_destination_tmp); +#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ + } + } +} + +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, _copy_count; size_t src_offset, dst_offset; @@ -45,6 +85,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_kernel( ddt_cuda_iov_dist_t* } } } + __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 4b4438fa8e4..fc7b3d28f6a 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -370,16 +370,24 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* return 0; } -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pConvertor, +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + return opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); +} + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, dst_offset, residue_desc; uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; - unsigned char *source, *source_base, *destination_base; + unsigned char *source, *source_base; size_t total_unpacked, total_converted; int32_t complete_flag = 0; uint8_t buffer_isfull = 0; @@ -392,8 +400,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t 
*cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; @@ -434,7 +442,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo } } - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV, GPU base %p, unpack from buffer %p, total size %ld\n", + source_base = source; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV non cached, GPU base %p, unpack from buffer %p, total size %ld\n", pConvertor->pBaseBuf, source, iov[0].iov_len); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -456,7 +465,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; - source_base = source; complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); @@ -469,14 +477,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo dst_offset = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - destination_base = (unsigned char*)pConvertor->pBaseBuf; while (cuda_iov_count > 0) { nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current 
= cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); @@ -515,8 +522,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; + cuda_iov_dist_h_current[nb_blocks_used].src = source; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block;// * sizeof(double); @@ -527,7 +534,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, 
cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); nb_blocks_used ++; } @@ -535,15 +542,15 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo if (residue_desc != 0) { /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].src = source; cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); 
nb_blocks_used ++; } @@ -558,8 +565,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_non_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_non_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -615,10 +622,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov2( opal_convertor_t* pCo return 0; } -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, residue_desc; @@ -637,8 +644,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_t* cuda_iov_dist_d_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; 
ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; @@ -683,7 +690,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon } } - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV, GPU base %p, unpack from buffer %p, total size %ld\n", + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV cached, GPU base %p, unpack from buffer %p, total size %ld\n", pConvertor->pBaseBuf, source, iov[0].iov_len); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -730,8 +737,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_d; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_d; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); @@ -821,8 +828,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, 
sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 9812a371a85..c8985db7913 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -416,7 +416,7 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); -// return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); + return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index f5e1e76588f..5f51b3f828b 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -610,7 +610,7 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); -// return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); + return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 50f62ec5839..c8c3fd7db45 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1216,7 +1216,7 @@ int main( int 
argc, char* argv[] ) printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { - local_copy_with_convertor(pdt, 1, 4000001, mat_size); + local_copy_with_convertor(pdt, 1, 40000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From a741011cee723f7660b351a7a397a05d5f08db47 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Sun, 8 Nov 2015 18:26:45 -0500 Subject: [PATCH 143/190] check point iov cache --- opal/datatype/cuda/opal_datatype_cuda.cu | 10 ++-- .../cuda/opal_datatype_cuda_internal.cuh | 3 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 46 +++++++++-------- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 24 ++++----- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 49 +++++++++++++------ .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 24 +++++---- opal/datatype/opal_datatype_create.c | 13 +++++ opal/datatype/opal_datatype_cuda.c | 7 ++- opal/datatype/opal_datatype_destroy.c | 15 +----- test/datatype/ddt_benchmark.c | 2 +- 10 files changed, 106 insertions(+), 87 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index fbafc2bfbe2..f53e006a4fd 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -282,13 +282,13 @@ int32_t opal_ddt_cuda_kernel_fini(void) void* opal_ddt_cuda_iov_dist_init(void) { #if OPAL_DATATYPE_CUDA_IOV_CACHE - ddt_cuda_iov_dist_t *p = NULL; - cudaMalloc((void **)(&p), sizeof(ddt_cuda_iov_dist_t) * NUM_CUDA_IOV_PER_DDT); + ddt_cuda_iov_dist_cached_t *p = NULL; + cudaMalloc((void **)(&p), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); if (p != NULL) { - DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist for ddt is successed %p.\n", p); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed %p.\n", p); ); return p; } else { - DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist for ddt is failed.\n"); ); + 
DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist_cached for ddt is failed.\n"); ); return NULL; } #else @@ -300,7 +300,7 @@ void* opal_ddt_cuda_iov_dist_init(void) void opal_ddt_cuda_iov_dist_fini(void* cuda_iov_dist) { #if OPAL_DATATYPE_CUDA_IOV_CACHE - ddt_cuda_iov_dist_t *p = (ddt_cuda_iov_dist_t *) cuda_iov_dist; + ddt_cuda_iov_dist_cached_t *p = (ddt_cuda_iov_dist_cached_t *) cuda_iov_dist; if (p != NULL) { cudaFree(p); DT_CUDA_DEBUG( opal_cuda_output( 2, "Free cuda_iov_dist for ddt is successed %p.\n", p); ); diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index a91dd8e4f1b..1fa0e17b4c7 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -60,8 +60,7 @@ typedef struct { typedef struct { size_t src_offset; size_t dst_offset; - uint32_t nb_elements; - uint8_t element_alignment; + uint32_t nb_bytes; } ddt_cuda_iov_dist_cached_t; typedef struct { diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index ccf7d923af7..f4a100b969d 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -90,16 +90,15 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { - uint32_t i, _copy_count; + uint32_t i, j; size_t src_offset, dst_offset; - uint8_t alignment; unsigned char *_source_tmp, *_destination_tmp; __shared__ uint32_t nb_tasks; - __shared__ uint8_t my_alignment; + __shared__ uint32_t copy_count; + __shared__ uint8_t alignment; if (threadIdx.x == 0) { - //printf("iov pack kernel \n"); nb_tasks = nb_blocks_used / gridDim.x; if (blockIdx.x < (nb_blocks_used % gridDim.x)) { 
nb_tasks ++; @@ -111,37 +110,36 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di for (i = 0; i < nb_tasks; i++) { src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; - _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; - alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; if (threadIdx.x == 0) { _source_tmp = source_base + src_offset; _destination_tmp = destination_base + dst_offset; - if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0) { - my_alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0) { - my_alignment = ALIGNMENT_FLOAT; + uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes >= ALIGNMENT_DOUBLE) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes >= ALIGNMENT_FLOAT) { + alignment = ALIGNMENT_FLOAT; } else { - my_alignment = ALIGNMENT_CHAR; - } - if (my_alignment != alignment) { - printf("my align %d, align %d\n", my_alignment, alignment); + alignment = ALIGNMENT_CHAR; } + copy_count = _nb_bytes / alignment; } __syncthreads(); - if (threadIdx.x < _copy_count) { - _source_tmp = source_base + src_offset + threadIdx.x * alignment; - _destination_tmp = destination_base + dst_offset + threadIdx.x * alignment; + for (j = threadIdx.x; j < copy_count; j += blockDim.x) { + if (j < copy_count) { + _source_tmp = source_base + src_offset + j * alignment; + _destination_tmp = destination_base + dst_offset + j * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) - if (my_alignment == ALIGNMENT_DOUBLE) { - *((long 
*)_destination_tmp) = *((long *)_source_tmp); - } else if (my_alignment == ALIGNMENT_FLOAT) { - *((int *)_destination_tmp) = *((int *)_source_tmp); - } else { - * _destination_tmp = *_source_tmp; - } + if (alignment == ALIGNMENT_DOUBLE) { + *((long *)_destination_tmp) = *((long *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((int *)_destination_tmp) = *((int *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } #endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + } } } } \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index b8dda932626..36cdcbaf3cd 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -664,7 +664,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_pack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); + return opal_ddt_generic_simple_pack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); } int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, @@ -1086,6 +1086,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* } else { alignment = ALIGNMENT_CHAR; } + + alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; @@ -1094,17 +1096,16 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + j * thread_per_block * alignment - source_base; cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { - 
cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_elements = count_desc - j*thread_per_block; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_elements %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -1115,13 +1116,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* orig_alignment = ALIGNMENT_CHAR; cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + length_per_iovec / alignment * alignment - source_base; cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; - cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + 
cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 37527bd2071..b1e2831f5c1 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -48,12 +48,13 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { - uint32_t i, _copy_count; + uint32_t i, j; size_t src_offset, dst_offset; - uint8_t alignment; unsigned char *_source_tmp, *_destination_tmp; __shared__ uint32_t nb_tasks; + __shared__ uint32_t copy_count; + 
__shared__ uint8_t alignment; if (threadIdx.x == 0) { nb_tasks = nb_blocks_used / gridDim.x; @@ -66,22 +67,40 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ for (i = 0; i < nb_tasks; i++) { src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; - _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; - alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; - if (threadIdx.x < _copy_count) { - _source_tmp = source_base + src_offset + threadIdx.x * alignment; - _destination_tmp = destination_base + dst_offset + threadIdx.x * alignment; + if (threadIdx.x == 0) { + _source_tmp = source_base + src_offset; + _destination_tmp = destination_base + dst_offset; + uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes >= ALIGNMENT_DOUBLE) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_source_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes >= ALIGNMENT_FLOAT) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + copy_count = _nb_bytes / alignment; + } + __syncthreads(); + + for (j = threadIdx.x; j < copy_count; j += blockDim.x) { +/* if (threadIdx.x == 0) { + if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); + }*/ + if (j < copy_count) { + _source_tmp = source_base + src_offset + j * alignment; + _destination_tmp = destination_base + dst_offset + j * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) - if (alignment == ALIGNMENT_DOUBLE) { - *((long *)_destination_tmp) = *((long *)_source_tmp); - } else if (alignment == ALIGNMENT_FLOAT) { - *((int *)_destination_tmp) = *((int *)_source_tmp); - } else { - * _destination_tmp = *_source_tmp; - } - // printf("src %p, %1.f | 
dst %p, %1.f\n", _source_tmp, *_source_tmp, _destination_tmp, *_destination_tmp); + if (alignment == ALIGNMENT_DOUBLE) { + *((long *)_destination_tmp) = *((long *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((int *)_destination_tmp) = *((int *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } + // printf("src %p, %1.f | dst %p, %1.f\n", _source_tmp, *_source_tmp, _destination_tmp, *_destination_tmp); #endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + } } } } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index fc7b3d28f6a..0bdf66638fc 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -375,7 +375,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); + return opal_ddt_generic_simple_unpack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); } int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, @@ -778,7 +778,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ alignment = ALIGNMENT_CHAR; } - //alignment = ALIGNMENT_DOUBLE; + alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; @@ -787,17 +787,16 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + j * thread_per_block * alignment - destination_base; cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; - cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; if ( (j+1) * thread_per_block <= count_desc) { - 
cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block;// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; } @@ -807,13 +806,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ orig_alignment = ALIGNMENT_CHAR; cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + length_per_iovec / alignment * alignment - destination_base; cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; - cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; - cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + 
cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %ld, dst %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; } diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index b97a84f5174..b95e13374d1 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -27,6 +27,10 @@ #include "opal/datatype/opal_datatype_internal.h" #include "limits.h" #include "opal/prefetch.h" +#if OPAL_CUDA_SUPPORT +//#include "opal/datatype/opal_convertor.h" +//#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ static void opal_datatype_construct( opal_datatype_t* pData ) { @@ -90,6 +94,15 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) free(datatype->cached_iovec); datatype->cached_iovec = NULL; } + +#if OPAL_CUDA_SUPPORT + /* free cuda iov */ +/* if (opal_datatype_cuda_kernel_support== 1 && datatype->cuda_iov_dist != NULL && datatype->cuda_iov_dist != (void*)0xDEADBEEF) { + 
opal_cuda_iov_dist_fini(datatype->cuda_iov_dist); + datatype->cuda_iov_dist = NULL; + datatype->cuda_iov_count = 0; + } */ +#endif /* OPAL_CUDA_SUPPORT */ } OBJ_CLASS_INSTANCE(opal_datatype_t, opal_object_t, opal_datatype_construct, opal_datatype_destruct); diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 3b3fc556ef9..e14e58bdb1c 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -84,7 +84,10 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, if (OPAL_SUCCESS != opal_cuda_kernel_support_init()) { opal_cuda_kernel_support_fini(); } - if (opal_datatype_cuda_kernel_support == 1 && datatype->cuda_iov_is_cached == 0) { + +#if 0 + convertor->flags &= ~CONVERTOR_CUDA; + if (opal_datatype_cuda_kernel_support == 1 && datatype->cuda_iov_is_cached == 0 && opal_convertor_need_buffers(convertor) == true) { struct opal_datatype_t* datatype_tmp = (opal_datatype_t *)datatype; datatype_tmp->cuda_iov_dist = opal_cuda_iov_dist_init(); if (datatype_tmp->cuda_iov_dist == (void*)0xDEADBEEF || datatype_tmp->cuda_iov_dist == NULL) { @@ -95,6 +98,8 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, datatype_tmp->cuda_iov_is_cached = 1; } } + convertor->flags |= CONVERTOR_CUDA; +#endif convertor->current_cuda_iov_count = 0; convertor->current_iov_pos = 0; convertor->current_iov_partial_length = 0; diff --git a/opal/datatype/opal_datatype_destroy.c b/opal/datatype/opal_datatype_destroy.c index 8c225e698c0..593d5bfd67a 100644 --- a/opal/datatype/opal_datatype_destroy.c +++ b/opal/datatype/opal_datatype_destroy.c @@ -21,24 +21,11 @@ #include "opal_config.h" #include "opal/constants.h" #include "opal/datatype/opal_datatype.h" -#include "opal/datatype/opal_datatype_internal.h" -#if OPAL_CUDA_SUPPORT -#include "opal/datatype/opal_convertor.h" -#include "opal/datatype/opal_datatype_cuda.h" -#endif /* OPAL_CUDA_SUPPORT */ +#include 
"opal/datatype/opal_datatype_internal.h" int32_t opal_datatype_destroy( opal_datatype_t** dt ) { opal_datatype_t* pData = *dt; - -#if OPAL_CUDA_SUPPORT - /* free cuda iov */ - if (opal_datatype_cuda_kernel_support== 1 && pData->cuda_iov_dist != NULL && pData->cuda_iov_dist != (void*)0xDEADBEEF) { - opal_cuda_iov_dist_fini(pData->cuda_iov_dist); - pData->cuda_iov_dist = NULL; - pData->cuda_iov_count = 0; - } -#endif /* OPAL_CUDA_SUPPORT */ if( (pData->flags & OPAL_DATATYPE_FLAG_PREDEFINED) && (pData->super.obj_reference_count <= 1) ) diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index c8c3fd7db45..50f62ec5839 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1216,7 +1216,7 @@ int main( int argc, char* argv[] ) printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { - local_copy_with_convertor(pdt, 1, 40000000, mat_size); + local_copy_with_convertor(pdt, 1, 4000001, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From 7b87adb011e13eef1b76fe19034a43a20c64a666 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Sun, 8 Nov 2015 22:04:22 -0500 Subject: [PATCH 144/190] another checkpoint --- opal/datatype/cuda/opal_datatype_cuda.cu | 15 +++++++++- opal/datatype/cuda/opal_datatype_cuda.cuh | 4 +++ .../cuda/opal_datatype_cuda_internal.cuh | 2 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 5 ++-- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 19 +++++------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 7 +++-- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 16 ++++------ opal/datatype/opal_convertor.h | 5 ++-- opal/datatype/opal_datatype.h | 6 ++-- opal/datatype/opal_datatype_create.c | 21 ++++++++----- opal/datatype/opal_datatype_cuda.c | 30 ++----------------- opal/datatype/opal_datatype_cuda.h | 1 - opal/datatype/opal_datatype_optimize.c | 7 ----- 13 files changed, 63 insertions(+), 75 deletions(-) diff --git 
a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index f53e006a4fd..e35fbcffd27 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -293,7 +293,7 @@ void* opal_ddt_cuda_iov_dist_init(void) } #else DT_CUDA_DEBUG( opal_cuda_output( 2, "cuda iov cache is not enabled.\n"); ); - return (void *)0xDEADBEEF; + return NULL; #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } @@ -308,6 +308,19 @@ void opal_ddt_cuda_iov_dist_fini(void* cuda_iov_dist) #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } +void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, + ddt_cuda_iov_dist_cached_t **cuda_iov_dist, + uint32_t* cuda_iov_count, uint8_t *cuda_iov_is_cached) +{ + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + if (datatype->cached_cuda_iov_dist == NULL) { + datatype->cached_cuda_iov_dist = opal_ddt_cuda_iov_dist_init(); + datatype->cached_cuda_iov_count = NUM_CUDA_IOV_PER_DDT; + } + *cuda_iov_dist = (ddt_cuda_iov_dist_cached_t *)datatype->cached_cuda_iov_dist; + *cuda_iov_count = datatype->cached_cuda_iov_count; +} + int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr) { int res; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 73a740a2822..0711b2c067d 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -121,6 +121,10 @@ void opal_ddt_cuda_iov_dist_fini(void *cuda_iov_dist); void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination); +void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, + ddt_cuda_iov_dist_cached_t **cuda_iov_dist, + uint32_t *cuda_iov_count, uint8_t *cuda_iov_is_cached); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 1fa0e17b4c7..779db2b385a 100644 --- 
a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -18,7 +18,7 @@ #define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 #define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 #define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL 1 -#define OPAL_DATATYPE_CUDA_IOV_CACHE 0 +#define OPAL_DATATYPE_CUDA_IOV_CACHE 1 diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index f4a100b969d..42acd8c4906 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -115,9 +115,10 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di _source_tmp = source_base + src_offset; _destination_tmp = destination_base + dst_offset; uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; - if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes >= ALIGNMENT_DOUBLE) { + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes >= ALIGNMENT_FLOAT) { + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 36cdcbaf3cd..bae2a714b79 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -956,7 +956,10 @@ int32_t 
opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count; size_t iov_len; - int iov_start_pos, iov_end_pos; + uint32_t iov_start_pos, iov_end_pos; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; + uint32_t cached_cuda_iov_count; + uint8_t cuda_iov_is_cached; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -1025,6 +1028,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); assert(ddt_iov != NULL); + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov_dist_d, &cached_cuda_iov_count, &cuda_iov_is_cached); + assert(cached_cuda_iov_dist_d != NULL); DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1078,15 +1083,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* total_packed += length_per_iovec; source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + source_base; - /* check alignment */ - if ((uintptr_t)(source) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)destination % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { - alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(source) % ALIGNMENT_FLOAT == 0 && (uintptr_t)destination % ALIGNMENT_FLOAT == 0 && length_per_iovec >= ALIGNMENT_FLOAT) { - alignment = ALIGNMENT_FLOAT; - } else { - alignment = ALIGNMENT_CHAR; - } - + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; @@ -1105,7 +1102,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ destination += 
cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_elements %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index b1e2831f5c1..1fe37218fba 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -72,9 +72,9 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ _source_tmp = source_base + src_offset; _destination_tmp = destination_base + dst_offset; uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; - if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes >= ALIGNMENT_DOUBLE) { + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_source_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes >= ALIGNMENT_FLOAT) { + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_source_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -90,6 +90,9 @@ __global__ void 
opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; _destination_tmp = destination_base + dst_offset + j * alignment; + /* if (threadIdx.x == 0) { + printf("_src %p, dst %p, alignment %d, blk %d, j %d, count %d\n", _source_tmp, _destination_tmp, alignment, blockIdx.x, j, copy_count); + }*/ #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 0bdf66638fc..ed105558f96 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -652,7 +652,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count; size_t iov_len; - int iov_start_pos, iov_end_pos; + uint32_t iov_start_pos, iov_end_pos; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; + uint32_t cached_cuda_iov_count; + uint8_t cuda_iov_is_cached; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -715,6 +718,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ source_base = source; opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); assert(ddt_iov != NULL); + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov_dist_d, &cached_cuda_iov_count, &cuda_iov_is_cached); + assert(cached_cuda_iov_dist_d != NULL); DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -769,15 +774,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_unpacked += length_per_iovec; destination = (size_t)(ddt_iov[i].iov_base) + 
(ddt_iov[i].iov_len - iov_len) + destination_base; - /* check alignment */ - if ((uintptr_t)(destination) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)source % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { - alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(destination) % ALIGNMENT_FLOAT == 0 && (uintptr_t)source % ALIGNMENT_FLOAT == 0 && length_per_iovec >= ALIGNMENT_FLOAT) { - alignment = ALIGNMENT_FLOAT; - } else { - alignment = ALIGNMENT_CHAR; - } - alignment = ALIGNMENT_DOUBLE; count_desc = length_per_iovec / alignment; diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 1ab600cc49b..b7c0a43a6ed 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -114,9 +114,10 @@ struct opal_convertor_t { unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ size_t gpu_buffer_size; - size_t current_cuda_iov_count; - size_t current_iov_pos; + uint32_t current_cuda_iov_pos; + uint32_t current_iov_pos; size_t current_iov_partial_length; + opal_datatype_count_t current_count; #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index b15a8d845ee..010a70b7270 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -132,9 +132,9 @@ struct opal_datatype_t { size_t max_data; /* size: 416, cachelines: 7, members: 18 */ #if OPAL_CUDA_SUPPORT - void * cuda_iov_dist; - size_t cuda_iov_count; - int8_t cuda_iov_is_cached; + void * cached_cuda_iov_dist; + uint32_t cached_cuda_iov_count; + uint8_t cuda_iov_is_cached; #endif /* OPAL_CUDA_SUPPORT */ /* last cacheline: 32 bytes */ diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index b95e13374d1..19caffe19ae 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -28,8 +28,8 @@ #include "limits.h" #include "opal/prefetch.h" #if 
OPAL_CUDA_SUPPORT -//#include "opal/datatype/opal_convertor.h" -//#include "opal/datatype/opal_datatype_cuda.h" +#include "opal/datatype/opal_convertor.h" +#include "opal/datatype/opal_datatype_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ static void opal_datatype_construct( opal_datatype_t* pData ) @@ -59,6 +59,12 @@ static void opal_datatype_construct( opal_datatype_t* pData ) pData->cached_iovec = NULL; pData->cached_iovec_count = 0; + +#if OPAL_CUDA_SUPPORT + pData->cached_cuda_iov_dist = NULL; + pData->cached_cuda_iov_count = 0; + pData->cuda_iov_is_cached = 0; +#endif /* OPAL_CUDA_SUPPORT */ for( i = 0; i < OPAL_DATATYPE_MAX_SUPPORTED; i++ ) pData->btypes[i] = 0; @@ -97,11 +103,12 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) #if OPAL_CUDA_SUPPORT /* free cuda iov */ -/* if (opal_datatype_cuda_kernel_support== 1 && datatype->cuda_iov_dist != NULL && datatype->cuda_iov_dist != (void*)0xDEADBEEF) { - opal_cuda_iov_dist_fini(datatype->cuda_iov_dist); - datatype->cuda_iov_dist = NULL; - datatype->cuda_iov_count = 0; - } */ + if (opal_datatype_cuda_kernel_support == 1 && datatype->cached_cuda_iov_dist != NULL) { + opal_cuda_iov_dist_fini(datatype->cached_cuda_iov_dist); + datatype->cached_cuda_iov_dist = NULL; + datatype->cached_cuda_iov_count = 0; + datatype->cuda_iov_is_cached = 0; + } #endif /* OPAL_CUDA_SUPPORT */ } diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index e14e58bdb1c..ddc48444777 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -85,25 +85,10 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, opal_cuda_kernel_support_fini(); } -#if 0 - convertor->flags &= ~CONVERTOR_CUDA; - if (opal_datatype_cuda_kernel_support == 1 && datatype->cuda_iov_is_cached == 0 && opal_convertor_need_buffers(convertor) == true) { - struct opal_datatype_t* datatype_tmp = (opal_datatype_t *)datatype; - datatype_tmp->cuda_iov_dist = 
opal_cuda_iov_dist_init(); - if (datatype_tmp->cuda_iov_dist == (void*)0xDEADBEEF || datatype_tmp->cuda_iov_dist == NULL) { - /* either cuda iov cache is not enabled or cuda_iov_cache malloc is failed, then we do not cache cuda iov */ - datatype_tmp->cuda_iov_is_cached = -1; - } else { - /* cuda iov buffer is ready , the value will be marked to 2 when caching is finished*/ - datatype_tmp->cuda_iov_is_cached = 1; - } - } - convertor->flags |= CONVERTOR_CUDA; -#endif - convertor->current_cuda_iov_count = 0; + convertor->current_cuda_iov_pos = 0; convertor->current_iov_pos = 0; convertor->current_iov_partial_length = 0; - + convertor->current_count = 0; } /* Checks the type of pointer @@ -261,7 +246,6 @@ int32_t opal_cuda_kernel_support_init(void) OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_malloc_gpu_buffer ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy_async ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_iov_dist_init ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_iov_dist_fini ); if (OPAL_SUCCESS != cuda_kernel_table.opal_ddt_cuda_kernel_init_p()) { @@ -378,16 +362,6 @@ void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) } } -void* opal_cuda_iov_dist_init(void) -{ - if (cuda_kernel_table.opal_ddt_cuda_iov_dist_init_p != NULL) { - return cuda_kernel_table.opal_ddt_cuda_iov_dist_init_p(); - } else { - opal_output(0, "opal_ddt_cuda_iov_dist_init function pointer is NULL\n"); - return NULL; - } -} - void opal_cuda_iov_dist_fini(void *cuda_iov_dist) { if (cuda_kernel_table.opal_ddt_cuda_iov_dist_fini_p != NULL) { diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index 
24e85f649b9..37af008daa8 100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -28,7 +28,6 @@ struct opal_datatype_cuda_kernel_function_table { void* (*opal_ddt_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); void (*opal_ddt_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); void (*opal_ddt_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); - void* (*opal_ddt_cuda_iov_dist_init_p)(void); void (*opal_ddt_cuda_iov_dist_fini_p)(void *cuda_iov_dist); int32_t (*opal_ddt_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index b33b7347fd8..e8b8d9794bd 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -304,13 +304,6 @@ int32_t opal_datatype_commit( opal_datatype_t * pData ) pLast->size = pData->size; } -#if OPAL_CUDA_SUPPORT - /* cuda iov for caching, it will be malloced latter when init convertor */ - pData->cuda_iov_dist = NULL; - pData->cuda_iov_is_cached = 0; - pData->cuda_iov_count = 0; -#endif /* OPAL_CUDA_SUPPORT */ - /* save a compressed datatype description as a iovec list */ // opal_convertor_t* conv = opal_convertor_create( opal_local_arch, 0 /* unused */); // opal_convertor_prepare_for_send( conv, pData, 1, (void*)0 ); From 270898b1117b289e944ac900e4ae1e3350ebaf2a Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 9 Nov 2015 00:17:39 -0500 Subject: [PATCH 145/190] check point, cuda iov is cached, but not used for pack/unpack --- opal/datatype/cuda/opal_datatype_cuda.cu | 30 ++- opal/datatype/cuda/opal_datatype_cuda.cuh | 4 + .../cuda/opal_datatype_cuda_internal.cuh | 3 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 185 
+++++++++--------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 178 +++++++++-------- test/datatype/ddt_benchmark.c | 4 +- 6 files changed, 226 insertions(+), 178 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index e35fbcffd27..ea77cadbae8 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -279,11 +279,11 @@ int32_t opal_ddt_cuda_kernel_fini(void) return OPAL_SUCCESS; } -void* opal_ddt_cuda_iov_dist_init(void) +void* opal_ddt_cuda_iov_dist_init(uint32_t size) { #if OPAL_DATATYPE_CUDA_IOV_CACHE ddt_cuda_iov_dist_cached_t *p = NULL; - cudaMalloc((void **)(&p), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + cudaMalloc((void **)(&p), sizeof(ddt_cuda_iov_dist_cached_t) * size); if (p != NULL) { DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed %p.\n", p); ); return p; @@ -314,13 +314,37 @@ void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; if (datatype->cached_cuda_iov_dist == NULL) { - datatype->cached_cuda_iov_dist = opal_ddt_cuda_iov_dist_init(); + datatype->cached_cuda_iov_dist = opal_ddt_cuda_iov_dist_init(NUM_CUDA_IOV_PER_DDT); datatype->cached_cuda_iov_count = NUM_CUDA_IOV_PER_DDT; } *cuda_iov_dist = (ddt_cuda_iov_dist_cached_t *)datatype->cached_cuda_iov_dist; *cuda_iov_count = datatype->cached_cuda_iov_count; } +void opal_ddt_set_cuda_iov_is_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) +{ + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + assert(datatype->cached_cuda_iov_dist != NULL); + datatype->cached_cuda_iov_count = cuda_iov_count; +} + +void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) +{ + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + assert(datatype->cached_cuda_iov_dist != NULL); + if 
(datatype->cached_cuda_iov_count < cuda_iov_count) { + printf("cuda count %d, new count %d\n", datatype->cached_cuda_iov_count, cuda_iov_count); + // assert(0); + void *old_iov = datatype->cached_cuda_iov_dist; + void *new_iov = opal_ddt_cuda_iov_dist_init(datatype->cached_cuda_iov_count + NUM_CUDA_IOV_PER_DDT); + assert(new_iov != NULL); + cudaMemcpy(new_iov, old_iov, datatype->cached_cuda_iov_count * sizeof(ddt_cuda_iov_dist_cached_t), cudaMemcpyDeviceToDevice); + datatype->cached_cuda_iov_dist = new_iov; + datatype->cached_cuda_iov_count += NUM_CUDA_IOV_PER_DDT; + opal_ddt_cuda_iov_dist_fini(old_iov); + } +} + int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr) { int res; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 0711b2c067d..ea89dda3c53 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -124,6 +124,10 @@ void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination); void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_dist_cached_t **cuda_iov_dist, uint32_t *cuda_iov_count, uint8_t *cuda_iov_is_cached); + +void opal_ddt_set_cuda_iov_is_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); + +void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); } diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 779db2b385a..d34e6039ff3 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -38,7 +38,8 @@ #define ALIGNMENT_DOUBLE 8 #define ALIGNMENT_FLOAT 4 #define ALIGNMENT_CHAR 1 -#define NUM_CUDA_IOV_PER_DDT 100000 +#define NUM_CUDA_IOV_PER_DDT 150000 +#define IOV_PIPELINE_SIZE 1000 #define TIMER_DATA_TYPE struct timeval #define GET_TIME(TV) gettimeofday( &(TV), NULL ) diff --git 
a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index bae2a714b79..b5155a0e9e1 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -1011,7 +1011,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - cuda_iov_count = 4000;//CUDA_NB_IOV; total_packed = 0; total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; @@ -1041,116 +1040,126 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - iov_start_pos = pConvertor->current_iov_pos; - iov_end_pos = iov_start_pos + 1000; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - source_base = (unsigned char*)pConvertor->pBaseBuf; + while(pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + iov_start_pos = pConvertor->current_iov_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + source_base = (unsigned char*)pConvertor->pBaseBuf; - while (iov_start_pos < iov_end_pos && !buffer_isfull) { + while (iov_start_pos < iov_end_pos && !buffer_isfull) { - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_d; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); + nb_blocks_used = 0; + cuda_iov_pipeline_block = 
current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - for (i = iov_start_pos; i < iov_end_pos; i++) { - if (pConvertor->current_iov_partial_length > 0) { - iov_len = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - iov_len = ddt_iov[i].iov_len; - } - if (buffer_size >= iov_len) { - length_per_iovec = iov_len; - } else { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - length_per_iovec = buffer_size / orig_alignment * orig_alignment; - buffer_isfull = 1; - pConvertor->current_iov_partial_length = iov_len - length_per_iovec; - pConvertor->current_iov_pos = i; - } - buffer_size -= length_per_iovec; - total_packed += length_per_iovec; - source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + source_base; - - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + j * thread_per_block * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = 
destination - destination_base; - if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + iov_len = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + iov_len = ddt_iov[i].iov_len; + } + if (buffer_size >= iov_len) { + length_per_iovec = iov_len; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + pConvertor->current_iov_partial_length = iov_len - length_per_iovec; + pConvertor->current_iov_pos = i; } + buffer_size -= length_per_iovec; + total_packed += length_per_iovec; + source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + source_base; + + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + j * thread_per_block * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = 
(count_desc - j*thread_per_block) * alignment; + } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } - /* handle residue */ - if (residue_desc != 0) { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + /* handle residue */ + if (residue_desc != 0) { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset 
= destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } } - if (buffer_isfull) { - break; + if (!buffer_isfull) { + pConvertor->current_iov_pos = i; } - } #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, 
nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + // opal_ddt_check_cuda_iov_is_full(pConvertor, pConvertor->current_cuda_iov_pos + nb_blocks_used); /* make sure cuda iov has enough space */ + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + pConvertor->current_cuda_iov_pos += nb_blocks_used; -// orig_stack_index = pStack->index; - iov_start_pos = iov_end_pos; - iov_end_pos = iov_start_pos + 1000; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; + // orig_stack_index = pStack->index; + iov_start_pos = iov_end_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos >= ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + /* count = 0 done, iov cached finished */ + if (pConvertor->current_iov_pos == ddt_iov_count) { + 
pConvertor->current_count ++; + opal_ddt_set_cuda_iov_is_cached(pConvertor, pConvertor->current_cuda_iov_pos); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); + } + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); } - DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); } - for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index ed105558f96..7d3dfa404ac 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -709,7 +709,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ GET_TIME(start); #endif buffer_size = iov[0].iov_len; - cuda_iov_count = 1000; total_unpacked = 0; total_converted = pConvertor->bConverted; cuda_streams->current_stream_id = 0; @@ -731,111 +730,122 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - iov_start_pos = pConvertor->current_iov_pos; - iov_end_pos = iov_start_pos + 1000; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - destination_base = (unsigned char*)pConvertor->pBaseBuf; + while(pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + iov_start_pos = pConvertor->current_iov_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + destination_base = (unsigned char*)pConvertor->pBaseBuf; - while (iov_start_pos < iov_end_pos && !buffer_isfull) { + while (iov_start_pos < 
iov_end_pos && !buffer_isfull) { - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_d; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); #if defined (OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - for (i = iov_start_pos; i < iov_end_pos; i++) { - if (pConvertor->current_iov_partial_length > 0) { - iov_len = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - iov_len = ddt_iov[i].iov_len; - } - if (buffer_size >= iov_len) { - length_per_iovec = iov_len; - } else { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - length_per_iovec = buffer_size / orig_alignment * orig_alignment; - buffer_isfull = 1; - pConvertor->current_iov_partial_length = iov_len - length_per_iovec; - pConvertor->current_iov_pos = i; - } - buffer_size -= length_per_iovec; - total_unpacked += length_per_iovec; - destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; - - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - 
nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + j * thread_per_block * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; - if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + iov_len = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + iov_len = ddt_iov[i].iov_len; + } + if (buffer_size >= iov_len) { + length_per_iovec = iov_len; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + pConvertor->current_iov_partial_length = iov_len - length_per_iovec; + pConvertor->current_iov_pos = i; } + buffer_size -= length_per_iovec; + total_unpacked += length_per_iovec; + destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; + + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + 
cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + j * thread_per_block * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; + } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - } + source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + } - /* handle residue */ - if (residue_desc != 0) { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + /* handle residue */ + if (residue_desc != 0) { + /* orig_alignment = 
opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %ld, dst %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; + source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %ld, dst %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + } } - - if (buffer_isfull) { - break; + + if (!buffer_isfull) { + pConvertor->current_iov_pos = i; } - } #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, 
nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + pConvertor->current_cuda_iov_pos += nb_blocks_used; - iov_start_pos = iov_end_pos; - iov_end_pos = iov_start_pos + 1000; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); + iov_start_pos = iov_end_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos >= ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + /* finished */ + if (pConvertor->current_iov_pos == ddt_iov_count) { + pConvertor->current_count ++; + opal_ddt_set_cuda_iov_is_cached(pConvertor, pConvertor->current_cuda_iov_pos); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); + } + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start 
pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); + } + } for (i = 0; i < NB_STREAMS; i++) { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 50f62ec5839..f2112d598b2 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1211,12 +1211,12 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 2000; mat_size <= 2000; mat_size +=500) { + for (mat_size = 1000; mat_size <= 1000; mat_size +=500) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { - local_copy_with_convertor(pdt, 1, 4000001, mat_size); + local_copy_with_convertor(pdt, 1, 4000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From ee0408f6fb2f2becb1cf263ccc20e6adc10b2a62 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 9 Nov 2015 22:07:04 -0500 Subject: [PATCH 146/190] check point, ready to use cached cuda iov --- opal/datatype/cuda/opal_datatype_cuda.cu | 72 ++++++++++++------- opal/datatype/cuda/opal_datatype_cuda.cuh | 12 ++-- .../cuda/opal_datatype_cuda_internal.cuh | 17 +++-- .../cuda/opal_datatype_pack_cuda_kernel.cu | 13 ++-- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 37 +++++++--- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 13 ++-- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 38 +++++++--- opal/datatype/opal_datatype.h | 4 +- opal/datatype/opal_datatype_create.c | 12 ++-- opal/datatype/opal_datatype_cuda.c | 11 +-- opal/datatype/opal_datatype_cuda.h | 6 +- 11 files changed, 147 insertions(+), 88 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index ea77cadbae8..3ac7ba0ac5f 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -223,7 +223,8 @@ int32_t 
opal_ddt_cuda_kernel_init(void) cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_d)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_h)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_d)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); @@ -263,7 +264,8 @@ int32_t opal_ddt_cuda_kernel_fini(void) cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); - cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_cached_d); + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_contig_buf_h); + cudaFree(cuda_iov_pipeline_block->cuda_iov_contig_buf_d); cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; @@ -279,14 +281,20 @@ int32_t opal_ddt_cuda_kernel_fini(void) return OPAL_SUCCESS; } -void* opal_ddt_cuda_iov_dist_init(uint32_t size) +void* opal_ddt_cached_cuda_iov_init(uint32_t size) { #if 
OPAL_DATATYPE_CUDA_IOV_CACHE - ddt_cuda_iov_dist_cached_t *p = NULL; - cudaMalloc((void **)(&p), sizeof(ddt_cuda_iov_dist_cached_t) * size); - if (p != NULL) { - DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed %p.\n", p); ); - return p; + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)malloc(sizeof(ddt_cuda_iov_total_cached_t)); + ddt_cuda_iov_dist_cached_t *tmp_cuda_iov_d = NULL; + cudaMalloc((void **)(&tmp_cuda_iov_d), sizeof(ddt_cuda_iov_dist_cached_t) * size); + uint32_t *tmp_nb_bytes = (uint32_t *)malloc(sizeof(uint32_t) * size); + if (tmp != NULL && tmp_cuda_iov_d != NULL && tmp_nb_bytes != NULL) { + tmp->cuda_iov_dist_d = tmp_cuda_iov_d; + tmp->cuda_iov_count = size; + tmp->cuda_iov_is_cached = 0; + tmp->nb_bytes_h = tmp_nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, cuda_iov_d %p, nb_bytes_h %p, size %d.\n", tmp, tmp_cuda_iov_d, tmp_nb_bytes, size); ); + return tmp; } else { DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist_cached for ddt is failed.\n"); ); return NULL; @@ -297,39 +305,54 @@ void* opal_ddt_cuda_iov_dist_init(uint32_t size) #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } -void opal_ddt_cuda_iov_dist_fini(void* cuda_iov_dist) +void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) { #if OPAL_DATATYPE_CUDA_IOV_CACHE - ddt_cuda_iov_dist_cached_t *p = (ddt_cuda_iov_dist_cached_t *) cuda_iov_dist; - if (p != NULL) { - cudaFree(p); - DT_CUDA_DEBUG( opal_cuda_output( 2, "Free cuda_iov_dist for ddt is successed %p.\n", p); ); + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *) cached_cuda_iov; + if (tmp != NULL) { + DT_CUDA_DEBUG( opal_cuda_output( 2, "Free cuda_iov_dist for ddt is successed %p.\n", tmp); ); + if (tmp->cuda_iov_dist_d != NULL) { + cudaFree(tmp->cuda_iov_dist_d); + tmp->cuda_iov_dist_d = NULL; + } + if (tmp->nb_bytes_h != NULL) { + free(tmp->nb_bytes_h); + tmp->nb_bytes_h = NULL; + } 
+ free(tmp); + tmp = NULL; } #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } -void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, - ddt_cuda_iov_dist_cached_t **cuda_iov_dist, - uint32_t* cuda_iov_count, uint8_t *cuda_iov_is_cached) +void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; - if (datatype->cached_cuda_iov_dist == NULL) { - datatype->cached_cuda_iov_dist = opal_ddt_cuda_iov_dist_init(NUM_CUDA_IOV_PER_DDT); - datatype->cached_cuda_iov_count = NUM_CUDA_IOV_PER_DDT; + if (datatype->cached_cuda_iov == NULL) { + datatype->cached_cuda_iov = opal_ddt_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); } - *cuda_iov_dist = (ddt_cuda_iov_dist_cached_t *)datatype->cached_cuda_iov_dist; - *cuda_iov_count = datatype->cached_cuda_iov_count; + *cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; } -void opal_ddt_set_cuda_iov_is_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) +void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; - assert(datatype->cached_cuda_iov_dist != NULL); - datatype->cached_cuda_iov_count = cuda_iov_count; + assert(datatype->cached_cuda_iov != NULL); + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + tmp->cuda_iov_count = cuda_iov_count; +} + +uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) +{ + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + assert(datatype->cached_cuda_iov != NULL); + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + return tmp->cuda_iov_is_cached; } void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) { +#if 0 opal_datatype_t *datatype = (opal_datatype_t 
*)convertor->pDesc; assert(datatype->cached_cuda_iov_dist != NULL); if (datatype->cached_cuda_iov_count < cuda_iov_count) { @@ -343,6 +366,7 @@ void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_ datatype->cached_cuda_iov_count += NUM_CUDA_IOV_PER_DDT; opal_ddt_cuda_iov_dist_fini(old_iov); } +#endif } int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index ea89dda3c53..6c071188c2c 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -115,17 +115,17 @@ void opal_ddt_cuda_d2dcpy(void* dst, const void* src, size_t count); void opal_dump_cuda_list(ddt_cuda_list_t *list); -void* opal_ddt_cuda_iov_dist_init(void); +void* opal_ddt_cached_cuda_iov_init(void); -void opal_ddt_cuda_iov_dist_fini(void *cuda_iov_dist); +void opal_ddt_cached_cuda_iov_fini(void *cached_cuda_iov); void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination); -void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, - ddt_cuda_iov_dist_cached_t **cuda_iov_dist, - uint32_t *cuda_iov_count, uint8_t *cuda_iov_is_cached); +void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov); -void opal_ddt_set_cuda_iov_is_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); +void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); + +uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor); void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index d34e6039ff3..1b47b89f1d0 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -59,16 +59,23 @@ 
typedef struct { } ddt_cuda_iov_dist_non_cached_t; typedef struct { - size_t src_offset; - size_t dst_offset; + size_t ptr_offset; uint32_t nb_bytes; } ddt_cuda_iov_dist_cached_t; +typedef struct { + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d; + uint32_t cuda_iov_count; + uint32_t* nb_bytes_h; + uint8_t cuda_iov_is_cached; +} ddt_cuda_iov_total_cached_t; + typedef struct { ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_h; ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_d; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_d; + uintptr_t *cuda_iov_contig_buf_h; + uintptr_t *cuda_iov_contig_buf_d; cudaStream_t *cuda_stream; int32_t cuda_stream_id; cudaEvent_t cuda_event; @@ -131,9 +138,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 42acd8c4906..e85b83e55b5 100644 --- 
a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,10 +88,11 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base) { uint32_t i, j; - size_t src_offset, dst_offset; + size_t src_offset; + unsigned char *dst; unsigned char *_source_tmp, *_destination_tmp; __shared__ uint32_t nb_tasks; @@ -108,12 +109,12 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; + dst = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; if (threadIdx.x == 0) { _source_tmp = source_base + src_offset; - _destination_tmp = destination_base + dst_offset; + _destination_tmp = dst; uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { @@ -130,7 +131,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; - _destination_tmp = destination_base + dst_offset + j * alignment; + _destination_tmp = dst + j * alignment; #if !defined 
(OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index b5155a0e9e1..e6b9545226f 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -950,6 +950,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + uintptr_t *cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; @@ -957,7 +958,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t ddt_iov_count; size_t iov_len; uint32_t iov_start_pos, iov_end_pos; + ddt_cuda_iov_total_cached_t* cached_cuda_iov; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; + uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; uint32_t cached_cuda_iov_count; uint8_t cuda_iov_is_cached; @@ -1027,8 +1030,13 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); assert(ddt_iov != NULL); - opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov_dist_d, &cached_cuda_iov_count, &cuda_iov_is_cached); + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + cuda_iov_is_cached = 
cached_cuda_iov->cuda_iov_is_cached; DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1040,7 +1048,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - while(pConvertor->current_count < pConvertor->count && !buffer_isfull) { + /* cuda iov is not cached, start to cache iov */ + if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { iov_start_pos = pConvertor->current_iov_pos; iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; @@ -1054,7 +1063,10 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); @@ -1092,8 +1104,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + j * thread_per_block * alignment - 
source_base; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; if ( (j+1) * thread_per_block <= count_desc) { cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; } else { @@ -1102,8 +1114,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -1112,14 +1125,15 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* if (residue_desc != 0) { /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - 
length_per_iovec / alignment * alignment) / orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -1137,8 +1151,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* // opal_ddt_check_cuda_iov_is_full(pConvertor, pConvertor->current_cuda_iov_pos + nb_blocks_used); /* make sure cuda iov has enough space */ cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, 
destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -1154,7 +1169,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* count = 0 done, iov cached finished */ if (pConvertor->current_iov_pos == ddt_iov_count) { pConvertor->current_count ++; - opal_ddt_set_cuda_iov_is_cached(pConvertor, pConvertor->current_cuda_iov_pos); + opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); } DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 1fe37218fba..9ea9414ba77 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,10 +46,11 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base) { uint32_t i, j; - size_t src_offset, dst_offset; + size_t dst_offset; + unsigned char *src; unsigned char 
*_source_tmp, *_destination_tmp; __shared__ uint32_t nb_tasks; @@ -65,11 +66,11 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].src_offset; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst_offset; + src = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; if (threadIdx.x == 0) { - _source_tmp = source_base + src_offset; + _source_tmp = src; _destination_tmp = destination_base + dst_offset; uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { @@ -88,7 +89,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); }*/ if (j < copy_count) { - _source_tmp = source_base + src_offset + j * alignment; + _source_tmp = src + j * alignment; _destination_tmp = destination_base + dst_offset + j * alignment; /* if (threadIdx.x == 0) { printf("_src %p, dst %p, alignment %d, blk %d, j %d, count %d\n", _source_tmp, _destination_tmp, alignment, blockIdx.x, j, copy_count); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 7d3dfa404ac..49171e5b277 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -646,6 +646,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + uintptr_t 
*cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; @@ -653,7 +654,9 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint32_t ddt_iov_count; size_t iov_len; uint32_t iov_start_pos, iov_end_pos; + ddt_cuda_iov_total_cached_t* cached_cuda_iov; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; + uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; uint32_t cached_cuda_iov_count; uint8_t cuda_iov_is_cached; @@ -717,8 +720,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ source_base = source; opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); assert(ddt_iov != NULL); - opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov_dist_d, &cached_cuda_iov_count, &cuda_iov_is_cached); + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + cuda_iov_is_cached = cached_cuda_iov->cuda_iov_is_cached; DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -730,7 +738,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; - while(pConvertor->current_count < pConvertor->count && !buffer_isfull) { + /* cuda iov is not cached, start to cache iov */ + if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { iov_start_pos = pConvertor->current_iov_pos; iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; @@ -744,7 +753,11 @@ int32_t 
opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov pos %d\n", pConvertor->current_cuda_iov_pos);); cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); opal_cuda_check_error(cuda_err); @@ -782,8 +795,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + j * thread_per_block * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; if ( (j+1) * thread_per_block <= count_desc) { cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; } else { @@ -792,8 +805,9 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes 
> 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; } @@ -801,14 +815,15 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (residue_desc != 0) { /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].dst_offset = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].src_offset = source - source_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( 
opal_cuda_output(12, "Unpack \tblock %d, src %ld, dst %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src_offset, cuda_iov_dist_h_current[nb_blocks_used].dst_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); nb_blocks_used ++; } } @@ -824,7 +839,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, nb_blocks_used, source_base, destination_base); + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -839,7 +855,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ /* finished */ if (pConvertor->current_iov_pos == ddt_iov_count) { pConvertor->current_count ++; - opal_ddt_set_cuda_iov_is_cached(pConvertor, pConvertor->current_cuda_iov_pos); + opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); } DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, 
cuda_streams->current_stream_id); ); diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index 010a70b7270..01f876fd795 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -132,9 +132,7 @@ struct opal_datatype_t { size_t max_data; /* size: 416, cachelines: 7, members: 18 */ #if OPAL_CUDA_SUPPORT - void * cached_cuda_iov_dist; - uint32_t cached_cuda_iov_count; - uint8_t cuda_iov_is_cached; + void * cached_cuda_iov; #endif /* OPAL_CUDA_SUPPORT */ /* last cacheline: 32 bytes */ diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index 19caffe19ae..44c0e3020b6 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -61,9 +61,7 @@ static void opal_datatype_construct( opal_datatype_t* pData ) pData->cached_iovec_count = 0; #if OPAL_CUDA_SUPPORT - pData->cached_cuda_iov_dist = NULL; - pData->cached_cuda_iov_count = 0; - pData->cuda_iov_is_cached = 0; + pData->cached_cuda_iov = NULL; #endif /* OPAL_CUDA_SUPPORT */ for( i = 0; i < OPAL_DATATYPE_MAX_SUPPORTED; i++ ) @@ -103,11 +101,9 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) #if OPAL_CUDA_SUPPORT /* free cuda iov */ - if (opal_datatype_cuda_kernel_support == 1 && datatype->cached_cuda_iov_dist != NULL) { - opal_cuda_iov_dist_fini(datatype->cached_cuda_iov_dist); - datatype->cached_cuda_iov_dist = NULL; - datatype->cached_cuda_iov_count = 0; - datatype->cuda_iov_is_cached = 0; + if (opal_datatype_cuda_kernel_support == 1 && datatype->cached_cuda_iov != NULL) { + opal_cached_cuda_iov_fini(datatype->cached_cuda_iov); + datatype->cached_cuda_iov = NULL; } #endif /* OPAL_CUDA_SUPPORT */ } diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index ddc48444777..c65e635a506 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -246,7 +246,7 @@ int32_t opal_cuda_kernel_support_init(void) 
OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_malloc_gpu_buffer ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy_async ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy ); - OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_iov_dist_fini ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cached_cuda_iov_fini ); if (OPAL_SUCCESS != cuda_kernel_table.opal_ddt_cuda_kernel_init_p()) { return OPAL_ERROR; @@ -272,6 +272,7 @@ int32_t opal_cuda_kernel_support_fini(void) cuda_kernel_table.opal_ddt_cuda_malloc_gpu_buffer_p = NULL; cuda_kernel_table.opal_ddt_cuda_d2dcpy_async_p = NULL; cuda_kernel_table.opal_ddt_cuda_d2dcpy_p = NULL; + cuda_kernel_table.opal_ddt_cached_cuda_iov_fini_p = NULL; dlclose(opal_datatype_cuda_kernel_handle); opal_datatype_cuda_kernel_handle = NULL; @@ -362,12 +363,12 @@ void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) } } -void opal_cuda_iov_dist_fini(void *cuda_iov_dist) +void opal_cached_cuda_iov_fini(void *cached_cuda_iov) { - if (cuda_kernel_table.opal_ddt_cuda_iov_dist_fini_p != NULL) { - cuda_kernel_table.opal_ddt_cuda_iov_dist_fini_p(cuda_iov_dist); + if (cuda_kernel_table.opal_ddt_cached_cuda_iov_fini_p != NULL) { + cuda_kernel_table.opal_ddt_cached_cuda_iov_fini_p(cached_cuda_iov); } else { - opal_output(0, "opal_ddt_cuda_iov_dist_fini function pointer is NULL\n"); + opal_output(0, "opal_ddt_cached_cuda_iov_fini function pointer is NULL\n"); } } diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index 37af008daa8..7b613470ab0 100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -28,7 +28,7 @@ struct opal_datatype_cuda_kernel_function_table { void* 
(*opal_ddt_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); void (*opal_ddt_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); void (*opal_ddt_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); - void (*opal_ddt_cuda_iov_dist_fini_p)(void *cuda_iov_dist); + void (*opal_ddt_cached_cuda_iov_fini_p)(void *cached_cuda_iov); int32_t (*opal_ddt_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); @@ -55,7 +55,7 @@ void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); void opal_cuda_d2dcpy(void* dst, const void* src, size_t count); void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count); -void* opal_cuda_iov_dist_init(void); -void opal_cuda_iov_dist_fini(void *cuda_iov_dist); +void* opal_cached_cuda_iov_init(void); +void opal_cached_cuda_iov_fini(void *cached_cuda_iov); #endif From b76bf60b2c7cd5f077ffd8e9d0ded3d22bf187fc Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 10 Nov 2015 00:33:29 -0500 Subject: [PATCH 147/190] checkpoint, cached cuda iov is working with multiple send, but not for count > 1 --- opal/datatype/cuda/opal_datatype_cuda.cu | 1 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 62 +++++++++++++++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 63 ++++++++++++++++--- test/datatype/ddt_benchmark.c | 2 +- 4 files changed, 112 insertions(+), 16 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 3ac7ba0ac5f..18494bcba70 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ 
b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -340,6 +340,7 @@ void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t c assert(datatype->cached_cuda_iov != NULL); ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; tmp->cuda_iov_count = cuda_iov_count; + tmp->cuda_iov_is_cached = 1; } uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index e6b9545226f..34c1883c2d1 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -937,7 +937,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *destination, *destination_base, *source_base, *source; - size_t total_packed, total_converted; + size_t total_packed, packed_w_cache ,packed_wo_cache; int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; uint32_t convertor_flags; @@ -957,7 +957,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count; size_t iov_len; - uint32_t iov_start_pos, iov_end_pos; + uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; @@ -1015,7 +1015,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); total_packed = 0; - total_converted = pConvertor->bConverted; + packed_wo_cache = 0; + packed_w_cache = 0; 
cuda_streams->current_stream_id = 0; // orig_stack_index = pStack->index; destination_base = destination; @@ -1047,6 +1048,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; + source_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { @@ -1056,7 +1058,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* if (iov_end_pos > ddt_iov_count) { iov_end_pos = ddt_iov_count; } - source_base = (unsigned char*)pConvertor->pBaseBuf; while (iov_start_pos < iov_end_pos && !buffer_isfull) { @@ -1093,7 +1094,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor->current_iov_pos = i; } buffer_size -= length_per_iovec; - total_packed += length_per_iovec; + packed_wo_cache += length_per_iovec; source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + source_base; /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ @@ -1175,11 +1176,60 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); } } + total_packed += packed_wo_cache; + pConvertor->bConverted += packed_wo_cache; + + + /* now we use cached cuda iov */ + if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; + cuda_iov_dist_d_current = 
cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { + packed_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + if (packed_w_cache <= buffer_size) { + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; + destination += cached_cuda_iov_nb_bytes_list_h[i]; + nb_blocks_used ++; + } else { + packed_w_cache -= cached_cuda_iov_nb_bytes_list_h[i]; + buffer_isfull = 1; + break; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + pConvertor->current_cuda_iov_pos += nb_blocks_used; + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + } for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } + total_packed += packed_w_cache; + pConvertor->bConverted += packed_w_cache; + 
DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); + #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -1195,8 +1245,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* iov[0].iov_len = total_packed; *max_data = total_packed; *out_size = 1; - pConvertor->bConverted += total_packed; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 49171e5b277..6689a48a3b4 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -632,7 +632,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *source, *source_base, *destination_base, *destination; - size_t total_unpacked, total_converted; + size_t total_unpacked, unpacked_wo_cache, unpacked_w_cache; int32_t complete_flag = 0; uint8_t buffer_isfull = 0; uint8_t free_required = 0; @@ -653,7 +653,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count; size_t iov_len; - uint32_t iov_start_pos, iov_end_pos; + uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; @@ -713,7 +713,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif buffer_size = iov[0].iov_len; total_unpacked = 0; - total_converted = pConvertor->bConverted; + unpacked_wo_cache = 0; + unpacked_w_cache = 0; 
cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; @@ -737,6 +738,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; + destination_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { @@ -746,7 +748,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (iov_end_pos > ddt_iov_count) { iov_end_pos = ddt_iov_count; } - destination_base = (unsigned char*)pConvertor->pBaseBuf; while (iov_start_pos < iov_end_pos && !buffer_isfull) { @@ -785,7 +786,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ pConvertor->current_iov_pos = i; } buffer_size -= length_per_iovec; - total_unpacked += length_per_iovec; + unpacked_wo_cache += length_per_iovec; destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; alignment = ALIGNMENT_DOUBLE; @@ -861,18 +862,64 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); } - } + total_unpacked += unpacked_wo_cache; + pConvertor->bConverted += unpacked_wo_cache; +#if 1 + /* now we use cached cuda iov */ + if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; + cuda_iov_dist_d_current = cached_cuda_iov_dist_d 
+ pConvertor->current_cuda_iov_pos; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { + unpacked_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + if (unpacked_w_cache <= buffer_size) { + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + source += cached_cuda_iov_nb_bytes_list_h[i]; + nb_blocks_used ++; + } else { + unpacked_w_cache -= cached_cuda_iov_nb_bytes_list_h[i]; + buffer_isfull = 1; + break; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + pConvertor->current_cuda_iov_pos += nb_blocks_used; + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + } +#endif for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } + + total_unpacked += unpacked_w_cache; + pConvertor->bConverted += unpacked_w_cache; + DT_CUDA_DEBUG ( 
opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); iov[0].iov_len = total_unpacked; *max_data = total_unpacked; *out_size = 1; - pConvertor->bConverted += total_unpacked; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end_total ); diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index f2112d598b2..6dd3b4cf879 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1211,7 +1211,7 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 1000; mat_size <= 1000; mat_size +=500) { + for (mat_size = 2000; mat_size <= 2000; mat_size +=500) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { From a0e949347b77c6061531fd1aacebf0e725168b4b Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 10 Nov 2015 19:26:22 -0500 Subject: [PATCH 148/190] checkpoint, fix a bug for partial unpack --- .../cuda/opal_datatype_cuda_internal.cuh | 2 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 17 +++---- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 16 +++++-- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 44 ++++++++++++++----- test/datatype/ddt_benchmark.c | 2 +- 5 files changed, 58 insertions(+), 23 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 1b47b89f1d0..b7e8e9405f6 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -140,7 +140,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); -__global__ 
void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 34c1883c2d1..55cb955808e 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -955,14 +955,14 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; - uint32_t ddt_iov_count; - size_t iov_len; + uint32_t ddt_iov_count = 0; + size_t iov_len = 0; uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; - uint32_t cached_cuda_iov_count; - uint8_t cuda_iov_is_cached; + uint32_t cached_cuda_iov_count = 0; + uint8_t cuda_iov_is_cached = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -1196,17 +1196,18 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* GET_TIME(start); #endif for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { - packed_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; - if (packed_w_cache <= buffer_size) { + if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { cuda_iov_contig_buf_h_current[nb_blocks_used] = 
(uintptr_t)destination; destination += cached_cuda_iov_nb_bytes_list_h[i]; - nb_blocks_used ++; + packed_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; + nb_blocks_used++; } else { - packed_w_cache -= cached_cuda_iov_nb_bytes_list_h[i]; buffer_isfull = 1; break; } } + printf("nb_blocks_used %d, my %d\n", nb_blocks_used, i - cuda_iov_start_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 9ea9414ba77..c553a7991b0 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,7 +46,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; size_t dst_offset; @@ -68,11 +68,21 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ for (i = 0; i < nb_tasks; i++) { src = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; - + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + // if (threadIdx.x == 0) printf("cuda_iov_partial_length_start %d", cuda_iov_partial_length_start); + dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes - cuda_iov_partial_length_start; + } if (threadIdx.x == 0) { _source_tmp = 
src; _destination_tmp = destination_base + dst_offset; - uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; + uint32_t _nb_bytes = 0; + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + _nb_bytes = cuda_iov_partial_length_start; + } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { + _nb_bytes = cuda_iov_partial_length_end; + } else { + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; + } if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_source_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 6689a48a3b4..66d72995e26 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -651,14 +651,16 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; - uint32_t ddt_iov_count; - size_t iov_len; + uint32_t ddt_iov_count = 0; + size_t iov_len = 0; uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; - uint32_t cached_cuda_iov_count; - uint8_t cuda_iov_is_cached; + uint32_t cached_cuda_iov_count = 0; + uint8_t cuda_iov_is_cached = 0; + size_t cuda_iov_partial_length_start = 0; + size_t cuda_iov_partial_length_end = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, 
end_total; @@ -841,7 +843,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, 0, 0); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -881,14 +883,32 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif + if (pConvertor->current_iov_partial_length > 0) { + cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; + buffer_size -= cuda_iov_partial_length_start; + pConvertor->current_iov_partial_length = 0; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + source += cuda_iov_partial_length_start; + cuda_iov_start_pos ++; + nb_blocks_used ++; + } for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { - unpacked_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; - if (unpacked_w_cache <= buffer_size) { + if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; source += cached_cuda_iov_nb_bytes_list_h[i]; + unpacked_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; } else { - unpacked_w_cache -= cached_cuda_iov_nb_bytes_list_h[i]; + if (buffer_size > 0) { + 
cuda_iov_partial_length_end = buffer_size; + unpacked_w_cache += cuda_iov_partial_length_end; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + source += cuda_iov_partial_length_end; + pConvertor->current_iov_partial_length = cached_cuda_iov_nb_bytes_list_h[i] - cuda_iov_partial_length_end; + nb_blocks_used ++; + } + buffer_size = 0; buffer_isfull = 1; break; } @@ -898,10 +918,14 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - pConvertor->current_cuda_iov_pos += nb_blocks_used; + if (pConvertor->current_iov_partial_length > 0) { + pConvertor->current_cuda_iov_pos += nb_blocks_used - 1; + } else { + pConvertor->current_cuda_iov_pos += nb_blocks_used; + } cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 6dd3b4cf879..c8c3fd7db45 100644 --- a/test/datatype/ddt_benchmark.c +++ 
b/test/datatype/ddt_benchmark.c @@ -1216,7 +1216,7 @@ int main( int argc, char* argv[] ) printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 1; i++) { - local_copy_with_convertor(pdt, 1, 4000000, mat_size); + local_copy_with_convertor(pdt, 1, 40000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From c1f595984efb495b90f44d06ec29de3225eb054c Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 11 Nov 2015 00:51:37 -0500 Subject: [PATCH 149/190] checkpoint, fix unpack size --- opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 66d72995e26..84d5bd5ea1d 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -885,6 +885,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; + unpacked_w_cache += cuda_iov_partial_length_start; buffer_size -= cuda_iov_partial_length_start; pConvertor->current_iov_partial_length = 0; cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; From 5b63994905a98a11e3ab10b610abfe11ab917d40 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 11 Nov 2015 17:49:25 -0500 Subject: [PATCH 150/190] checkpoint, during unpack, cache the entire iov before unpack --- opal/datatype/cuda/opal_datatype_cuda.cu | 14 ++ opal/datatype/cuda/opal_datatype_cuda.cuh | 2 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 3 +- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 172 ++++++------------ 4 files changed, 75 insertions(+), 116 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 18494bcba70..471c6e63709 100644 --- 
a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -351,6 +351,20 @@ uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) return tmp->cuda_iov_is_cached; } +void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count) +{ + int i; + size_t iov_size = 0; + for(i = 0; i < cuda_iov_count; i++) { + iov_size += cached_cuda_iov_nb_bytes_list_h[i]; + if (iov_size > ddt_offset) { + convertor->current_iov_partial_length = iov_size - ddt_offset; + convertor->current_cuda_iov_pos = i; + break; + } + } +} + void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) { #if 0 diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 6c071188c2c..8e30726ace2 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -129,6 +129,8 @@ uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor); void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); +void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 55cb955808e..8236692cad9 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -937,7 +937,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *destination, *destination_base, 
*source_base, *source; - size_t total_packed, packed_w_cache ,packed_wo_cache; + size_t total_packed = 0, packed_w_cache = 0, packed_wo_cache = 0; int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; uint32_t convertor_flags; @@ -1207,7 +1207,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* break; } } - printf("nb_blocks_used %d, my %d\n", nb_blocks_used, i - cuda_iov_start_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 84d5bd5ea1d..549b58a9986 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -632,7 +632,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *source, *source_base, *destination_base, *destination; - size_t total_unpacked, unpacked_wo_cache, unpacked_w_cache; + size_t total_unpacked = 0, unpacked_wo_cache = 0, unpacked_w_cache = 0; int32_t complete_flag = 0; uint8_t buffer_isfull = 0; uint8_t free_required = 0; @@ -744,132 +744,74 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); - iov_start_pos = pConvertor->current_iov_pos; - iov_end_pos = iov_start_pos + 
IOV_PIPELINE_SIZE; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - - while (iov_start_pos < iov_end_pos && !buffer_isfull) { - - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; - cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov pos %d\n", pConvertor->current_cuda_iov_pos);); - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); - #if defined (OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { - if (pConvertor->current_iov_partial_length > 0) { - iov_len = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - iov_len = ddt_iov[i].iov_len; - } - if (buffer_size >= iov_len) { - length_per_iovec = iov_len; - } else { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - length_per_iovec = buffer_size / orig_alignment * orig_alignment; - buffer_isfull = 1; - pConvertor->current_iov_partial_length = iov_len - length_per_iovec; - pConvertor->current_iov_pos = i; - } - buffer_size -= length_per_iovec; - unpacked_wo_cache += length_per_iovec; - destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; + for (i = 0; i < ddt_iov_count; i++) { + length_per_iovec = ddt_iov[i].iov_len; + 
destination = (size_t)(ddt_iov[i].iov_base) + destination_base; - alignment = ALIGNMENT_DOUBLE; + alignment = ALIGNMENT_DOUBLE; - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; - } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; - } + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; + } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = 
cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - } + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + } - /* handle residue */ - if (residue_desc != 0) { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + /* handle residue */ + if (residue_desc != 0) { + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - source += 
cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - } - } - - if (!buffer_isfull) { - pConvertor->current_iov_pos = i; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; } - + } + + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, nb_blocks_used %d\n", source_base, total_time, nb_blocks_used); ); #endif - - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, 
cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, 0, 0); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; - pConvertor->current_cuda_iov_pos += nb_blocks_used; - - iov_start_pos = iov_end_pos; - iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; - if (iov_end_pos >= ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - /* finished */ - if (pConvertor->current_iov_pos == ddt_iov_count) { - pConvertor->current_count ++; - opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); - } - DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); - - } } - total_unpacked += unpacked_wo_cache; - pConvertor->bConverted += unpacked_wo_cache; + #if 1 /* now we use cached cuda iov */ if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { + opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; @@ -878,11 +820,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, 
cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif + printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; unpacked_w_cache += cuda_iov_partial_length_start; @@ -919,11 +860,14 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif + /* if (pConvertor->current_iov_partial_length > 0) { - pConvertor->current_cuda_iov_pos += nb_blocks_used - 1; - } else { - pConvertor->current_cuda_iov_pos += nb_blocks_used; - } + pConvertor->current_cuda_iov_pos += nb_blocks_used - 1; + } else { + pConvertor->current_cuda_iov_pos += nb_blocks_used; + } */ + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); From fb68b99647884c683257dec07b1c2bbf13f3d89e Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 11 Nov 2015 21:07:45 -0500 Subject: [PATCH 151/190] another checkpoint --- .../cuda/opal_datatype_pack_cuda_wrapper.cu 
| 186 ++++++------------ .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 28 +-- test/datatype/ddt_benchmark.c | 6 +- test/datatype/ddt_lib.h | 4 +- 4 files changed, 86 insertions(+), 138 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 8236692cad9..016b8294b8f 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -948,19 +948,21 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; - uintptr_t *cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; + uintptr_t *cuda_iov_contig_buf_h_current = NULL; + uintptr_t *cuda_iov_contig_buf_d_current = NULL; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; size_t iov_len = 0; uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; - ddt_cuda_iov_total_cached_t* cached_cuda_iov; - ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; - uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; @@ -1036,8 +1038,6 @@ int32_t 
opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; - cuda_iov_is_cached = cached_cuda_iov->cuda_iov_is_cached; DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1052,133 +1052,75 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { - - iov_start_pos = pConvertor->current_iov_pos; - iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - - while (iov_start_pos < iov_end_pos && !buffer_isfull) { - - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; - cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = 
cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { - if (pConvertor->current_iov_partial_length > 0) { - iov_len = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - iov_len = ddt_iov[i].iov_len; - } - if (buffer_size >= iov_len) { - length_per_iovec = iov_len; + for (i = 0; i < ddt_iov_count; i++) { + length_per_iovec = ddt_iov[i].iov_len; + source = (size_t)(ddt_iov[i].iov_base) + source_base; + + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; } else { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - length_per_iovec = buffer_size / orig_alignment * orig_alignment; - buffer_isfull = 1; - pConvertor->current_iov_partial_length = iov_len - length_per_iovec; - pConvertor->current_iov_pos = i; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; } - buffer_size -= length_per_iovec; - packed_wo_cache += length_per_iovec; - source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - 
iov_len) + source_base; - - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; - if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; - } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; - } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - - /* handle residue */ - if (residue_desc != 0) { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * 
alignment - source_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - } - - if (!buffer_isfull) { - pConvertor->current_iov_pos = i; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, 
src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } - + } + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, nb_blocks %d\n", destination_base, total_time, nb_blocks_used); ); #endif - - // opal_ddt_check_cuda_iov_is_full(pConvertor, pConvertor->current_cuda_iov_pos + nb_blocks_used); /* make sure cuda iov has enough space */ - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 
cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; - pConvertor->current_cuda_iov_pos += nb_blocks_used; - - // orig_stack_index = pStack->index; - iov_start_pos = iov_end_pos; - iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; - if (iov_end_pos >= ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - /* count = 0 done, iov cached finished */ - if (pConvertor->current_iov_pos == ddt_iov_count) { - pConvertor->current_count ++; - opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); - } - DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); - } } - total_packed += packed_wo_cache; - pConvertor->bConverted += packed_wo_cache; - + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; /* now we use cached cuda iov */ if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 549b58a9986..f7427dd861e 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -644,19 +644,21 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; - 
uintptr_t *cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; + uintptr_t *cuda_iov_contig_buf_h_current = NULL; + uintptr_t *cuda_iov_contig_buf_d_current = NULL; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; size_t iov_len = 0; uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; - ddt_cuda_iov_total_cached_t* cached_cuda_iov; - ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; - uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; size_t cuda_iov_partial_length_start = 0; @@ -728,8 +730,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; - cuda_iov_is_cached = cached_cuda_iov->cuda_iov_is_cached; DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -796,11 +796,11 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ } } - cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + 
cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -808,6 +808,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif } + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + #if 1 /* now we use cached cuda iov */ if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index c8c3fd7db45..bab37e059c4 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1306,13 +1306,13 @@ int main( int argc, char* argv[] ) OBJ_RELEASE( pdt ); assert( pdt == NULL ); } - for (blk_len = 2000; blk_len <= 2000; blk_len += 500) { + for (blk_len = 20; blk_len <= 20; blk_len += 500) { printf( ">>--------------------------------------------<<\n" ); printf( "Vector data-type (60000 times %d double stride 512)\n", blk_len ); pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 0; i < 4; i++) { - // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); + for (i = 0; i < 1; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } diff --git a/test/datatype/ddt_lib.h b/test/datatype/ddt_lib.h 
index 0f6bbc2cb37..ef462ce0f31 100644 --- a/test/datatype/ddt_lib.h +++ b/test/datatype/ddt_lib.h @@ -34,9 +34,9 @@ #define DUMP_DATA_AFTER_COMMIT 0x00000001 #define CHECK_PACK_UNPACK 0x00000002 -#define TEST_DOUBLE +//#define TEST_DOUBLE //#define TEST_FLOAT -//#define TEST_CHAR +#define TEST_CHAR extern uint32_t outputFlags; From 64e2a62ea83bd25e39e47ea79411887a935aad94 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 11 Nov 2015 22:58:09 -0500 Subject: [PATCH 152/190] checkpoint , remove unnecessary cuda stream sync --- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 11 ----------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 14 +------------- 2 files changed, 1 insertion(+), 24 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 016b8294b8f..3509ac2de6b 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -1106,8 +1106,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* } } cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1116,9 +1114,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, nb_blocks %d\n", destination_base, total_time, nb_blocks_used); ); #endif } - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } cached_cuda_iov_count = 
cached_cuda_iov->cuda_iov_count; @@ -1132,8 +1127,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -1158,10 +1151,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; } for (i = 0; i < NB_STREAMS; i++) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index f7427dd861e..062b75f7224 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -797,8 +797,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ } cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); 
opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -808,13 +806,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif } - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; - -#if 1 + /* now we use cached cuda iov */ if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); @@ -872,17 +865,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ } else { pConvertor->current_cuda_iov_pos += nb_blocks_used; } */ - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; } -#endif for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); From 39de9e0312304c4ac812935116ee44b30e58aab9 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 12 Nov 2015 22:56:40 -0500 Subject: [PATCH 153/190] use bit to replace % --- 
.../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 35 +++++++------- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 19 +++----- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 48 +++++++++---------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 25 +++------- test/datatype/ddt_benchmark.c | 17 ++++++- 6 files changed, 70 insertions(+), 78 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index b7e8e9405f6..b1c36b66e14 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -138,9 +138,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int output_id, const char *format, ...); diff --git 
a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index e85b83e55b5..97c6c69aeff 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,16 +88,17 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base) { - uint32_t i, j; + uint32_t i, j, _nb_bytes; size_t src_offset; unsigned char *dst; unsigned char *_source_tmp, *_destination_tmp; + uint32_t current_cuda_iov_pos = cuda_iov_pos; __shared__ uint32_t nb_tasks; - __shared__ uint32_t copy_count; - __shared__ uint8_t alignment; + uint32_t copy_count; + uint8_t alignment; if (threadIdx.x == 0) { nb_tasks = nb_blocks_used / gridDim.x; @@ -109,24 +110,20 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; dst = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; - if (threadIdx.x == 0) { - _source_tmp = source_base + src_offset; - _destination_tmp = dst; - uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes 
% ALIGNMENT_DOUBLE == 0) { - alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { - alignment = ALIGNMENT_FLOAT; - } else { - alignment = ALIGNMENT_CHAR; - } - copy_count = _nb_bytes / alignment; + _source_tmp = source_base + src_offset; + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + if ((uintptr_t)(_source_tmp) & 0x7 == 0 && (uintptr_t)dst & 0x7 == 0 && _nb_bytes & 0x7 == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) & 0x3 == 0 && (uintptr_t)dst & 0x3 == 0 && _nb_bytes & 0x3 == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; } - __syncthreads(); + copy_count = _nb_bytes / alignment; for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 3509ac2de6b..1d14c000977 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -937,7 +937,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *destination, *destination_base, *source_base, *source; - size_t total_packed = 0, packed_w_cache = 0, packed_wo_cache = 0; + size_t total_packed; int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; uint32_t convertor_flags; @@ -1017,8 +1017,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); total_packed = 0; - packed_wo_cache = 0; - packed_w_cache = 0; cuda_streams->current_stream_id = 0; // orig_stack_index = 
pStack->index; destination_base = destination; @@ -1093,7 +1091,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* handle residue */ if (residue_desc != 0) { /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) @@ -1111,7 +1108,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, nb_blocks %d\n", destination_base, total_time, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack cached cuda iov is prepared in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used); ); #endif } @@ -1125,7 +1122,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -1134,7 +1130,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; destination += cached_cuda_iov_nb_bytes_list_h[i]; - packed_w_cache += 
cached_cuda_iov_nb_bytes_list_h[i]; + total_packed += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used++; } else { @@ -1147,18 +1143,17 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - pConvertor->current_cuda_iov_pos += nb_blocks_used; cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + pConvertor->current_cuda_iov_pos += nb_blocks_used; } for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } - - total_packed += packed_w_cache; - pConvertor->bConverted += packed_w_cache; + + pConvertor->bConverted += total_packed; DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index c553a7991b0..7eb179a0a42 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,16 +46,18 @@ __global__ void 
opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; size_t dst_offset; unsigned char *src; unsigned char *_source_tmp, *_destination_tmp; + uint32_t _nb_bytes; + uint32_t current_cuda_iov_pos = cuda_iov_pos; __shared__ uint32_t nb_tasks; - __shared__ uint32_t copy_count; - __shared__ uint8_t alignment; + uint32_t copy_count; + uint8_t alignment; if (threadIdx.x == 0) { nb_tasks = nb_blocks_used / gridDim.x; @@ -67,32 +69,28 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ for (i = 0; i < nb_tasks; i++) { src = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { // if (threadIdx.x == 0) printf("cuda_iov_partial_length_start %d", cuda_iov_partial_length_start); - dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes - cuda_iov_partial_length_start; + dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes - cuda_iov_partial_length_start; } - if (threadIdx.x == 0) { - _source_tmp = src; - _destination_tmp = destination_base + dst_offset; - uint32_t _nb_bytes = 0; - if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { 
- _nb_bytes = cuda_iov_partial_length_start; - } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { - _nb_bytes = cuda_iov_partial_length_end; - } else { - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; - } - if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { - alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_source_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { - alignment = ALIGNMENT_FLOAT; - } else { - alignment = ALIGNMENT_CHAR; - } - copy_count = _nb_bytes / alignment; + _destination_tmp = destination_base + dst_offset; + + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + _nb_bytes = cuda_iov_partial_length_start; + } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { + _nb_bytes = cuda_iov_partial_length_end; + } else { + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; + } + if ((uintptr_t)(_destination_tmp) & 0x7 == 0 && (uintptr_t)src & 0x7 == 0 && _nb_bytes & 0x7 == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_destination_tmp) & 0x3 == 0 && (uintptr_t)src & 0x3 == 0 && _nb_bytes & 0x3 == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; } - __syncthreads(); + copy_count = _nb_bytes / alignment; for (j = threadIdx.x; j < copy_count; j += blockDim.x) { /* if (threadIdx.x == 0) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 062b75f7224..50009710d2d 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -632,7 +632,7 @@ int32_t 
opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *source, *source_base, *destination_base, *destination; - size_t total_unpacked = 0, unpacked_wo_cache = 0, unpacked_w_cache = 0; + size_t total_unpacked; int32_t complete_flag = 0; uint8_t buffer_isfull = 0; uint8_t free_required = 0; @@ -717,8 +717,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif buffer_size = iov[0].iov_len; total_unpacked = 0; - unpacked_wo_cache = 0; - unpacked_w_cache = 0; cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; @@ -802,7 +800,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, nb_blocks_used %d\n", source_base, total_time, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cached cuda iov is prepared in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); #endif } @@ -825,7 +823,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; - unpacked_w_cache += cuda_iov_partial_length_start; + total_unpacked += cuda_iov_partial_length_start; buffer_size -= cuda_iov_partial_length_start; pConvertor->current_iov_partial_length = 0; cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; @@ -837,13 +835,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (buffer_size >= 
cached_cuda_iov_nb_bytes_list_h[i]) { cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; source += cached_cuda_iov_nb_bytes_list_h[i]; - unpacked_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; } else { if (buffer_size > 0) { cuda_iov_partial_length_end = buffer_size; - unpacked_w_cache += cuda_iov_partial_length_end; + total_unpacked += cuda_iov_partial_length_end; cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; source += cuda_iov_partial_length_end; pConvertor->current_iov_partial_length = cached_cuda_iov_nb_bytes_list_h[i] - cuda_iov_partial_length_end; @@ -859,25 +857,16 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - /* - if (pConvertor->current_iov_partial_length > 0) { - pConvertor->current_cuda_iov_pos += nb_blocks_used - 1; - } else { - pConvertor->current_cuda_iov_pos += nb_blocks_used; - } */ cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, 
pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); } for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } - total_unpacked += unpacked_w_cache; - pConvertor->bConverted += unpacked_w_cache; + pConvertor->bConverted += total_unpacked; DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); iov[0].iov_len = total_unpacked; diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index bab37e059c4..d961ef34e4e 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -793,6 +793,8 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk int32_t length = 0, done1 = 0, done2 = 0; TIMER_DATA_TYPE start, end, unpack_start, unpack_end; long total_time, unpack_time = 0; + int j, t_error = 0; + unsigned char *mat_char; dt_length = compute_buffer_length(pdt, count); printf("length %lu\n", dt_length); @@ -890,7 +892,18 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk if( done1 == 0 ) { done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } +#if defined (TEST_CHAR) + /* mat_char = (unsigned char *)ptemp; + for (j = 0; j < max_data; j++) { + if (mat_char[j] != 'a') { + t_error ++; + printf("error %d, %c\n", j, mat_char[j]); + } + } + printf("total error %d\n", t_error);*/ +#endif if( done2 == 0 ) { GET_TIME( unpack_start ); @@ -1306,13 +1319,13 @@ int main( int argc, char* argv[] ) OBJ_RELEASE( pdt ); assert( pdt == NULL ); } - for (blk_len = 20; blk_len <= 20; blk_len += 500) { + for (blk_len = 51; blk_len <= 51; blk_len += 500) { printf( ">>--------------------------------------------<<\n" ); printf( "Vector data-type (60000 times %d double stride 512)\n", blk_len ); pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { 
for (i = 0; i < 1; i++) { - // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } From f17c5f833e88c194f12000eec7b8749c9706a0b0 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Thu, 12 Nov 2015 23:27:59 -0500 Subject: [PATCH 154/190] rollback to use %, not bit, since it is faster, not sure why --- opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu | 6 +++--- opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 97c6c69aeff..93fb188ddcd 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -116,9 +116,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di _source_tmp = source_base + src_offset; /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - if ((uintptr_t)(_source_tmp) & 0x7 == 0 && (uintptr_t)dst & 0x7 == 0 && _nb_bytes & 0x7 == 0) { + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)dst % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) & 0x3 == 0 && (uintptr_t)dst & 0x3 == 0 && _nb_bytes & 0x3 == 0) { + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)dst % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -141,4 +141,4 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di } } } -} \ No newline at end of file +} diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 
7eb179a0a42..f98a8c0b2ea 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -83,9 +83,9 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ } else { _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; } - if ((uintptr_t)(_destination_tmp) & 0x7 == 0 && (uintptr_t)src & 0x7 == 0 && _nb_bytes & 0x7 == 0) { + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)src % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_destination_tmp) & 0x3 == 0 && (uintptr_t)src & 0x3 == 0 && _nb_bytes & 0x3 == 0) { + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)src % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; From 4ea326ea8de9770a12ff97eec626494d7f8d0403 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 13 Nov 2015 16:48:09 -0500 Subject: [PATCH 155/190] now cuda iov is {nc_disp, c_disp} --- .../cuda/opal_datatype_cuda_internal.cuh | 8 ++-- .../cuda/opal_datatype_pack_cuda_kernel.cu | 22 ++++++----- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 35 +++++++++-------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 35 +++++++++-------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 39 +++++++++---------- test/datatype/ddt_benchmark.c | 4 +- 6 files changed, 73 insertions(+), 70 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index b1c36b66e14..ea4afa0b989 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -59,8 +59,8 @@ typedef struct { } ddt_cuda_iov_dist_non_cached_t; typedef struct { - size_t ptr_offset; - uint32_t nb_bytes; + size_t ncontig_disp; + size_t contig_disp; } 
ddt_cuda_iov_dist_cached_t; typedef struct { @@ -138,9 +138,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 93fb188ddcd..ddfd68b0e4c 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,13 +88,14 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, 
unsigned char* source_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { - uint32_t i, j, _nb_bytes; - size_t src_offset; - unsigned char *dst; + uint32_t i, j; + size_t _nb_bytes; + size_t src_offset, dst_offset; unsigned char *_source_tmp, *_destination_tmp; uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -110,15 +111,16 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; - dst = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - destination_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; _source_tmp = source_base + src_offset; + _destination_tmp = destination_base + dst_offset; /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)dst % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)dst % 
ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -128,7 +130,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; - _destination_tmp = dst + j * alignment; + _destination_tmp = destination_base + dst_offset + j * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 1d14c000977..f1ce6dbda7d 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -965,6 +965,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; + size_t destionation_disp = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -1073,17 +1074,18 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + j * 
thread_per_block * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -1091,18 +1093,21 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* handle residue */ if (residue_desc != 0) { /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + 
cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } } - cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1128,8 +1133,6 @@ int32_t 
opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #endif for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; - destination += cached_cuda_iov_nb_bytes_list_h[i]; total_packed += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used++; @@ -1143,9 +1146,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); +// cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base, destination_base); pConvertor->current_cuda_iov_pos += nb_blocks_used; } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index f98a8c0b2ea..9cf705ae7e3 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ 
b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,15 +46,17 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; - size_t dst_offset; + size_t dst_offset, src_offset; unsigned char *src; unsigned char *_source_tmp, *_destination_tmp; - uint32_t _nb_bytes; - uint32_t current_cuda_iov_pos = cuda_iov_pos; - + size_t _nb_bytes; + uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + __shared__ uint32_t nb_tasks; uint32_t copy_count; uint8_t alignment; @@ -68,24 +70,23 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; - if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - // if (threadIdx.x == 0) printf("cuda_iov_partial_length_start %d", cuda_iov_partial_length_start); - dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes - cuda_iov_partial_length_start; - 
} - _destination_tmp = destination_base + dst_offset; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - source_disp - source_partial_disp; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + src_offset = cuda_iov_dist[current_cuda_iov_pos].contig_disp - source_disp; + dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; _nb_bytes = cuda_iov_partial_length_start; } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { _nb_bytes = cuda_iov_partial_length_end; - } else { - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; } - if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)src % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + + _destination_tmp = destination_base + dst_offset; + _source_tmp = source_base + src_offset; + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)src % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -97,7 +98,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); }*/ if (j < copy_count) { - _source_tmp = src + j 
* alignment; + _source_tmp = source_base + src_offset + j * alignment; _destination_tmp = destination_base + dst_offset + j * alignment; /* if (threadIdx.x == 0) { printf("_src %p, dst %p, alignment %d, blk %d, j %d, count %d\n", _source_tmp, _destination_tmp, alignment, blockIdx.x, j, copy_count); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 50009710d2d..dc356d96471 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -663,6 +663,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint8_t cuda_iov_is_cached = 0; size_t cuda_iov_partial_length_start = 0; size_t cuda_iov_partial_length_end = 0; + size_t source_disp = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -765,17 +766,18 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + j * thread_per_block * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; + 
cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; } @@ -783,18 +785,20 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (residue_desc != 0) { /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* 
OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; } } - - cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice); + /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; + cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice); opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -826,15 +830,11 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_unpacked += cuda_iov_partial_length_start; buffer_size -= cuda_iov_partial_length_start; pConvertor->current_iov_partial_length = 0; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - source += cuda_iov_partial_length_start; cuda_iov_start_pos ++; nb_blocks_used ++; } for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - source += 
cached_cuda_iov_nb_bytes_list_h[i]; total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; @@ -842,9 +842,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (buffer_size > 0) { cuda_iov_partial_length_end = buffer_size; total_unpacked += cuda_iov_partial_length_end; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - source += cuda_iov_partial_length_end; - pConvertor->current_iov_partial_length = cached_cuda_iov_nb_bytes_list_h[i] - cuda_iov_partial_length_end; nb_blocks_used ++; } buffer_size = 0; @@ -859,7 +856,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); } for (i = 0; i < NB_STREAMS; i++) { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index d961ef34e4e..e879e5c0192 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -895,14 +895,14 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk } #if defined (TEST_CHAR) - /* mat_char = (unsigned char *)ptemp; + mat_char = (unsigned char *)ptemp; for (j = 0; j < max_data; j++) { 
if (mat_char[j] != 'a') { t_error ++; printf("error %d, %c\n", j, mat_char[j]); } } - printf("total error %d\n", t_error);*/ + printf("total error %d\n", t_error); #endif if( done2 == 0 ) { From 491dd73f54bc36dc60e2162fc908ddb7d8012700 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 13 Nov 2015 18:33:48 -0500 Subject: [PATCH 156/190] clean up kernel, put variables uses multiple times into register --- .../datatype/cuda/opal_datatype_pack_cuda_kernel.cu | 8 +++++--- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 13 +++++++------ .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 1 - 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index ddfd68b0e4c..92a96d1cb2b 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -91,11 +91,12 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, j; - size_t _nb_bytes; + uint32_t _nb_bytes; size_t src_offset, dst_offset; unsigned char *_source_tmp, *_destination_tmp; uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -111,9 +112,10 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di __syncthreads(); for (i = 0; i < nb_tasks; i++) { + contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; 
- dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - destination_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; + dst_offset = contig_disp - destination_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; _source_tmp = source_base + src_offset; _destination_tmp = destination_base + dst_offset; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 9cf705ae7e3..f2c337ea682 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -50,12 +50,12 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ { uint32_t i, j; size_t dst_offset, src_offset; - unsigned char *src; unsigned char *_source_tmp, *_destination_tmp; - size_t _nb_bytes; + uint32_t _nb_bytes; uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; - size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + size_t contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -70,12 +70,13 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - source_disp - source_partial_disp; + contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used 
multiple times, so put in in register */ + src_offset = contig_disp - source_disp - source_partial_disp; dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - src_offset = cuda_iov_dist[current_cuda_iov_pos].contig_disp - source_disp; + src_offset = contig_disp - source_disp; dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; _nb_bytes = cuda_iov_partial_length_start; } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index dc356d96471..d400e05efcf 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -854,7 +854,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, 
pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); } From 6cc7ada1731f1e63fb0646833a55b213d3bf4603 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 13 Nov 2015 22:18:41 -0500 Subject: [PATCH 157/190] another checkpoint --- opal/datatype/cuda/opal_datatype_cuda.cu | 9 ++++ .../cuda/opal_datatype_pack_cuda_wrapper.cu | 26 +++++----- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 6 ++- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 47 ++++++++++--------- 4 files changed, 53 insertions(+), 35 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 471c6e63709..3213a3b43fd 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -355,12 +355,21 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d { int i; size_t iov_size = 0; + convertor->current_iov_partial_length = 0; + convertor->current_cuda_iov_pos = 0; + if (ddt_offset == 0) { + return; + } for(i = 0; i < cuda_iov_count; i++) { iov_size += cached_cuda_iov_nb_bytes_list_h[i]; if (iov_size > ddt_offset) { convertor->current_iov_partial_length = iov_size - ddt_offset; convertor->current_cuda_iov_pos = i; break; + } else if (iov_size == ddt_offset){ + convertor->current_iov_partial_length = 0; + convertor->current_cuda_iov_pos = i+1; + break; } } } diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index f1ce6dbda7d..36ce4e3951d 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -1117,17 +1117,14 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #endif } - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; - /* now we use cached cuda iov */ - if( pConvertor->bConverted != 
pConvertor->local_size && !buffer_isfull) { - cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; - cuda_iov_end_pos = cached_cuda_iov_count; - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + + if( pConvertor->current_count < pConvertor->count && !buffer_isfull) { #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -1141,16 +1138,21 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* break; } } + if (!buffer_isfull) { + pConvertor->current_count ++; + cuda_iov_start_pos = 0; + cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; + } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif -// cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + } DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, 
source_base, destination_base); pConvertor->current_cuda_iov_pos += nb_blocks_used; - } + pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index f2c337ea682..b58cff27cf3 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -54,7 +54,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ uint32_t _nb_bytes; uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; - size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + size_t source_partial_disp = 0; size_t contig_disp; __shared__ uint32_t nb_tasks; @@ -69,6 +69,10 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ } __syncthreads(); + if (cuda_iov_partial_length_start != 0) { + source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + } + for (i = 0; i < nb_tasks; i++) { contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ src_offset = contig_disp - source_disp - source_partial_disp; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index d400e05efcf..70d9d10465e 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -807,32 +807,30 @@ int32_t 
opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cached cuda iov is prepared in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); #endif } - - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; /* now we use cached cuda iov */ - if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { - opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); - cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; - cuda_iov_end_pos = cached_cuda_iov_count; - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + + printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); + if (pConvertor->current_iov_partial_length > 0) { + cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; + total_unpacked += cuda_iov_partial_length_start; + buffer_size -= cuda_iov_partial_length_start; + pConvertor->current_iov_partial_length = 0; + cuda_iov_start_pos ++; + nb_blocks_used ++; + } + + while( 
pConvertor->current_count < pConvertor->count && !buffer_isfull) { #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); - if (pConvertor->current_iov_partial_length > 0) { - cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; - total_unpacked += cuda_iov_partial_length_start; - buffer_size -= cuda_iov_partial_length_start; - pConvertor->current_iov_partial_length = 0; - cuda_iov_start_pos ++; - nb_blocks_used ++; - } for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; @@ -849,14 +847,19 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ break; } } + if (!buffer_isfull) { + pConvertor->current_count ++; + cuda_iov_start_pos = 0; + cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; + } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif + } DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - } for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); From 998a072c11ec843b972eeb89476efc8c3a3e45fd Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Sat, 14 Nov 2015 01:40:55 -0500 
Subject: [PATCH 158/190] now convertor->count > 1 is woring --- opal/datatype/cuda/opal_datatype_cuda.cu | 3 +++ .../cuda/opal_datatype_cuda_internal.cuh | 4 +-- .../cuda/opal_datatype_pack_cuda_kernel.cu | 27 ++++++++++++++----- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 27 +++++++++++-------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 27 ++++++++++++++----- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 22 ++++++++------- 6 files changed, 74 insertions(+), 36 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 3213a3b43fd..ec33b5c0e4d 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -355,11 +355,14 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d { int i; size_t iov_size = 0; + size_t ddt_size; convertor->current_iov_partial_length = 0; convertor->current_cuda_iov_pos = 0; if (ddt_offset == 0) { return; } + opal_datatype_type_size(convertor->pDesc, &ddt_size); + ddt_offset = ddt_offset % ddt_size; for(i = 0; i < cuda_iov_count; i++) { iov_size += cached_cuda_iov_nb_bytes_list_h[i]; if (iov_size > ddt_offset) { diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index ea4afa0b989..82a28420580 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -138,9 +138,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void 
opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 92a96d1cb2b..2564fe1393c 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,7 +88,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, j; uint32_t _nb_bytes; @@ -97,6 +97,9 @@ __global__ void 
opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -107,15 +110,20 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di if (blockIdx.x < (nb_blocks_used % gridDim.x)) { nb_tasks ++; } - // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d\n", cuda_iov_count, ddt_extent, current_count); + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); } __syncthreads(); for (i = 0; i < nb_tasks; i++) { - contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; - dst_offset = contig_disp - destination_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + dst_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - destination_disp; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; _source_tmp = source_base + 
src_offset; _destination_tmp = destination_base + dst_offset; @@ -128,7 +136,12 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di alignment = ALIGNMENT_CHAR; } copy_count = _nb_bytes / alignment; - + /* + if (threadIdx.x == 0 && nb_tasks != 0) { + printf("pack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 36ce4e3951d..fc9181e902b 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -966,6 +966,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; size_t destionation_disp = 0; + opal_datatype_count_t convertor_current_count; + OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -1118,16 +1120,18 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* } /* now we use cached cuda iov */ + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; - cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; + cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + convertor_current_count = pConvertor->current_count; - if( pConvertor->current_count < pConvertor->count && !buffer_isfull) { #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); 
#endif + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { total_packed += cached_cuda_iov_nb_bytes_list_h[i]; @@ -1143,16 +1147,17 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_iov_start_pos = 0; cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; } + } #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - } - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base, destination_base); - pConvertor->current_cuda_iov_pos += nb_blocks_used; - pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, 
cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, source_base, destination_base); + pConvertor->current_cuda_iov_pos += nb_blocks_used; + pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index b58cff27cf3..f6ee8e0bfc4 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,7 +46,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; size_t dst_offset, src_offset; @@ -56,6 +56,9 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; size_t source_partial_disp = 0; size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -66,6 +69,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (blockIdx.x < 
nb_blocks_used % gridDim.x) { nb_tasks ++; } + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d, ddt_size %d\n", cuda_iov_count, ddt_extent, current_count, ddt_size); } __syncthreads(); @@ -74,13 +78,17 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ } for (i = 0; i < nb_tasks; i++) { - contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ - src_offset = contig_disp - source_disp - source_partial_disp; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp - source_partial_disp; + dst_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - src_offset = contig_disp - source_disp; + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp; dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; _nb_bytes = cuda_iov_partial_length_start; } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { @@ -97,7 +105,12 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ alignment = ALIGNMENT_CHAR; } copy_count = _nb_bytes / alignment; - + /* + if (threadIdx.x == 0 && 
nb_tasks != 0) { + printf("unpack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ for (j = threadIdx.x; j < copy_count; j += blockDim.x) { /* if (threadIdx.x == 0) { if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 70d9d10465e..49355e8e017 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -664,6 +664,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ size_t cuda_iov_partial_length_start = 0; size_t cuda_iov_partial_length_end = 0; size_t source_disp = 0; + opal_datatype_count_t convertor_current_count; + OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -816,6 +818,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + convertor_current_count = pConvertor->current_count; printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); if (pConvertor->current_iov_partial_length > 0) { @@ -827,10 +830,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ nb_blocks_used ++; } - while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if 
(buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; @@ -850,16 +853,17 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (!buffer_isfull) { pConvertor->current_count ++; cuda_iov_start_pos = 0; - cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; + cuda_iov_end_pos = cached_cuda_iov_count; } + } #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - } - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); for (i = 0; i < NB_STREAMS; i++) { 
cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); From 83d4858b8236bf3031a4bc9fde0c3e8d0423b701 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 16:02:11 -0500 Subject: [PATCH 159/190] move the cuda iov caching into a seperate function --- opal/datatype/cuda/opal_datatype_cuda.cu | 79 +++++++++++++ opal/datatype/cuda/opal_datatype_cuda.cuh | 2 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 110 +++--------------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 107 ++--------------- 4 files changed, 105 insertions(+), 193 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index ec33b5c0e4d..5747eb2b3a5 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -325,6 +325,85 @@ void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } +/* cached_cuda_iov_d is not ready until explicitlt sync with cuda stream 0 +*/ +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t thread_per_block, nb_blocks_used; + size_t length_per_iovec; + uint8_t alignment; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; + ddt_cuda_iov_dist_cached_t *cuda_iov_h = NULL; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t ncontig_disp_base; + size_t contig_disp = 0; + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; + cuda_iov_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_stream_iov = 
cuda_iov_pipeline_block->cuda_stream; + + for (i = 0; i < ddt_iov_count; i++) { + length_per_iovec = ddt_iov[i].iov_len; + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; + } else { + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / 
alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + nb_blocks_used ++; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + } + } + /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ + cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cudaMemcpyAsync(cached_cuda_iov_d, cuda_iov_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + *cuda_iov_count = nb_blocks_used; + return OPAL_SUCCESS; +} + void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 8e30726ace2..4a71ab37d63 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -131,6 +131,8 @@ void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file 
diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index fc9181e902b..ddc2ec08a89 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -932,40 +932,21 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t* out_size, size_t* max_data ) { - uint32_t i, j; - uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t length, buffer_size, length_per_iovec; - unsigned char *destination, *destination_base, *source_base, *source; + size_t buffer_size; + unsigned char *destination, *destination_base, *source_base; size_t total_packed; - int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; - uint32_t convertor_flags; -// dt_elem_desc_t* description; -// dt_elem_desc_t* pElem; -// dt_stack_t* pStack; - uint8_t alignment, orig_alignment; -// int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; - uintptr_t *cuda_iov_contig_buf_h_current = NULL; - uintptr_t *cuda_iov_contig_buf_d_current = NULL; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; - const struct iovec *ddt_iov = NULL; - uint32_t ddt_iov_count = 0; - size_t iov_len = 0; - uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; + uint32_t cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; - uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; - uint8_t 
cuda_iov_is_cached = 0; - size_t destionation_disp = 0; opal_datatype_count_t convertor_current_count; OPAL_PTRDIFF_TYPE ddt_extent; @@ -973,14 +954,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* TIMER_DATA_TYPE start, end, start_total, end_total; long total_time, move_time; #endif - - /*description = pConvertor->use_desc->desc; - pStack = pConvertor->pStack + pConvertor->stack_pos; - pElem = &(description[pStack->index]); - printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); - */ - -// assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { @@ -1021,7 +994,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* total_packed = 0; cuda_streams->current_stream_id = 0; - // orig_stack_index = pStack->index; destination_base = destination; #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1032,14 +1004,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* GET_TIME(start); #endif - opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); - assert(ddt_iov != NULL); opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1053,69 +1022,20 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { - 
nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); - #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - - for (i = 0; i < ddt_iov_count; i++) { - length_per_iovec = ddt_iov[i].iov_len; - source = (size_t)(ddt_iov[i].iov_base) + source_base; - - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + j * thread_per_block * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; - if ( (j+1) * thread_per_block <= count_desc) { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; - } else { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, 
cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - - /* handle residue */ - if (residue_desc != 0) { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } + if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); + } else { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Pack cache cuda iov is failed\n");); + return OPAL_ERROR; } - /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; - cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); - 
DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack cached cuda iov is prepared in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack cuda iov is cached in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used); ); #endif } @@ -1124,7 +1044,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; convertor_current_count = pConvertor->current_count; @@ -1154,14 +1074,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, source_base, destination_base); 
pConvertor->current_cuda_iov_pos += nb_blocks_used; pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); pConvertor->bConverted += total_packed; DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 49355e8e017..fe8475a201a 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -627,43 +627,24 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint32_t* out_size, size_t* max_data ) { - uint32_t i, j; - uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t length, buffer_size, length_per_iovec; + size_t buffer_size; unsigned char *source, *source_base, *destination_base, *destination; size_t total_unpacked; - int32_t complete_flag = 0; uint8_t buffer_isfull = 0; uint8_t free_required = 0; - uint32_t convertor_flags; -// dt_elem_desc_t* description; -// dt_elem_desc_t* pElem; -// dt_stack_t* pStack; - uint8_t alignment, orig_alignment; -// int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; - uintptr_t *cuda_iov_contig_buf_h_current = NULL; - uintptr_t *cuda_iov_contig_buf_d_current = NULL; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; - const struct iovec *ddt_iov = NULL; - uint32_t ddt_iov_count = 0; - size_t iov_len = 0; - uint32_t 
iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; + uint32_t cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; - uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; - uint8_t cuda_iov_is_cached = 0; size_t cuda_iov_partial_length_start = 0; size_t cuda_iov_partial_length_end = 0; - size_t source_disp = 0; opal_datatype_count_t convertor_current_count; OPAL_PTRDIFF_TYPE ddt_extent; @@ -676,12 +657,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ GET_TIME(start_total); #endif -/* description = pConvertor->use_desc->desc; - pStack = pConvertor->pStack + pConvertor->stack_pos; - pElem = &(description[pStack->index]); - printf("size elem %d, size %lu\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); -*/ - #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -710,9 +685,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ move_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); #endif - -// cuda_err = cudaEventRecord(current_cuda_device->memcpy_event, current_cuda_device->cuda_streams->opal_cuda_stream[0]); -// opal_cuda_check_error(cuda_err); #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -721,17 +693,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ buffer_size = iov[0].iov_len; total_unpacked = 0; cuda_streams->current_stream_id = 0; - convertor_flags = pConvertor->flags; -// orig_stack_index = pStack->index; source_base = source; - opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); - assert(ddt_iov != NULL); opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); cached_cuda_iov_dist_d = 
cached_cuda_iov->cuda_iov_dist_d; assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -745,68 +712,17 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); - - #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - - for (i = 0; i < ddt_iov_count; i++) { - length_per_iovec = ddt_iov[i].iov_len; - destination = (size_t)(ddt_iov[i].iov_base) + destination_base; - - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + j * thread_per_block * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; - if ( (j+1) * thread_per_block <= count_desc) { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; - } else { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = 
(thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - } - - /* handle residue */ - if (residue_desc != 0) { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - } + if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); } - /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ - 
cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; - cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice); - opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cached cuda iov is prepared in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cuda iov is cached in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); #endif } @@ -816,11 +732,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; convertor_current_count = pConvertor->current_count; - printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; total_unpacked += cuda_iov_partial_length_start; @@ -862,12 +777,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); - 
DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); pConvertor->bConverted += total_unpacked; DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); From f49dae4a6b9c9fbfd342d185abfee08868bb7983 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 16:30:27 -0500 Subject: [PATCH 160/190] these two variables are useless now --- opal/datatype/cuda/opal_datatype_cuda.cu | 4 ---- opal/datatype/cuda/opal_datatype_cuda_internal.cuh | 2 -- 2 files changed, 6 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 5747eb2b3a5..a71099c41a3 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -223,8 +223,6 @@ int32_t opal_ddt_cuda_kernel_init(void) cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * 
CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_h)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_d)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); @@ -264,8 +262,6 @@ int32_t opal_ddt_cuda_kernel_fini(void) cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_contig_buf_h); - cudaFree(cuda_iov_pipeline_block->cuda_iov_contig_buf_d); cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 82a28420580..5e7bb41d0dc 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -74,8 +74,6 @@ typedef struct { ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_h; ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_d; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; - uintptr_t *cuda_iov_contig_buf_h; - uintptr_t *cuda_iov_contig_buf_d; cudaStream_t *cuda_stream; int32_t cuda_stream_id; cudaEvent_t cuda_event; From ef04c97fe51b59f8524fb3925bfb3095d2c91427 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 16:49:55 -0500 Subject: [PATCH 161/190] fix a bug for ib, current count of convertor should be set in set_cuda_iov_position --- opal/datatype/cuda/opal_datatype_cuda.cu | 2 ++ 1 file changed, 2 insertions(+) diff 
--git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index a71099c41a3..3129c320068 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -433,10 +433,12 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d size_t ddt_size; convertor->current_iov_partial_length = 0; convertor->current_cuda_iov_pos = 0; + convertor->current_count = 0; if (ddt_offset == 0) { return; } opal_datatype_type_size(convertor->pDesc, &ddt_size); + convertor->current_count = ddt_offset / ddt_size; ddt_offset = ddt_offset % ddt_size; for(i = 0; i < cuda_iov_count; i++) { iov_size += cached_cuda_iov_nb_bytes_list_h[i]; From 189fa156edebb5add1b8f3d99e05590c4fb8a3f4 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 21:00:09 -0500 Subject: [PATCH 162/190] cleanup, move cudamalloc into cache cuda iov --- opal/datatype/cuda/opal_datatype_cuda.cu | 61 +++++++++++++------ opal/datatype/cuda/opal_datatype_cuda.cuh | 2 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 14 ++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 13 ++-- 4 files changed, 56 insertions(+), 34 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 3129c320068..d0927dc4162 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -281,15 +281,13 @@ void* opal_ddt_cached_cuda_iov_init(uint32_t size) { #if OPAL_DATATYPE_CUDA_IOV_CACHE ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)malloc(sizeof(ddt_cuda_iov_total_cached_t)); - ddt_cuda_iov_dist_cached_t *tmp_cuda_iov_d = NULL; - cudaMalloc((void **)(&tmp_cuda_iov_d), sizeof(ddt_cuda_iov_dist_cached_t) * size); uint32_t *tmp_nb_bytes = (uint32_t *)malloc(sizeof(uint32_t) * size); - if (tmp != NULL && tmp_cuda_iov_d != NULL && tmp_nb_bytes != NULL) { - tmp->cuda_iov_dist_d = tmp_cuda_iov_d; + if (tmp != NULL && tmp_nb_bytes != 
NULL) { + tmp->cuda_iov_dist_d = NULL; tmp->cuda_iov_count = size; tmp->cuda_iov_is_cached = 0; tmp->nb_bytes_h = tmp_nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, cuda_iov_d %p, nb_bytes_h %p, size %d.\n", tmp, tmp_cuda_iov_d, tmp_nb_bytes, size); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, nb_bytes_h %p, size %d.\n", tmp, tmp_nb_bytes, size); ); return tmp; } else { DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist_cached for ddt is failed.\n"); ); @@ -323,7 +321,7 @@ void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) /* cached_cuda_iov_d is not ready until explicitlt sync with cuda stream 0 */ -int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count) +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, residue_desc; @@ -331,12 +329,17 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ size_t length_per_iovec; uint8_t alignment; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - ddt_cuda_iov_dist_cached_t *cuda_iov_h = NULL; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t *cached_cuda_iov_dist_d = NULL; + ddt_cuda_iov_dist_cached_t *cuda_iov_dist_h = NULL; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; size_t ncontig_disp_base; size_t contig_disp = 0; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + + opal_datatype_t *datatype = (opal_datatype_t *)pConvertor->pDesc; opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); if (ddt_iov == NULL) { @@ -344,10 +347,18 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, 
ddt_cuda_iov_dist_ return OPAL_ERROR; } + + cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)opal_ddt_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); + if (cached_cuda_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not init cuda iov\n");); + return OPAL_ERROR; + } + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; - cuda_iov_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_dist_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + thread_per_block = CUDA_WARP_SIZE * 5; for (i = 0; i < ddt_iov_count; i++) { length_per_iovec = ddt_iov[i].iov_len; @@ -361,8 +372,8 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; - cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; if ( (j+1) * thread_per_block <= count_desc) { cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; } else { @@ -372,21 +383,21 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, 
nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } /* handle residue */ if (residue_desc != 0) { - cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; - cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); @@ -394,8 +405,15 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, 
ddt_cuda_iov_dist_ } } /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ - cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; - cudaMemcpyAsync(cached_cuda_iov_d, cuda_iov_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; + cudaMalloc((void **)(&cached_cuda_iov_dist_d), sizeof(ddt_cuda_iov_dist_cached_t) * (nb_blocks_used+1)); + if (cached_cuda_iov_dist_d == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not malloc cuda iov in GPU\n");); + return OPAL_ERROR; + } + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + cached_cuda_iov->cuda_iov_dist_d = cached_cuda_iov_dist_d; + datatype->cached_cuda_iov = cached_cuda_iov; *cuda_iov_count = nb_blocks_used; return OPAL_SUCCESS; } @@ -404,9 +422,10 @@ void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_i { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; if (datatype->cached_cuda_iov == NULL) { - datatype->cached_cuda_iov = opal_ddt_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); - } - *cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + *cached_cuda_iov = NULL; + } else { + *cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + } } void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) @@ -421,7 +440,9 @@ void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t c uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; - assert(datatype->cached_cuda_iov != NULL); + if (datatype->cached_cuda_iov == NULL) { + return 0; + } ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; return 
tmp->cuda_iov_is_cached; } diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 4a71ab37d63..8ad9b3ec658 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -131,7 +131,7 @@ void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); -int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count); +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count); } diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index ddc2ec08a89..c98d540e54e 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -1003,12 +1003,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - - opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); - cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; - assert(cached_cuda_iov_dist_d != NULL); - cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; - assert(cached_cuda_iov_nb_bytes_list_h != NULL); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1025,7 +1019,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + if (opal_ddt_cache_cuda_iov(pConvertor, &nb_blocks_used) == OPAL_SUCCESS) { 
opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); } else { @@ -1040,6 +1034,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* } /* now we use cached cuda iov */ + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index fe8475a201a..6808ab56fed 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -694,11 +694,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_unpacked = 0; cuda_streams->current_stream_id = 0; source_base = source; - opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); - cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; - assert(cached_cuda_iov_dist_d != NULL); - cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; - assert(cached_cuda_iov_nb_bytes_list_h != NULL); #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -715,7 +710,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + if (opal_ddt_cache_cuda_iov(pConvertor, &nb_blocks_used) == OPAL_SUCCESS) { opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( 
opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); } @@ -727,6 +722,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ } /* now we use cached cuda iov */ + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; From 56eeffb5788e8e4b9b4587b2f87abfa5effe5136 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 17 Nov 2015 02:53:38 -0500 Subject: [PATCH 163/190] rearrange varibles --- opal/datatype/cuda/opal_datatype_cuda.cu | 2 +- opal/datatype/opal_datatype.h | 7 ++++--- opal/datatype/opal_datatype_create.c | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index d0927dc4162..f79e4e5ed0d 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -413,7 +413,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov } cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); cached_cuda_iov->cuda_iov_dist_d = cached_cuda_iov_dist_d; - datatype->cached_cuda_iov = cached_cuda_iov; + datatype->cached_cuda_iov = (unsigned char*)cached_cuda_iov; *cuda_iov_count = nb_blocks_used; return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index 01f876fd795..a3a6898dd89 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ 
-131,13 +131,14 @@ struct opal_datatype_t { int iov_count; size_t max_data; /* size: 416, cachelines: 7, members: 18 */ -#if OPAL_CUDA_SUPPORT - void * cached_cuda_iov; -#endif /* OPAL_CUDA_SUPPORT */ /* last cacheline: 32 bytes */ struct iovec* cached_iovec; uint32_t cached_iovec_count; + +#if OPAL_CUDA_SUPPORT + unsigned char * cached_cuda_iov; +#endif /* OPAL_CUDA_SUPPORT */ }; typedef struct opal_datatype_t opal_datatype_t; diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index 44c0e3020b6..e57a7d6c668 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -102,7 +102,7 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) #if OPAL_CUDA_SUPPORT /* free cuda iov */ if (opal_datatype_cuda_kernel_support == 1 && datatype->cached_cuda_iov != NULL) { - opal_cached_cuda_iov_fini(datatype->cached_cuda_iov); + opal_cached_cuda_iov_fini((void*)datatype->cached_cuda_iov); datatype->cached_cuda_iov = NULL; } #endif /* OPAL_CUDA_SUPPORT */ From 84f7abbdfd0082b8918a57035dd37e431ff0315f Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 17 Nov 2015 18:13:00 -0500 Subject: [PATCH 164/190] if cuda_iov is not big enough, use realloc. 
However, cudaMallocHost does not work with realloc, so use malloc instead --- opal/datatype/cuda/opal_datatype_cuda.cu | 35 +++++++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index f79e4e5ed0d..cd74a081693 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -222,7 +222,10 @@ int32_t opal_ddt_cuda_kernel_init(void) cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + if (j == 0) { + // cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + } cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); @@ -261,7 +264,8 @@ int32_t opal_ddt_cuda_kernel_fini(void) if (cuda_iov_pipeline_block != NULL) { cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + 
free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; @@ -319,6 +323,22 @@ void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } +static inline int32_t opal_ddt_cached_cuda_iov_isfull(ddt_cuda_iov_total_cached_t *cached_cuda_iov, ddt_cuda_iov_dist_cached_t **cuda_iov_dist_h, uint32_t nb_blocks_used) +{ + if (nb_blocks_used < cached_cuda_iov->cuda_iov_count) { + return 0; + } else { +realloc_cuda_iov: + cached_cuda_iov->nb_bytes_h = (uint32_t *)realloc(cached_cuda_iov->nb_bytes_h, sizeof(uint32_t)*cached_cuda_iov->cuda_iov_count*2); + assert(cached_cuda_iov->nb_bytes_h != NULL); + cached_cuda_iov->cuda_iov_count *= 2; + if (nb_blocks_used >= cached_cuda_iov->cuda_iov_count) { + goto realloc_cuda_iov; + } + return 1; + } +} + /* cached_cuda_iov_d is not ready until explicitlt sync with cuda stream 0 */ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count) @@ -371,6 +391,13 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + if (opal_ddt_cached_cuda_iov_isfull(cached_cuda_iov, &(cuda_iov_pipeline_block->cuda_iov_dist_cached_h), nb_blocks_used + nb_blocks_per_description + 1)) { + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + cuda_iov_dist_h = (ddt_cuda_iov_dist_cached_t *)realloc(cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*cached_cuda_iov->cuda_iov_count); + assert(cuda_iov_dist_h != NULL); + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = cuda_iov_dist_h; + } 
+ for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; @@ -385,7 +412,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + // assert (nb_blocks_used < NUM_CUDA_IOV_PER_DDT); } /* handle residue */ @@ -400,7 +427,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + //assert (nb_blocks_used < NUM_CUDA_IOV_PER_DDT); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ } } From 65424d03cc65066a23e38b08e6abdb8762b9724d Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 18 Nov 2015 15:26:31 -0500 Subject: [PATCH 165/190] make sure check pointer is not NULL before free it --- opal/datatype/cuda/opal_datatype_cuda.cu | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index cd74a081693..2df143f2c61 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -225,6 +225,8 @@ 
int32_t opal_ddt_cuda_kernel_init(void) if (j == 0) { // cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + } else { + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; } cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; @@ -262,10 +264,19 @@ int32_t opal_ddt_cuda_kernel_fini(void) cudaStreamDestroy(cuda_devices[i].cuda_streams->opal_cuda_stream[j]); cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block[j]; if (cuda_iov_pipeline_block != NULL) { - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); - cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); - //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); - free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h != NULL) { + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); + cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h = NULL; + } + if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d != NULL) { + cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); + cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d = NULL; + } + if (cuda_iov_pipeline_block->cuda_iov_dist_cached_h != NULL) { + //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; + } cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; From 5d316d95b7b9f3ce1ed0c2c10008457291111579 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 24 Nov 2015 20:18:17 -0500 Subject: [PATCH 166/190] checkpoint, 
rewrite non-cached version --- opal/datatype/cuda/opal_datatype_cuda.cu | 6 +- .../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 251 +++++++++++++++++- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 220 ++++++++++++++- 4 files changed, 463 insertions(+), 18 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 2df143f2c61..5ba8e1361c0 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -220,10 +220,9 @@ int32_t opal_ddt_cuda_kernel_init(void) for (j = 0; j < NB_STREAMS; j++) { cudaStreamCreate(&(cuda_streams->opal_cuda_stream[j])); cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); if (j == 0) { - // cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); } else { cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; @@ -273,7 +272,6 @@ int32_t opal_ddt_cuda_kernel_fini(void) cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d = NULL; } if 
(cuda_iov_pipeline_block->cuda_iov_dist_cached_h != NULL) { - //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; } diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 5e7bb41d0dc..99dc76f1e05 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -71,8 +71,8 @@ typedef struct { } ddt_cuda_iov_total_cached_t; typedef struct { - ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_h; - ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_d; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_h; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_d; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; cudaStream_t *cuda_stream; int32_t cuda_stream_id; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index c98d540e54e..5bdfa88fbdb 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -664,9 +664,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_pack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); + return opal_ddt_generic_simple_pack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); } +#if 0 + int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -927,6 +929,243 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto return 0; } +#endif + + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data 
) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + size_t buffer_size, length_per_iovec; + unsigned char *destination, *destination_base, *source_base; + size_t total_packed; + uint8_t buffer_isfull = 0, transfer_required, free_required; + uint8_t alignment; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t current_cuda_iov_length = 0; + size_t ncontig_disp_base; + size_t contig_disp = 0; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos; + OPAL_PTRDIFF_TYPE ddt_extent; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + + + // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (iov[0].iov_len == 0) { + buffer_size = DT_CUDA_BUFFER_SIZE; + } else { + buffer_size = iov[0].iov_len; + } + + if (iov[0].iov_base == NULL) { + iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char *)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; + free_required = 1; + } else { + destination = (unsigned char *)iov[0].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + 
pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + } + transfer_required = 1; + free_required = 1; + destination = pConvertor->gpu_buffer_ptr; + } + } + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV non cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + total_packed = 0; + cuda_streams->current_stream_id = 0; + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + source_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + destination_base = destination; + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + nb_blocks_used = 0; + ddt_iov_start_pos = pConvertor->current_iov_pos; + ddt_iov_end_pos = ddt_iov_start_pos + IOV_PIPELINE_SIZE; + if (ddt_iov_end_pos > ddt_iov_count) { + ddt_iov_end_pos = ddt_iov_count; + } + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base) 
+ ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; + length_per_iovec = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + length_per_iovec = ddt_iov[i].iov_len; + } + if (buffer_size < length_per_iovec) { + pConvertor->current_iov_pos = i; + pConvertor->current_iov_partial_length = length_per_iovec - buffer_size; + length_per_iovec = buffer_size; + buffer_isfull = 1; + } + buffer_size -= length_per_iovec; + total_packed += length_per_iovec; + + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + current_cuda_iov_length = thread_per_block * alignment; + } else { + current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + 
cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; + current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + destination_base += contig_disp; + contig_disp = 0; + if (!buffer_isfull) { + pConvertor->current_iov_pos = i; + if (i == ddt_iov_count) { + pConvertor->current_count ++; + pConvertor->current_iov_pos = 0; + source_base += 
ddt_extent; + } + } + + } + + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); +#endif + + pConvertor->bConverted += total_packed; + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; +} + int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -1000,16 +1239,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* GET_TIME(start_total); #endif -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); -#endif - thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; source_base = (unsigned 
char*)pConvertor->pBaseBuf; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 6808ab56fed..6d0b906c0b0 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -370,14 +370,16 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* return 0; } + int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_unpack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); + return opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); } +#if 0 int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -622,6 +624,222 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver return 0; } +#endif + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, dst_offset, residue_desc; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + size_t buffer_size, length_per_iovec; + unsigned char *source, *source_base, *destination_base; + size_t total_unpacked; + uint8_t buffer_isfull = 0; + uint8_t free_required = 0; + uint8_t alignment; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t 
current_cuda_iov_length = 0; + size_t ncontig_disp_base; + size_t contig_disp = 0; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos; + OPAL_PTRDIFF_TYPE ddt_extent; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + source = (unsigned char*)iov[0].iov_base; + free_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + } + source = pConvertor->gpu_buffer_ptr; + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV non cached, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, iov[0].iov_len); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); +#endif + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + buffer_size = iov[0].iov_len; + total_unpacked = 0; + cuda_streams->current_stream_id = 0; + + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + source_base = source; + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + + 
for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + nb_blocks_used = 0; + ddt_iov_start_pos = pConvertor->current_iov_pos; + ddt_iov_end_pos = ddt_iov_start_pos + IOV_PIPELINE_SIZE; + if (ddt_iov_end_pos > ddt_iov_count) { + ddt_iov_end_pos = ddt_iov_count; + } + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base) + ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; + length_per_iovec = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + length_per_iovec = ddt_iov[i].iov_len; + } + if (buffer_size < length_per_iovec) { + pConvertor->current_iov_pos = i; + pConvertor->current_iov_partial_length = length_per_iovec - buffer_size; + length_per_iovec = buffer_size; + buffer_isfull = 1; + } + buffer_size -= length_per_iovec; + total_unpacked += length_per_iovec; + + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, 
alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + current_cuda_iov_length = thread_per_block * alignment; + } else { + current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; + current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + 
DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, destination_base, source_base, 0, 0); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + source_base += contig_disp; + contig_disp = 0; + if (!buffer_isfull) { + pConvertor->current_iov_pos = i; + if (i == ddt_iov_count) { + pConvertor->current_count ++; + pConvertor->current_iov_pos = 0; + destination_base += ddt_extent; + } + } + } + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + iov[0].iov_len = total_unpacked; + *max_data = total_unpacked; + *out_size = 1; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; +} + int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* 
pConvertor, struct iovec* iov, uint32_t* out_size, From 02c8b7f4f2c692789c76e0a005ad063eccc0fcb8 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 24 Nov 2015 21:14:23 -0500 Subject: [PATCH 167/190] fix for non cached iov --- opal/datatype/cuda/opal_datatype_cuda.cu | 28 +++++++++++++++++++ opal/datatype/cuda/opal_datatype_cuda.cuh | 4 ++- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 2 ++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 5ba8e1361c0..7f00ef7dd51 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -511,6 +511,34 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d } } +void opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count) +{ + int i; + size_t iov_size = 0; + size_t ddt_size; + convertor->current_iov_partial_length = 0; + convertor->current_iov_pos = 0; + convertor->current_count = 0; + if (ddt_offset == 0) { + return; + } + opal_datatype_type_size(convertor->pDesc, &ddt_size); + convertor->current_count = ddt_offset / ddt_size; + ddt_offset = ddt_offset % ddt_size; + for(i = 0; i < ddt_iov_count; i++) { + iov_size += ddt_iov[i].iov_len; + if (iov_size > ddt_offset) { + convertor->current_iov_partial_length = iov_size - ddt_offset; + convertor->current_iov_pos = i; + break; + } else if (iov_size == ddt_offset){ + convertor->current_iov_partial_length = 0; + convertor->current_iov_pos = i+1; + break; + } + } +} + void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) { #if 0 diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 8ad9b3ec658..64f69c2974c 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -131,8 +131,10 @@ void 
opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); +void opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count); + int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count); } -#endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file +#endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 6d0b906c0b0..b17dee516d2 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -708,6 +708,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver source_base = source; opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + opal_ddt_set_ddt_iov_position(pConvertor, pConvertor->bConverted, ddt_iov, ddt_iov_count); for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); @@ -818,6 +819,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } + pConvertor->bConverted += total_unpacked; iov[0].iov_len = total_unpacked; *max_data = total_unpacked; *out_size = 1; From bb807fcd61bc6d0bda6530f82bbaa1a71b334391 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Tue, 24 Nov 2015 18:30:18 -0800 Subject: [PATCH 168/190] fix the non cached iov, set position should be put at first --- opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index b17dee516d2..c84f09ca738 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -707,8 +707,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver nb_blocks = 256; source_base = source; opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); - destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; opal_ddt_set_ddt_iov_position(pConvertor, pConvertor->bConverted, ddt_iov, ddt_iov_count); + destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); From 842cc3f03d6a2cd43a86f229c5d7af33c4a47aba Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 25 Nov 2015 15:02:14 -0500 Subject: [PATCH 169/190] move ddt iov to cuda iov into a function --- opal/datatype/cuda/opal_datatype_cuda.cu | 76 +++++++++++++++++++ opal/datatype/cuda/opal_datatype_cuda.cuh | 2 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 73 ++---------------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 72 ++---------------- 4 files changed, 91 insertions(+), 132 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 7f00ef7dd51..02559ed283f 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -454,6 +454,82 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov return OPAL_SUCCESS; } +uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, size_t *buffer_size, uint32_t *nb_blocks_used, size_t *total_converted, size_t 
*contig_disp_out, uint32_t *current_ddt_iov_pos) +{ + size_t ncontig_disp_base; + size_t contig_disp = 0; + size_t current_cuda_iov_length = 0; + uint8_t buffer_isfull = 0; + uint8_t alignment; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t thread_per_block; + size_t length_per_iovec; + uint32_t i, j; + + thread_per_block = CUDA_WARP_SIZE * 5; + + for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base) + ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; + length_per_iovec = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + length_per_iovec = ddt_iov[i].iov_len; + } + if (*buffer_size < length_per_iovec) { + pConvertor->current_iov_pos = i; + pConvertor->current_iov_partial_length = length_per_iovec - *buffer_size; + length_per_iovec = *buffer_size; + buffer_isfull = 1; + } + *buffer_size -= length_per_iovec; + *total_converted += length_per_iovec; + + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "DDT IOV to CUDA IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + current_cuda_iov_length = thread_per_block * alignment; + } else { + current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; + } +#if defined 
(OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "DDT IOV to CUDA IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[*nb_blocks_used].contig_disp, current_cuda_iov_length); ); + (*nb_blocks_used) ++; + assert (*nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "DDT IOV to CUDA IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[*nb_blocks_used].contig_disp, current_cuda_iov_length); ); + (*nb_blocks_used) ++; + assert (*nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + *contig_disp_out = contig_disp; + *current_ddt_iov_pos = i; + return buffer_isfull; + +} + void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 64f69c2974c..8e2a008ce22 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -135,6 
+135,8 @@ void opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t dd int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count); +uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, size_t *buffer_size, uint32_t *nb_blocks_used, size_t *total_packed, size_t *contig_disp_out, uint32_t *current_ddt_iov_pos); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 5bdfa88fbdb..ad0e2d771d5 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -937,14 +937,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto uint32_t* out_size, size_t* max_data ) { - uint32_t i, j; - uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size, length_per_iovec; + size_t buffer_size; unsigned char *destination, *destination_base, *source_base; size_t total_packed; uint8_t buffer_isfull = 0, transfer_required, free_required; - uint8_t alignment; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; @@ -954,10 +952,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; - size_t current_cuda_iov_length = 0; - size_t ncontig_disp_base; size_t contig_disp = 0; - uint32_t ddt_iov_start_pos, ddt_iov_end_pos; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos, current_ddt_iov_pos; OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1044,62 
+1040,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto GET_TIME(start); #endif - for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { - if (pConvertor->current_iov_partial_length > 0) { - ncontig_disp_base = (size_t)(ddt_iov[i].iov_base) + ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; - length_per_iovec = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); - length_per_iovec = ddt_iov[i].iov_len; - } - if (buffer_size < length_per_iovec) { - pConvertor->current_iov_pos = i; - pConvertor->current_iov_partial_length = length_per_iovec - buffer_size; - length_per_iovec = buffer_size; - buffer_isfull = 1; - } - buffer_size -= length_per_iovec; - total_packed += length_per_iovec; - - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; - if ( (j+1) * thread_per_block <= count_desc) { - current_cuda_iov_length = thread_per_block * alignment; - } else { - current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(current_cuda_iov_length > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - contig_disp += current_cuda_iov_length; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, 
cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - - /* handle residue */ - if (residue_desc != 0) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; - current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(current_cuda_iov_length > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - contig_disp += current_cuda_iov_length; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - } - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; + buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, &total_packed, &contig_disp, ¤t_ddt_iov_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1114,10 +1055,10 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto iov_pipeline_block_id ++; iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; destination_base += contig_disp; - contig_disp = 0; + if (!buffer_isfull) { - pConvertor->current_iov_pos = i; - if (i == ddt_iov_count) { + pConvertor->current_iov_pos = current_ddt_iov_pos; + if (current_ddt_iov_pos == ddt_iov_count) { pConvertor->current_count ++; pConvertor->current_iov_pos = 0; source_base += ddt_extent; diff --git 
a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index c84f09ca738..648036b1bb1 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -631,15 +631,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver uint32_t* out_size, size_t* max_data ) { - uint32_t i, j; - uint32_t count_desc, nb_blocks_per_description, dst_offset, residue_desc; + uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size, length_per_iovec; + size_t buffer_size; unsigned char *source, *source_base, *destination_base; size_t total_unpacked; uint8_t buffer_isfull = 0; uint8_t free_required = 0; - uint8_t alignment; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; @@ -649,10 +647,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; - size_t current_cuda_iov_length = 0; - size_t ncontig_disp_base; size_t contig_disp = 0; - uint32_t ddt_iov_start_pos, ddt_iov_end_pos; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos, current_ddt_iov_pos; OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -734,62 +730,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver GET_TIME(start); #endif - for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { - if (pConvertor->current_iov_partial_length > 0) { - ncontig_disp_base = (size_t)(ddt_iov[i].iov_base) + ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; - length_per_iovec = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); - length_per_iovec = 
ddt_iov[i].iov_len; - } - if (buffer_size < length_per_iovec) { - pConvertor->current_iov_pos = i; - pConvertor->current_iov_partial_length = length_per_iovec - buffer_size; - length_per_iovec = buffer_size; - buffer_isfull = 1; - } - buffer_size -= length_per_iovec; - total_unpacked += length_per_iovec; - - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; - if ( (j+1) * thread_per_block <= count_desc) { - current_cuda_iov_length = thread_per_block * alignment; - } else { - current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(current_cuda_iov_length > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - contig_disp += current_cuda_iov_length; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - - /* handle residue */ - if (residue_desc != 0) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; - current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - 
assert(current_cuda_iov_length > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - contig_disp += current_cuda_iov_length; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, current_cuda_iov_length); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = contig_disp; - } + buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, &total_unpacked, &contig_disp, ¤t_ddt_iov_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -804,10 +745,9 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver iov_pipeline_block_id ++; iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; source_base += contig_disp; - contig_disp = 0; if (!buffer_isfull) { - pConvertor->current_iov_pos = i; - if (i == ddt_iov_count) { + pConvertor->current_iov_pos = current_ddt_iov_pos; + if (current_ddt_iov_pos == ddt_iov_count) { pConvertor->current_count ++; pConvertor->current_iov_pos = 0; destination_base += ddt_extent; From 6df01a5ee677f0c11d7d3875af8951b2aa6fb058 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 30 Nov 2015 17:43:02 -0500 Subject: [PATCH 170/190] merge iov cached and non-cached --- opal/datatype/cuda/opal_datatype_cuda.cu | 2 + opal/datatype/cuda/opal_datatype_cuda.cuh | 24 +- .../cuda/opal_datatype_cuda_internal.cuh | 1 + .../cuda/opal_datatype_pack_cuda_wrapper.cu | 281 +++++++----------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 238 ++++++--------- 5 files changed, 209 insertions(+), 337 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 02559ed283f..b488ac4ab6c 100644 --- 
a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -15,6 +15,7 @@ ddt_cuda_device_t *cuda_devices; ddt_cuda_device_t *current_cuda_device; struct iovec cuda_iov[CUDA_NB_IOV]; uint32_t cuda_iov_count; +uint32_t cuda_iov_cache_enabled; //uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; @@ -239,6 +240,7 @@ int32_t opal_ddt_cuda_kernel_init(void) current_cuda_device = &(cuda_devices[0]); /* init cuda_iov */ + cuda_iov_cache_enabled = 1; cuda_iov_count = CUDA_NB_IOV; // /* init size for double, float, char */ diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 8e2a008ce22..c33ff606bd9 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -29,25 +29,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon uint32_t* out_size, size_t* max_data ); -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked); -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( 
opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked); void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 99dc76f1e05..72edcb3d8a3 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -109,6 +109,7 @@ extern ddt_cuda_device_t *cuda_devices; extern ddt_cuda_device_t *current_cuda_device; extern struct iovec cuda_iov[CUDA_NB_IOV]; extern uint32_t cuda_iov_count; +extern uint32_t cuda_iov_cache_enabled; //extern uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index ad0e2d771d5..0137601bf70 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -664,7 +664,98 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_pack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); + size_t buffer_size; + unsigned char *destination; + size_t total_packed; + uint8_t transfer_required, free_required; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + + // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (iov[0].iov_len == 0) { + buffer_size = DT_CUDA_BUFFER_SIZE; + } else { + buffer_size = iov[0].iov_len; + } + + if (iov[0].iov_base == NULL) { + 
iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char *)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; + free_required = 1; + } else { + destination = (unsigned char *)iov[0].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + } + transfer_required = 1; + free_required = 1; + destination = pConvertor->gpu_buffer_ptr; + } + } + + total_packed = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + + /* start pack */ + if (cuda_iov_cache_enabled) { + opal_ddt_generic_simple_pack_function_cuda_iov_cached(pConvertor, destination, buffer_size, &total_packed); + } else { + opal_ddt_generic_simple_pack_function_cuda_iov_non_cached(pConvertor, destination, buffer_size, &total_packed); + } + + pConvertor->bConverted += total_packed; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); +#endif + + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( 
opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; } #if 0 @@ -932,17 +1023,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto #endif -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed) { uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size; - unsigned char *destination, *destination_base, *source_base; - size_t total_packed; - uint8_t buffer_isfull = 0, transfer_required, free_required; + unsigned char *destination_base, *source_base; + uint8_t buffer_isfull = 0; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; @@ -957,51 +1043,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time, move_time; + TIMER_DATA_TYPE start, end; + long total_time; #endif - - // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); - if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { - if (iov[0].iov_len == 0) { - buffer_size = DT_CUDA_BUFFER_SIZE; - } else { - buffer_size = iov[0].iov_len; - } - - if (iov[0].iov_base == NULL) { - iov[0].iov_base = (unsigned char 
*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); - destination = (unsigned char *)iov[0].iov_base; - pConvertor->gpu_buffer_ptr = destination; - free_required = 1; - } else { - destination = (unsigned char *)iov[0].iov_base; - free_required = 0; - } - transfer_required = 0; - } else { - buffer_size = iov[0].iov_len; - if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { - pConvertor->gpu_buffer_ptr = NULL; - transfer_required = 0; - free_required = 0; - cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); - } else { - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); - } - transfer_required = 1; - free_required = 1; - destination = pConvertor->gpu_buffer_ptr; - } - } - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV non cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); -#endif opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); if (ddt_iov == NULL) { @@ -1009,7 +1055,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto return OPAL_ERROR; } - total_packed = 0; cuda_streams->current_stream_id = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; @@ -1040,7 +1085,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto GET_TIME(start); #endif - buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, &total_packed, &contig_disp, ¤t_ddt_iov_pos); + buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, total_packed, &contig_disp, ¤t_ddt_iov_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1067,57 +1112,19 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( 
opal_converto } - for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); -#endif - if (transfer_required) { - cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); - } -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - move_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); -#endif - - pConvertor->bConverted += total_packed; - iov[0].iov_len = total_packed; - *max_data = total_packed; - *out_size = 1; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end_total ); - total_time = ELAPSED_TIME( start_total, end_total ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); -#endif - - if( pConvertor->bConverted == pConvertor->local_size ) { - pConvertor->flags |= CONVERTOR_COMPLETED; - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); - pConvertor->gpu_buffer_ptr = NULL; - } - return 1; - } - return 0; + + return OPAL_SUCCESS; } -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed) { uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size; - unsigned char *destination, *destination_base, *source_base; - size_t total_packed; - uint8_t buffer_isfull = 0, transfer_required, free_required; + unsigned char *destination_base, *source_base; + uint8_t buffer_isfull = 0; 
cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; @@ -1131,55 +1138,14 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time, move_time; + TIMER_DATA_TYPE start, end; + long total_time; #endif - - // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); - if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { - if (iov[0].iov_len == 0) { - buffer_size = DT_CUDA_BUFFER_SIZE; - } else { - buffer_size = iov[0].iov_len; - } - - if (iov[0].iov_base == NULL) { - iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); - destination = (unsigned char *)iov[0].iov_base; - pConvertor->gpu_buffer_ptr = destination; - free_required = 1; - } else { - destination = (unsigned char *)iov[0].iov_base; - free_required = 0; - } - transfer_required = 0; - } else { - buffer_size = iov[0].iov_len; - if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { - pConvertor->gpu_buffer_ptr = NULL; - transfer_required = 0; - free_required = 0; - cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); - } else { - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); - } - transfer_required = 1; - free_required = 1; - destination = pConvertor->gpu_buffer_ptr; - } - } DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - total_packed = 0; cuda_streams->current_stream_id = 0; destination_base = destination; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); -#endif - thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; source_base = (unsigned char*)pConvertor->pBaseBuf; @@ 
-1224,7 +1190,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - total_packed += cached_cuda_iov_nb_bytes_list_h[i]; + *total_packed += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used++; } else { @@ -1250,41 +1216,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); - - pConvertor->bConverted += total_packed; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); -#endif - if (transfer_required) { - cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); - } -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - move_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); -#endif - - iov[0].iov_len = total_packed; - *max_data = total_packed; - *out_size = 1; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end_total ); - total_time = ELAPSED_TIME( start_total, end_total ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); -#endif - - if( pConvertor->bConverted == pConvertor->local_size ) { - pConvertor->flags |= CONVERTOR_COMPLETED; - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); - pConvertor->gpu_buffer_ptr = NULL; - } - return 1; - } - return 0; + return 
OPAL_SUCCESS; } void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 648036b1bb1..bb54dfeeb0a 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -376,7 +376,80 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached(pConvertor, iov, out_size, max_data); + size_t buffer_size; + unsigned char *source; + size_t total_unpacked; + uint8_t free_required = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + source = (unsigned char*)iov[0].iov_base; + free_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + } + source = pConvertor->gpu_buffer_ptr; + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); +#endif + + + buffer_size = iov[0].iov_len; + total_unpacked = 0; + + /* start unpack */ + if (cuda_iov_cache_enabled) { + 
opal_ddt_generic_simple_unpack_function_cuda_iov_cached(pConvertor, source, buffer_size, &total_unpacked); + } else { + opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached(pConvertor, source, buffer_size, &total_unpacked); + } + + pConvertor->bConverted += total_unpacked; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); + + iov[0].iov_len = total_unpacked; + *max_data = total_unpacked; + *out_size = 1; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; } #if 0 @@ -626,18 +699,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver #endif -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked) { uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size; - unsigned char *source, *source_base, *destination_base; - size_t total_unpacked; + unsigned char *source_base, *destination_base; uint8_t buffer_isfull = 0; - uint8_t free_required = 0; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; @@ -652,42 +719,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( 
opal_conver OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time, move_time; -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + TIMER_DATA_TYPE start, end; + long total_time; #endif - if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { - source = (unsigned char*)iov[0].iov_base; - free_required = 0; - } else { - if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { - cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); - pConvertor->gpu_buffer_ptr = NULL; - free_required = 0; - } else { - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); - } - source = pConvertor->gpu_buffer_ptr; - cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); - free_required = 1; - } - } - + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV non cached, GPU base %p, unpack from buffer %p, total size %ld\n", - pConvertor->pBaseBuf, source, iov[0].iov_len); ); -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - move_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); -#endif + pConvertor->pBaseBuf, source, buffer_size); ); opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); if (ddt_iov == NULL) { @@ -695,10 +732,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver return OPAL_ERROR; } - buffer_size = iov[0].iov_len; - total_unpacked = 0; cuda_streams->current_stream_id = 0; - thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; source_base = source; @@ -730,7 +764,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver GET_TIME(start); #endif - buffer_isfull = 
opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, &total_unpacked, &contig_disp, ¤t_ddt_iov_pos); + buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, total_unpacked, &contig_disp, ¤t_ddt_iov_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -759,41 +793,15 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } - pConvertor->bConverted += total_unpacked; - iov[0].iov_len = total_unpacked; - *max_data = total_unpacked; - *out_size = 1; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end_total ); - total_time = ELAPSED_TIME( start_total, end_total ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); -#endif - - if( pConvertor->bConverted == pConvertor->local_size ) { - pConvertor->flags |= CONVERTOR_COMPLETED; - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); - pConvertor->gpu_buffer_ptr = NULL; - } - return 1; - } - return 0; + return OPAL_SUCCESS; } -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked) { uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size; - unsigned char *source, *source_base, *destination_base, *destination; - size_t total_unpacked; + unsigned char *source_base, *destination_base; uint8_t 
buffer_isfull = 0; - uint8_t free_required = 0; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; @@ -809,58 +817,19 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time, move_time; -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + TIMER_DATA_TYPE start, end; + long total_time; #endif - if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { - source = (unsigned char*)iov[0].iov_base; - free_required = 0; - } else { - if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { - cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); - pConvertor->gpu_buffer_ptr = NULL; - free_required = 0; - } else { - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); - } - source = pConvertor->gpu_buffer_ptr; - cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); - free_required = 1; - } - } - + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV cached, GPU base %p, unpack from buffer %p, total size %ld\n", - pConvertor->pBaseBuf, source, iov[0].iov_len); ); -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - move_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); -#endif - + pConvertor->pBaseBuf, source, buffer_size); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - buffer_size = iov[0].iov_len; - total_unpacked = 0; + cuda_streams->current_stream_id = 0; source_base = source; - -#if defined (OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( 
start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); -#endif - thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; destination_base = (unsigned char*)pConvertor->pBaseBuf; @@ -899,7 +868,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; - total_unpacked += cuda_iov_partial_length_start; + *total_unpacked += cuda_iov_partial_length_start; buffer_size -= cuda_iov_partial_length_start; pConvertor->current_iov_partial_length = 0; cuda_iov_start_pos ++; @@ -912,13 +881,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; + *total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; } else { if (buffer_size > 0) { cuda_iov_partial_length_end = buffer_size; - total_unpacked += cuda_iov_partial_length_end; + *total_unpacked += cuda_iov_partial_length_end; nb_blocks_used ++; } buffer_size = 0; @@ -943,28 +912,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); - pConvertor->bConverted += total_unpacked; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); - - iov[0].iov_len = total_unpacked; - *max_data = total_unpacked; - *out_size = 1; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end_total ); - total_time = ELAPSED_TIME( start_total, end_total ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, 
total_time-move_time ); ); -#endif - - if( pConvertor->bConverted == pConvertor->local_size ) { - pConvertor->flags |= CONVERTOR_COMPLETED; - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); - pConvertor->gpu_buffer_ptr = NULL; - } - return 1; - } - return 0; + return OPAL_SUCCESS; } void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, From da23f82be318b4f79e725ab2d67e3bc891ec9bcd Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 1 Dec 2015 16:40:27 -0500 Subject: [PATCH 171/190] for non cached iov, if there is no enough cuda iov space, break --- opal/datatype/cuda/opal_datatype_cuda.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index b488ac4ab6c..2c76a327197 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -493,6 +493,9 @@ uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iove count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + if ((*nb_blocks_used + nb_blocks_per_description + 1) > (CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK)) { + break; + } DT_CUDA_DEBUG ( opal_cuda_output(10, "DDT IOV to CUDA IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; From 880a2331a4dc374b2ad40abbe496aff9193d4ce8 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 6 Nov 2015 23:23:33 -0500 Subject: [PATCH 172/190] cached iov is working for count = 1 check point use raw_cached, but cuda iov caching is not enabled check point, split iov into two version, 
non-cached and cached check point iov cache another checkpoint check point, cuda iov is cached, but not used for pack/unpack check point, ready to use cached cuda iov checkpoint, cached cuda iov is working with multiple send, but not for count > 1 checkpoint, fix a bug for partial unpack checkpoint, fix unpack size --- opal/datatype/cuda/opal_datatype_cuda.cu | 305 +---------- opal/datatype/cuda/opal_datatype_cuda.cuh | 30 +- .../cuda/opal_datatype_cuda_internal.cuh | 14 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 60 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 516 ++++++++++-------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 79 ++- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 427 ++++++++++----- opal/datatype/opal_datatype.h | 3 + opal/datatype/opal_datatype_create.c | 2 +- 9 files changed, 660 insertions(+), 776 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 2c76a327197..c92d44cfdbe 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -221,13 +221,11 @@ int32_t opal_ddt_cuda_kernel_init(void) for (j = 0; j < NB_STREAMS; j++) { cudaStreamCreate(&(cuda_streams->opal_cuda_stream[j])); cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - if (j == 0) { - cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); - } else { - cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; - } + cudaMallocHost((void 
**)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_h)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_d)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); @@ -265,18 +263,11 @@ int32_t opal_ddt_cuda_kernel_fini(void) cudaStreamDestroy(cuda_devices[i].cuda_streams->opal_cuda_stream[j]); cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block[j]; if (cuda_iov_pipeline_block != NULL) { - if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h != NULL) { - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); - cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h = NULL; - } - if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d != NULL) { - cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); - cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d = NULL; - } - if (cuda_iov_pipeline_block->cuda_iov_dist_cached_h != NULL) { - free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); - cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; - } + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); + cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); + 
cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_contig_buf_h); + cudaFree(cuda_iov_pipeline_block->cuda_iov_contig_buf_d); cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; @@ -296,13 +287,15 @@ void* opal_ddt_cached_cuda_iov_init(uint32_t size) { #if OPAL_DATATYPE_CUDA_IOV_CACHE ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)malloc(sizeof(ddt_cuda_iov_total_cached_t)); + ddt_cuda_iov_dist_cached_t *tmp_cuda_iov_d = NULL; + cudaMalloc((void **)(&tmp_cuda_iov_d), sizeof(ddt_cuda_iov_dist_cached_t) * size); uint32_t *tmp_nb_bytes = (uint32_t *)malloc(sizeof(uint32_t) * size); - if (tmp != NULL && tmp_nb_bytes != NULL) { - tmp->cuda_iov_dist_d = NULL; + if (tmp != NULL && tmp_cuda_iov_d != NULL && tmp_nb_bytes != NULL) { + tmp->cuda_iov_dist_d = tmp_cuda_iov_d; tmp->cuda_iov_count = size; tmp->cuda_iov_is_cached = 0; tmp->nb_bytes_h = tmp_nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, nb_bytes_h %p, size %d.\n", tmp, tmp_nb_bytes, size); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, cuda_iov_d %p, nb_bytes_h %p, size %d.\n", tmp, tmp_cuda_iov_d, tmp_nb_bytes, size); ); return tmp; } else { DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist_cached for ddt is failed.\n"); ); @@ -334,215 +327,13 @@ void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } -static inline int32_t opal_ddt_cached_cuda_iov_isfull(ddt_cuda_iov_total_cached_t *cached_cuda_iov, ddt_cuda_iov_dist_cached_t **cuda_iov_dist_h, uint32_t nb_blocks_used) -{ - if (nb_blocks_used < cached_cuda_iov->cuda_iov_count) { - return 0; - } else { -realloc_cuda_iov: - cached_cuda_iov->nb_bytes_h = (uint32_t *)realloc(cached_cuda_iov->nb_bytes_h, 
sizeof(uint32_t)*cached_cuda_iov->cuda_iov_count*2); - assert(cached_cuda_iov->nb_bytes_h != NULL); - cached_cuda_iov->cuda_iov_count *= 2; - if (nb_blocks_used >= cached_cuda_iov->cuda_iov_count) { - goto realloc_cuda_iov; - } - return 1; - } -} - -/* cached_cuda_iov_d is not ready until explicitlt sync with cuda stream 0 -*/ -int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count) -{ - uint32_t i, j; - uint32_t count_desc, nb_blocks_per_description, residue_desc; - uint32_t thread_per_block, nb_blocks_used; - size_t length_per_iovec; - uint8_t alignment; - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; - ddt_cuda_iov_dist_cached_t *cached_cuda_iov_dist_d = NULL; - ddt_cuda_iov_dist_cached_t *cuda_iov_dist_h = NULL; - cudaStream_t *cuda_stream_iov = NULL; - const struct iovec *ddt_iov = NULL; - uint32_t ddt_iov_count = 0; - size_t ncontig_disp_base; - size_t contig_disp = 0; - uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; - - opal_datatype_t *datatype = (opal_datatype_t *)pConvertor->pDesc; - - opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); - if (ddt_iov == NULL) { - DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); - return OPAL_ERROR; - } - - - cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)opal_ddt_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); - if (cached_cuda_iov == NULL) { - DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not init cuda iov\n");); - return OPAL_ERROR; - } - cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; - cuda_iov_dist_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - thread_per_block = CUDA_WARP_SIZE * 5; - - for (i = 0; i < ddt_iov_count; i++) { - length_per_iovec = ddt_iov[i].iov_len; - ncontig_disp_base = 
(size_t)(ddt_iov[i].iov_base); - - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); - if (opal_ddt_cached_cuda_iov_isfull(cached_cuda_iov, &(cuda_iov_pipeline_block->cuda_iov_dist_cached_h), nb_blocks_used + nb_blocks_per_description + 1)) { - cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; - cuda_iov_dist_h = (ddt_cuda_iov_dist_cached_t *)realloc(cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*cached_cuda_iov->cuda_iov_count); - assert(cuda_iov_dist_h != NULL); - cuda_iov_pipeline_block->cuda_iov_dist_cached_h = cuda_iov_dist_h; - } - - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; - cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; - if ( (j+1) * thread_per_block <= count_desc) { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; - } else { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - // assert (nb_blocks_used < 
NUM_CUDA_IOV_PER_DDT); - } - - /* handle residue */ - if (residue_desc != 0) { - cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; - cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - //assert (nb_blocks_used < NUM_CUDA_IOV_PER_DDT); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - } - } - /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ - cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; - cudaMalloc((void **)(&cached_cuda_iov_dist_d), sizeof(ddt_cuda_iov_dist_cached_t) * (nb_blocks_used+1)); - if (cached_cuda_iov_dist_d == NULL) { - DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not malloc cuda iov in GPU\n");); - return OPAL_ERROR; - } - cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); - cached_cuda_iov->cuda_iov_dist_d = cached_cuda_iov_dist_d; - datatype->cached_cuda_iov = (unsigned char*)cached_cuda_iov; - *cuda_iov_count = nb_blocks_used; - return OPAL_SUCCESS; -} - -uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, size_t *buffer_size, uint32_t 
*nb_blocks_used, size_t *total_converted, size_t *contig_disp_out, uint32_t *current_ddt_iov_pos) -{ - size_t ncontig_disp_base; - size_t contig_disp = 0; - size_t current_cuda_iov_length = 0; - uint8_t buffer_isfull = 0; - uint8_t alignment; - uint32_t count_desc, nb_blocks_per_description, residue_desc; - uint32_t thread_per_block; - size_t length_per_iovec; - uint32_t i, j; - - thread_per_block = CUDA_WARP_SIZE * 5; - - for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { - if (pConvertor->current_iov_partial_length > 0) { - ncontig_disp_base = (size_t)(ddt_iov[i].iov_base) + ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; - length_per_iovec = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); - length_per_iovec = ddt_iov[i].iov_len; - } - if (*buffer_size < length_per_iovec) { - pConvertor->current_iov_pos = i; - pConvertor->current_iov_partial_length = length_per_iovec - *buffer_size; - length_per_iovec = *buffer_size; - buffer_isfull = 1; - } - *buffer_size -= length_per_iovec; - *total_converted += length_per_iovec; - - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - if ((*nb_blocks_used + nb_blocks_per_description + 1) > (CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK)) { - break; - } - DT_CUDA_DEBUG ( opal_cuda_output(10, "DDT IOV to CUDA IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; - cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; - if ( (j+1) * thread_per_block <= count_desc) { - 
current_cuda_iov_length = thread_per_block * alignment; - } else { - current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(current_cuda_iov_length > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - contig_disp += current_cuda_iov_length; - DT_CUDA_DEBUG( opal_cuda_output(12, "DDT IOV to CUDA IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[*nb_blocks_used].contig_disp, current_cuda_iov_length); ); - (*nb_blocks_used) ++; - assert (*nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - - /* handle residue */ - if (residue_desc != 0) { - cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; - cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; - current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(current_cuda_iov_length > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - contig_disp += current_cuda_iov_length; - DT_CUDA_DEBUG( opal_cuda_output(12, "DDT IOV to CUDA IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[*nb_blocks_used].contig_disp, current_cuda_iov_length); ); - (*nb_blocks_used) ++; - assert (*nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - } - cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; - *contig_disp_out = contig_disp; - *current_ddt_iov_pos = i; - return buffer_isfull; - -} - void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; if (datatype->cached_cuda_iov == NULL) { - *cached_cuda_iov = NULL; - } else { - 
*cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; - } + datatype->cached_cuda_iov = opal_ddt_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); + } + *cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; } void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) @@ -557,69 +348,11 @@ void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t c uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; - if (datatype->cached_cuda_iov == NULL) { - return 0; - } + assert(datatype->cached_cuda_iov != NULL); ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; return tmp->cuda_iov_is_cached; } -void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count) -{ - int i; - size_t iov_size = 0; - size_t ddt_size; - convertor->current_iov_partial_length = 0; - convertor->current_cuda_iov_pos = 0; - convertor->current_count = 0; - if (ddt_offset == 0) { - return; - } - opal_datatype_type_size(convertor->pDesc, &ddt_size); - convertor->current_count = ddt_offset / ddt_size; - ddt_offset = ddt_offset % ddt_size; - for(i = 0; i < cuda_iov_count; i++) { - iov_size += cached_cuda_iov_nb_bytes_list_h[i]; - if (iov_size > ddt_offset) { - convertor->current_iov_partial_length = iov_size - ddt_offset; - convertor->current_cuda_iov_pos = i; - break; - } else if (iov_size == ddt_offset){ - convertor->current_iov_partial_length = 0; - convertor->current_cuda_iov_pos = i+1; - break; - } - } -} - -void opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count) -{ - int i; - size_t iov_size = 0; - size_t ddt_size; - convertor->current_iov_partial_length = 0; - 
convertor->current_iov_pos = 0; - convertor->current_count = 0; - if (ddt_offset == 0) { - return; - } - opal_datatype_type_size(convertor->pDesc, &ddt_size); - convertor->current_count = ddt_offset / ddt_size; - ddt_offset = ddt_offset % ddt_size; - for(i = 0; i < ddt_iov_count; i++) { - iov_size += ddt_iov[i].iov_len; - if (iov_size > ddt_offset) { - convertor->current_iov_partial_length = iov_size - ddt_offset; - convertor->current_iov_pos = i; - break; - } else if (iov_size == ddt_offset){ - convertor->current_iov_partial_length = 0; - convertor->current_iov_pos = i+1; - break; - } - } -} - void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) { #if 0 diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index c33ff606bd9..d0f9ba2f27f 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -29,13 +29,25 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon uint32_t* out_size, size_t* max_data ); -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed); - -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked); +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed); +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* 
pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked); +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -117,14 +129,6 @@ uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor); void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); -void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); - -void opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count); - -int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count); - -uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, size_t *buffer_size, uint32_t *nb_blocks_used, size_t *total_packed, size_t *contig_disp_out, uint32_t *current_ddt_iov_pos); - } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 72edcb3d8a3..2fc03173f51 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -59,8 +59,8 @@ typedef struct { } ddt_cuda_iov_dist_non_cached_t; typedef struct { - size_t ncontig_disp; - size_t contig_disp; + size_t ptr_offset; + uint32_t nb_bytes; } ddt_cuda_iov_dist_cached_t; typedef struct { @@ -71,9 +71,11 @@ typedef struct { } 
ddt_cuda_iov_total_cached_t; typedef struct { - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_h; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_d; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_h; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_d; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; + uintptr_t *cuda_iov_contig_buf_h; + uintptr_t *cuda_iov_contig_buf_d; cudaStream_t *cuda_stream; int32_t cuda_stream_id; cudaEvent_t cuda_event; @@ -137,9 +139,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu 
b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 2564fe1393c..8ddbe77d99d 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,64 +88,50 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base) { uint32_t i, j; - uint32_t _nb_bytes; - size_t src_offset, dst_offset; + size_t src_offset; + unsigned char *dst; unsigned char *_source_tmp, *_destination_tmp; - uint32_t current_cuda_iov_pos = cuda_iov_pos; - size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; - size_t contig_disp; - uint32_t _my_cuda_iov_pos; - uint32_t _my_cuda_iov_iteration; - size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; __shared__ uint32_t nb_tasks; - uint32_t copy_count; - uint8_t alignment; + __shared__ uint32_t copy_count; + __shared__ uint8_t alignment; if (threadIdx.x == 0) { nb_tasks = nb_blocks_used / gridDim.x; if (blockIdx.x < (nb_blocks_used % gridDim.x)) { nb_tasks ++; } - // printf("cuda_iov_count %d, ddt_extent %d, current_count %d\n", cuda_iov_count, ddt_extent, current_count); - // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); } __syncthreads(); for (i = 0; i < nb_tasks; i++) { - /* these 3 variables are used multiple times, so put in in register */ - _my_cuda_iov_pos = 
(blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; - _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; - contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; + dst = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; - src_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; - dst_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - destination_disp; - _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; - - _source_tmp = source_base + src_offset; - _destination_tmp = destination_base + dst_offset; - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { - alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { - alignment = ALIGNMENT_FLOAT; - } else { - alignment = ALIGNMENT_CHAR; - } - copy_count = _nb_bytes / alignment; - /* - if (threadIdx.x == 0 && nb_tasks != 0) { - printf("pack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + if (threadIdx.x == 0) { + _source_tmp = source_base + src_offset; + _destination_tmp = dst; + uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && 
(uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + copy_count = _nb_bytes / alignment; } __syncthreads(); - */ + for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; - _destination_tmp = destination_base + dst_offset + j * alignment; + _destination_tmp = dst + j * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 0137601bf70..55cb955808e 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -664,102 +664,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve uint32_t* out_size, size_t* max_data ) { - size_t buffer_size; - unsigned char *destination; - size_t total_packed; - uint8_t transfer_required, free_required; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time, move_time; -#endif - - // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); - if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { - if (iov[0].iov_len == 0) { - buffer_size = DT_CUDA_BUFFER_SIZE; - } else { - buffer_size = iov[0].iov_len; - } - - if (iov[0].iov_base == NULL) { - iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); - destination = (unsigned char *)iov[0].iov_base; - pConvertor->gpu_buffer_ptr = destination; - free_required = 1; - } else { - destination = (unsigned char *)iov[0].iov_base; - free_required = 0; - } - transfer_required = 0; - } else { - buffer_size = iov[0].iov_len; - if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { - 
pConvertor->gpu_buffer_ptr = NULL; - transfer_required = 0; - free_required = 0; - cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); - } else { - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); - } - transfer_required = 1; - free_required = 1; - destination = pConvertor->gpu_buffer_ptr; - } - } - - total_packed = 0; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); -#endif - - /* start pack */ - if (cuda_iov_cache_enabled) { - opal_ddt_generic_simple_pack_function_cuda_iov_cached(pConvertor, destination, buffer_size, &total_packed); - } else { - opal_ddt_generic_simple_pack_function_cuda_iov_non_cached(pConvertor, destination, buffer_size, &total_packed); - } - - pConvertor->bConverted += total_packed; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); -#endif - if (transfer_required) { - cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); - } -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - move_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); -#endif - - iov[0].iov_len = total_packed; - *max_data = total_packed; - *out_size = 1; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end_total ); - total_time = ELAPSED_TIME( start_total, end_total ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); -#endif - - if( pConvertor->bConverted == pConvertor->local_size ) { - pConvertor->flags |= CONVERTOR_COMPLETED; - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); - pConvertor->gpu_buffer_ptr = NULL; - } - 
return 1; - } - return 0; + return opal_ddt_generic_simple_pack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); } -#if 0 - int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -1020,177 +927,279 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto return 0; } -#endif - - -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed) +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { - uint32_t i; + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - unsigned char *destination_base, *source_base; - uint8_t buffer_isfull = 0; + size_t length, buffer_size, length_per_iovec; + unsigned char *destination, *destination_base, *source_base, *source; + size_t total_packed, packed_w_cache ,packed_wo_cache; + int32_t complete_flag = 0; + uint8_t buffer_isfull = 0, transfer_required, free_required; + uint32_t convertor_flags; +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// dt_stack_t* pStack; + uint8_t alignment, orig_alignment; +// int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + uintptr_t *cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; - size_t contig_disp = 0; - uint32_t ddt_iov_start_pos, ddt_iov_end_pos, current_ddt_iov_pos; - OPAL_PTRDIFF_TYPE ddt_extent; + 
size_t iov_len = 0; + uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; + ddt_cuda_iov_total_cached_t* cached_cuda_iov; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; + uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; + uint32_t cached_cuda_iov_count = 0; + uint8_t cuda_iov_is_cached = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end; - long total_time; + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; #endif - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV non cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + /*description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pElem = &(description[pStack->index]); + printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); + */ - opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); - if (ddt_iov == NULL) { - DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); - return OPAL_ERROR; - } +// assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); + + // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (iov[0].iov_len == 0) { + buffer_size = DT_CUDA_BUFFER_SIZE; + } else { + buffer_size = iov[0].iov_len; + } + + if (iov[0].iov_base == NULL) { + iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char *)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; + free_required = 1; + } else { + destination = (unsigned char *)iov[0].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + 
cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + } + transfer_required = 1; + free_required = 1; + destination = pConvertor->gpu_buffer_ptr; + } + } + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + + total_packed = 0; + packed_wo_cache = 0; + packed_w_cache = 0; cuda_streams->current_stream_id = 0; - thread_per_block = CUDA_WARP_SIZE * 5; - nb_blocks = 256; - opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); - source_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + // orig_stack_index = pStack->index; destination_base = destination; - - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } - - while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { - - nb_blocks_used = 0; - ddt_iov_start_pos = pConvertor->current_iov_pos; - ddt_iov_end_pos = ddt_iov_start_pos + IOV_PIPELINE_SIZE; - if (ddt_iov_end_pos > ddt_iov_count) { - ddt_iov_end_pos = ddt_iov_count; - } - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start_total); #endif - - buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, total_packed, &contig_disp, ¤t_ddt_iov_pos); - -#if 
defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); #endif - - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, source_base, destination_base); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; - destination_base += contig_disp; - - if (!buffer_isfull) { - pConvertor->current_iov_pos = current_ddt_iov_pos; - if (current_ddt_iov_pos == ddt_iov_count) { - pConvertor->current_count ++; - pConvertor->current_iov_pos = 0; - source_base += ddt_extent; - } - } - - } - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } - - return OPAL_SUCCESS; -} - -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed) -{ - uint32_t i; - uint32_t nb_blocks, thread_per_block, nb_blocks_used; - unsigned char *destination_base, *source_base; - uint8_t buffer_isfull = 0; - cudaError_t cuda_err; - ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - cudaStream_t *cuda_stream_iov = NULL; - uint32_t cuda_iov_start_pos, cuda_iov_end_pos; - ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; - 
ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; - uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; - uint32_t cached_cuda_iov_count = 0; - opal_datatype_count_t convertor_current_count; - OPAL_PTRDIFF_TYPE ddt_extent; + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + assert(ddt_iov != NULL); + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + cuda_iov_is_cached = cached_cuda_iov->cuda_iov_is_cached; + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end; - long total_time; + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); #endif - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - - cuda_streams->current_stream_id = 0; - destination_base = destination; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; source_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { + + iov_start_pos = pConvertor->current_iov_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + + while (iov_start_pos < iov_end_pos && !buffer_isfull) { + + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_contig_buf_h_current = 
cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; + cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - if (opal_ddt_cache_cuda_iov(pConvertor, &nb_blocks_used) == OPAL_SUCCESS) { - opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); - } else { - DT_CUDA_DEBUG ( opal_cuda_output(0, "Pack cache cuda iov is failed\n");); - return OPAL_ERROR; - } + + for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + iov_len = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + iov_len = ddt_iov[i].iov_len; + } + if (buffer_size >= iov_len) { + length_per_iovec = iov_len; + } else { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + pConvertor->current_iov_partial_length = iov_len - length_per_iovec; + pConvertor->current_iov_pos = i; + } + buffer_size -= length_per_iovec; + packed_wo_cache += length_per_iovec; + source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + source_base; + + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc 
+ thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); +#endif /* 
OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + + if (!buffer_isfull) { + pConvertor->current_iov_pos = i; + } + #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack cuda iov is cached in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif + + // opal_ddt_check_cuda_iov_is_full(pConvertor, pConvertor->current_cuda_iov_pos + nb_blocks_used); /* make sure cuda iov has enough space */ + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + cuda_err = 
cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + pConvertor->current_cuda_iov_pos += nb_blocks_used; + + // orig_stack_index = pStack->index; + iov_start_pos = iov_end_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos >= ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + /* count = 0 done, iov cached finished */ + if (pConvertor->current_iov_pos == ddt_iov_count) { + pConvertor->current_count ++; + opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); + } + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); + } } - - /* now we use cached cuda iov */ - opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); - cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; - assert(cached_cuda_iov_dist_d != NULL); - cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; - assert(cached_cuda_iov_nb_bytes_list_h != NULL); - - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; - cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; - cuda_iov_end_pos = cached_cuda_iov_count; - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - convertor_current_count = pConvertor->current_count; + total_packed += packed_wo_cache; + pConvertor->bConverted += packed_wo_cache; + + /* now we use cached cuda iov */ + if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = 
current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; + cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - *total_packed += cached_cuda_iov_nb_bytes_list_h[i]; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; + destination += cached_cuda_iov_nb_bytes_list_h[i]; + packed_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used++; } else { @@ -1198,26 +1207,61 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* break; } } - if (!buffer_isfull) { - pConvertor->current_count ++; - cuda_iov_start_pos = 0; - cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; - } - } + printf("nb_blocks_used %d, my %d\n", nb_blocks_used, i - cuda_iov_start_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + pConvertor->current_cuda_iov_pos += nb_blocks_used; + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), 
cudaMemcpyHostToDevice, *cuda_stream_iov); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + } + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + total_packed += packed_w_cache; + pConvertor->bConverted += packed_w_cache; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); #endif - opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, source_base, 
destination_base); - pConvertor->current_cuda_iov_pos += nb_blocks_used; - pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif - return OPAL_SUCCESS; + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; } void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index f6ee8e0bfc4..c553a7991b0 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,77 +46,60 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, 
j; - size_t dst_offset, src_offset; + size_t dst_offset; + unsigned char *src; unsigned char *_source_tmp, *_destination_tmp; - uint32_t _nb_bytes; - uint32_t current_cuda_iov_pos = cuda_iov_pos; - size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; - size_t source_partial_disp = 0; - size_t contig_disp; - uint32_t _my_cuda_iov_pos; - uint32_t _my_cuda_iov_iteration; - size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; - + __shared__ uint32_t nb_tasks; - uint32_t copy_count; - uint8_t alignment; + __shared__ uint32_t copy_count; + __shared__ uint8_t alignment; if (threadIdx.x == 0) { nb_tasks = nb_blocks_used / gridDim.x; if (blockIdx.x < nb_blocks_used % gridDim.x) { nb_tasks ++; } - // printf("cuda_iov_count %d, ddt_extent %d, current_count %d, ddt_size %d\n", cuda_iov_count, ddt_extent, current_count, ddt_size); } __syncthreads(); - if (cuda_iov_partial_length_start != 0) { - source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; - } - for (i = 0; i < nb_tasks; i++) { - /* these 3 variables are used multiple times, so put in in register */ - _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; - _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; - contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; - - src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp - source_partial_disp; - dst_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; - _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; - + src = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - src_offset = contig_disp + ddt_size * 
_my_cuda_iov_iteration - source_disp; - dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; - _nb_bytes = cuda_iov_partial_length_start; - } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { - _nb_bytes = cuda_iov_partial_length_end; - } - - _destination_tmp = destination_base + dst_offset; - _source_tmp = source_base + src_offset; - if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { - alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { - alignment = ALIGNMENT_FLOAT; - } else { - alignment = ALIGNMENT_CHAR; + // if (threadIdx.x == 0) printf("cuda_iov_partial_length_start %d", cuda_iov_partial_length_start); + dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes - cuda_iov_partial_length_start; } - copy_count = _nb_bytes / alignment; - /* - if (threadIdx.x == 0 && nb_tasks != 0) { - printf("unpack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + if (threadIdx.x == 0) { + _source_tmp = src; + _destination_tmp = destination_base + dst_offset; + uint32_t _nb_bytes = 0; + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + _nb_bytes = cuda_iov_partial_length_start; + } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { + _nb_bytes = cuda_iov_partial_length_end; + } else { + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; + } + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if 
((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_source_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + copy_count = _nb_bytes / alignment; } __syncthreads(); - */ + for (j = threadIdx.x; j < copy_count; j += blockDim.x) { /* if (threadIdx.x == 0) { if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); }*/ if (j < copy_count) { - _source_tmp = source_base + src_offset + j * alignment; + _source_tmp = src + j * alignment; _destination_tmp = destination_base + dst_offset + j * alignment; /* if (threadIdx.x == 0) { printf("_src %p, dst %p, alignment %d, blk %d, j %d, count %d\n", _source_tmp, _destination_tmp, alignment, blockIdx.x, j, copy_count); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index bb54dfeeb0a..5092c7ec806 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -376,6 +376,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon uint32_t* out_size, size_t* max_data ) { +<<<<<<< HEAD size_t buffer_size; unsigned char *source; size_t total_unpacked; @@ -453,6 +454,11 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon } #if 0 +======= + return opal_ddt_generic_simple_unpack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); +} + +>>>>>>> cached iov is working for count = 1 int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -697,197 +703,291 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver return 0; } -#endif - -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, 
size_t *total_unpacked) +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) { - uint32_t i; + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - unsigned char *source_base, *destination_base; + size_t length, buffer_size, length_per_iovec; + unsigned char *source, *source_base, *destination_base, *destination; + size_t total_unpacked, unpacked_wo_cache, unpacked_w_cache; + int32_t complete_flag = 0; uint8_t buffer_isfull = 0; + uint8_t free_required = 0; + uint32_t convertor_flags; +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// dt_stack_t* pStack; + uint8_t alignment, orig_alignment; +// int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + uintptr_t *cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; - size_t contig_disp = 0; - uint32_t ddt_iov_start_pos, ddt_iov_end_pos, current_ddt_iov_pos; - OPAL_PTRDIFF_TYPE ddt_extent; + size_t iov_len = 0; + uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; + ddt_cuda_iov_total_cached_t* cached_cuda_iov; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; + uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; + uint32_t cached_cuda_iov_count = 0; + uint8_t cuda_iov_is_cached = 0; + size_t cuda_iov_partial_length_start = 0; + size_t cuda_iov_partial_length_end = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end; - long total_time; + TIMER_DATA_TYPE start, end, start_total, end_total; 
+ long total_time, move_time; #endif - - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV non cached, GPU base %p, unpack from buffer %p, total size %ld\n", - pConvertor->pBaseBuf, source, buffer_size); ); - - opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); - if (ddt_iov == NULL) { - DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); - return OPAL_ERROR; - } - - cuda_streams->current_stream_id = 0; - thread_per_block = CUDA_WARP_SIZE * 5; - nb_blocks = 256; - source_base = source; - opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); - opal_ddt_set_ddt_iov_position(pConvertor, pConvertor->bConverted, ddt_iov, ddt_iov_count); - destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; - - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } - while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { - - nb_blocks_used = 0; - ddt_iov_start_pos = pConvertor->current_iov_pos; - ddt_iov_end_pos = ddt_iov_start_pos + IOV_PIPELINE_SIZE; - if (ddt_iov_end_pos > ddt_iov_count) { - ddt_iov_end_pos = ddt_iov_count; - } - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; - cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); - - -#if defined (OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); #endif - buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, total_unpacked, &contig_disp, ¤t_ddt_iov_pos); +/* description = pConvertor->use_desc->desc; + 
pStack = pConvertor->pStack + pConvertor->stack_pos; + pElem = &(description[pStack->index]); + printf("size elem %d, size %lu\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); +*/ #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME(start); #endif - - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, destination_base, source_base, 0, 0); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; - source_base += contig_disp; - if (!buffer_isfull) { - pConvertor->current_iov_pos = current_ddt_iov_pos; - if (current_ddt_iov_pos == ddt_iov_count) { - pConvertor->current_count ++; - pConvertor->current_iov_pos = 0; - destination_base += ddt_extent; + if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + source = (unsigned char*)iov[0].iov_base; + free_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); } + source = pConvertor->gpu_buffer_ptr; + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + 
free_required = 1; } } - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } - - return OPAL_SUCCESS; -} - -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked) -{ - uint32_t i; - uint32_t nb_blocks, thread_per_block, nb_blocks_used; - unsigned char *source_base, *destination_base; - uint8_t buffer_isfull = 0; - cudaError_t cuda_err; - ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - cudaStream_t *cuda_stream_iov = NULL; - uint32_t cuda_iov_start_pos, cuda_iov_end_pos; - ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; - ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; - uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; - uint32_t cached_cuda_iov_count = 0; - size_t cuda_iov_partial_length_start = 0; - size_t cuda_iov_partial_length_end = 0; - opal_datatype_count_t convertor_current_count; - OPAL_PTRDIFF_TYPE ddt_extent; - + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV cached, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, iov[0].iov_len); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end; - long total_time; + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); #endif - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV cached, GPU base %p, unpack from buffer %p, total size %ld\n", - pConvertor->pBaseBuf, source, buffer_size); ); +// cuda_err = cudaEventRecord(current_cuda_device->memcpy_event, current_cuda_device->cuda_streams->opal_cuda_stream[0]); +// opal_cuda_check_error(cuda_err); + #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - + buffer_size = iov[0].iov_len; + total_unpacked = 0; 
+ unpacked_wo_cache = 0; + unpacked_w_cache = 0; cuda_streams->current_stream_id = 0; + convertor_flags = pConvertor->flags; +// orig_stack_index = pStack->index; source_base = source; + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + assert(ddt_iov != NULL); + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + cuda_iov_is_cached = cached_cuda_iov->cuda_iov_is_cached; + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); +#endif + thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; destination_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { + + iov_start_pos = pConvertor->current_iov_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos > ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + + while (iov_start_pos < iov_end_pos && !buffer_isfull) { + + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; + cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + 
pConvertor->current_cuda_iov_pos; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov pos %d\n", pConvertor->current_cuda_iov_pos);); + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + + #if defined (OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - if (opal_ddt_cache_cuda_iov(pConvertor, &nb_blocks_used) == OPAL_SUCCESS) { - opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); - } + + for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + iov_len = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + iov_len = ddt_iov[i].iov_len; + } + if (buffer_size >= iov_len) { + length_per_iovec = iov_len; + } else { + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + pConvertor->current_iov_partial_length = iov_len - length_per_iovec; + pConvertor->current_iov_pos = i; + } + buffer_size -= length_per_iovec; + unpacked_wo_cache += length_per_iovec; + destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; + + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - 
destination_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + } + + /* handle residue */ + if (residue_desc != 0) { + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], 
cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + } + } + + if (!buffer_isfull) { + pConvertor->current_iov_pos = i; + } + #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cuda iov is cached in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, 0, 0); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + pConvertor->current_cuda_iov_pos += nb_blocks_used; + + iov_start_pos = iov_end_pos; + iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; + if (iov_end_pos >= ddt_iov_count) { + iov_end_pos = ddt_iov_count; + } + /* finished */ + if (pConvertor->current_iov_pos == ddt_iov_count) { + pConvertor->current_count ++; + opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); + } + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start pos %d end pos %d, submit to CUDA 
stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); + + } } - + total_unpacked += unpacked_wo_cache; + pConvertor->bConverted += unpacked_wo_cache; +#if 1 /* now we use cached cuda iov */ - opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); - cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; - assert(cached_cuda_iov_dist_d != NULL); - cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; - assert(cached_cuda_iov_nb_bytes_list_h != NULL); - - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; - opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); - cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; - cuda_iov_end_pos = cached_cuda_iov_count; - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - convertor_current_count = pConvertor->current_count; - - if (pConvertor->current_iov_partial_length > 0) { - cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; - *total_unpacked += cuda_iov_partial_length_start; - buffer_size -= cuda_iov_partial_length_start; - pConvertor->current_iov_partial_length = 0; - cuda_iov_start_pos ++; - nb_blocks_used ++; - } - + if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; + cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; + cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, 
cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + if (pConvertor->current_iov_partial_length > 0) { + cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; + unpacked_w_cache += cuda_iov_partial_length_start; + buffer_size -= cuda_iov_partial_length_start; + pConvertor->current_iov_partial_length = 0; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + source += cuda_iov_partial_length_start; + cuda_iov_start_pos ++; + nb_blocks_used ++; + } for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - *total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + source += cached_cuda_iov_nb_bytes_list_h[i]; + unpacked_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; } else { if (buffer_size > 0) { cuda_iov_partial_length_end = buffer_size; - *total_unpacked += cuda_iov_partial_length_end; + unpacked_w_cache += cuda_iov_partial_length_end; + cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; + source += cuda_iov_partial_length_end; + pConvertor->current_iov_partial_length = cached_cuda_iov_nb_bytes_list_h[i] - cuda_iov_partial_length_end; nb_blocks_used ++; } buffer_size = 0; @@ -895,24 +995,53 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ break; } } - if (!buffer_isfull) { - pConvertor->current_count ++; - cuda_iov_start_pos = 0; - cuda_iov_end_pos = cached_cuda_iov_count; +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel 
submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + if (pConvertor->current_iov_partial_length > 0) { + pConvertor->current_cuda_iov_pos += nb_blocks_used - 1; + } else { + pConvertor->current_cuda_iov_pos += nb_blocks_used; } + cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; } -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + for (i = 0; i < NB_STREAMS; i++) 
{ + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } - return OPAL_SUCCESS; + total_unpacked += unpacked_w_cache; + pConvertor->bConverted += unpacked_w_cache; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); + + iov[0].iov_len = total_unpacked; + *max_data = total_unpacked; + *out_size = 1; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; } void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index a3a6898dd89..1337f2cc57c 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -131,6 +131,9 @@ struct opal_datatype_t { int iov_count; size_t max_data; /* size: 416, cachelines: 7, members: 18 */ +#if OPAL_CUDA_SUPPORT + void * cached_cuda_iov; +#endif /* OPAL_CUDA_SUPPORT */ /* last cacheline: 32 bytes */ struct iovec* cached_iovec; diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index e57a7d6c668..44c0e3020b6 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -102,7 +102,7 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) #if OPAL_CUDA_SUPPORT /* free cuda iov */ if (opal_datatype_cuda_kernel_support == 1 && datatype->cached_cuda_iov != NULL) { - opal_cached_cuda_iov_fini((void*)datatype->cached_cuda_iov); + opal_cached_cuda_iov_fini(datatype->cached_cuda_iov); datatype->cached_cuda_iov = NULL; } 
#endif /* OPAL_CUDA_SUPPORT */ From 7b26aaab6aecab2c9fdf6a08c0ce4263083751e1 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 11 Nov 2015 17:49:25 -0500 Subject: [PATCH 173/190] cache the entire cuda iov checkpoint, during unpack, cache the entire iov before unpack another checkpoint checkpoint , remove unnecessary cuda stream sync use bit to replace % rollback to use %, not bit, since it is faster, not sure why --- opal/datatype/cuda/opal_datatype_cuda.cu | 14 ++ opal/datatype/cuda/opal_datatype_cuda.cuh | 2 + .../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 35 ++- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 203 ++++++------------ .../cuda/opal_datatype_unpack_cuda_kernel.cu | 48 ++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 203 ++++++------------ test/datatype/ddt_benchmark.c | 4 +- 8 files changed, 188 insertions(+), 325 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index c92d44cfdbe..e3fe580cf76 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -353,6 +353,20 @@ uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) return tmp->cuda_iov_is_cached; } +void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count) +{ + int i; + size_t iov_size = 0; + for(i = 0; i < cuda_iov_count; i++) { + iov_size += cached_cuda_iov_nb_bytes_list_h[i]; + if (iov_size > ddt_offset) { + convertor->current_iov_partial_length = iov_size - ddt_offset; + convertor->current_cuda_iov_pos = i; + break; + } + } +} + void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) { #if 0 diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index d0f9ba2f27f..96a045f66cd 100644 --- 
a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -129,6 +129,8 @@ uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor); void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); +void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 2fc03173f51..528f4e17c91 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -139,9 +139,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); 
void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 8ddbe77d99d..93fb188ddcd 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,16 +88,17 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base) { - uint32_t i, j; + uint32_t i, j, _nb_bytes; size_t src_offset; unsigned char *dst; unsigned char *_source_tmp, *_destination_tmp; + uint32_t current_cuda_iov_pos = cuda_iov_pos; __shared__ uint32_t nb_tasks; - __shared__ uint32_t copy_count; - __shared__ uint8_t alignment; + uint32_t copy_count; + uint8_t alignment; if (threadIdx.x == 0) { nb_tasks = nb_blocks_used / gridDim.x; @@ -109,24 +110,20 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; dst = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; - if (threadIdx.x == 0) { - _source_tmp = source_base + src_offset; - _destination_tmp = dst; - uint32_t _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE 
== 0 && (uintptr_t)_destination_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { - alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_destination_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { - alignment = ALIGNMENT_FLOAT; - } else { - alignment = ALIGNMENT_CHAR; - } - copy_count = _nb_bytes / alignment; + _source_tmp = source_base + src_offset; + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)dst % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)dst % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; } - __syncthreads(); + copy_count = _nb_bytes / alignment; for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 55cb955808e..1d14c000977 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -937,7 +937,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *destination, *destination_base, *source_base, *source; - size_t total_packed, packed_w_cache ,packed_wo_cache; + size_t total_packed; int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; uint32_t convertor_flags; @@ -948,19 +948,21 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; 
- ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; - uintptr_t *cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; + uintptr_t *cuda_iov_contig_buf_h_current = NULL; + uintptr_t *cuda_iov_contig_buf_d_current = NULL; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; size_t iov_len = 0; uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; - ddt_cuda_iov_total_cached_t* cached_cuda_iov; - ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; - uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; @@ -1015,8 +1017,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); total_packed = 0; - packed_wo_cache = 0; - packed_w_cache = 0; cuda_streams->current_stream_id = 0; // orig_stack_index = pStack->index; destination_base = destination; @@ -1036,8 +1036,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; - cuda_iov_is_cached = 
cached_cuda_iov->cuda_iov_is_cached; DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1052,133 +1050,69 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { - - iov_start_pos = pConvertor->current_iov_pos; - iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - - while (iov_start_pos < iov_end_pos && !buffer_isfull) { - - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; - cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { - if (pConvertor->current_iov_partial_length > 0) { - iov_len = 
pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - iov_len = ddt_iov[i].iov_len; - } - if (buffer_size >= iov_len) { - length_per_iovec = iov_len; + for (i = 0; i < ddt_iov_count; i++) { + length_per_iovec = ddt_iov[i].iov_len; + source = (size_t)(ddt_iov[i].iov_base) + source_base; + + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; } else { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - length_per_iovec = buffer_size / orig_alignment * orig_alignment; - buffer_isfull = 1; - pConvertor->current_iov_partial_length = iov_len - length_per_iovec; - pConvertor->current_iov_pos = i; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; } - buffer_size -= length_per_iovec; - packed_wo_cache += length_per_iovec; - source = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + source_base; - - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( 
opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; - if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; - } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; - } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - - /* handle residue */ - if (residue_desc != 0) { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( 
opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - destination += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - } - - if (!buffer_isfull) { - pConvertor->current_iov_pos = i; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } - + } + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, 
sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack cached cuda iov is prepared in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used); ); #endif - - // opal_ddt_check_cuda_iov_is_full(pConvertor, pConvertor->current_cuda_iov_pos + nb_blocks_used); /* make sure cuda iov has enough space */ - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; - pConvertor->current_cuda_iov_pos += nb_blocks_used; - - // orig_stack_index = pStack->index; - iov_start_pos = iov_end_pos; - iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; - if (iov_end_pos >= ddt_iov_count) { - 
iov_end_pos = ddt_iov_count; - } - /* count = 0 done, iov cached finished */ - if (pConvertor->current_iov_pos == ddt_iov_count) { - pConvertor->current_count ++; - opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); - } - DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); - } } - total_packed += packed_wo_cache; - pConvertor->bConverted += packed_wo_cache; - + + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; /* now we use cached cuda iov */ if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { @@ -1188,10 +1122,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -1199,7 +1130,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; destination += cached_cuda_iov_nb_bytes_list_h[i]; - packed_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + total_packed += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used++; } else { @@ -1207,28 +1138,22 @@ int32_t 
opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* break; } } - printf("nb_blocks_used %d, my %d\n", nb_blocks_used, i - cuda_iov_start_pos); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - pConvertor->current_cuda_iov_pos += nb_blocks_used; cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + pConvertor->current_cuda_iov_pos += nb_blocks_used; } for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } - - total_packed += packed_w_cache; - pConvertor->bConverted += packed_w_cache; + + pConvertor->bConverted += total_packed; DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index c553a7991b0..f98a8c0b2ea 100644 --- 
a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,16 +46,18 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; size_t dst_offset; unsigned char *src; unsigned char *_source_tmp, *_destination_tmp; + uint32_t _nb_bytes; + uint32_t current_cuda_iov_pos = cuda_iov_pos; __shared__ uint32_t nb_tasks; - __shared__ uint32_t copy_count; - __shared__ uint8_t alignment; + uint32_t copy_count; + uint8_t alignment; if (threadIdx.x == 0) { nb_tasks = nb_blocks_used / gridDim.x; @@ -67,32 +69,28 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ for (i = 0; i < nb_tasks; i++) { src = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x].ptr_offset; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { // if (threadIdx.x == 0) printf("cuda_iov_partial_length_start %d", cuda_iov_partial_length_start); - dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes - cuda_iov_partial_length_start; + dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes - cuda_iov_partial_length_start; } - if (threadIdx.x == 0) { - _source_tmp = 
src; - _destination_tmp = destination_base + dst_offset; - uint32_t _nb_bytes = 0; - if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - _nb_bytes = cuda_iov_partial_length_start; - } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { - _nb_bytes = cuda_iov_partial_length_end; - } else { - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_bytes; - } - if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)_source_tmp % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { - alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)_source_tmp % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { - alignment = ALIGNMENT_FLOAT; - } else { - alignment = ALIGNMENT_CHAR; - } - copy_count = _nb_bytes / alignment; + _destination_tmp = destination_base + dst_offset; + + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + _nb_bytes = cuda_iov_partial_length_start; + } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { + _nb_bytes = cuda_iov_partial_length_end; + } else { + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; + } + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)src % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)src % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; } - __syncthreads(); + copy_count = _nb_bytes / alignment; for (j = threadIdx.x; j < copy_count; j += blockDim.x) { /* if (threadIdx.x == 0) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 
5092c7ec806..9b807159f08 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -713,7 +713,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint32_t nb_blocks, thread_per_block, nb_blocks_used; size_t length, buffer_size, length_per_iovec; unsigned char *source, *source_base, *destination_base, *destination; - size_t total_unpacked, unpacked_wo_cache, unpacked_w_cache; + size_t total_unpacked; int32_t complete_flag = 0; uint8_t buffer_isfull = 0; uint8_t free_required = 0; @@ -725,19 +725,21 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ // int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; - uintptr_t *cuda_iov_contig_buf_h_current, *cuda_iov_contig_buf_d_current; - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; + uintptr_t *cuda_iov_contig_buf_h_current = NULL; + uintptr_t *cuda_iov_contig_buf_d_current = NULL; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; size_t iov_len = 0; uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; - ddt_cuda_iov_total_cached_t* cached_cuda_iov; - ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d; - uint32_t *cached_cuda_iov_nb_bytes_list_h, *cuda_iov_nb_bytes_list_h_current; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t 
cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; size_t cuda_iov_partial_length_start = 0; @@ -796,8 +798,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif buffer_size = iov[0].iov_len; total_unpacked = 0; - unpacked_wo_cache = 0; - unpacked_w_cache = 0; cuda_streams->current_stream_id = 0; convertor_flags = pConvertor->flags; // orig_stack_index = pStack->index; @@ -809,8 +809,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; - cuda_iov_is_cached = cached_cuda_iov->cuda_iov_is_cached; DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -825,132 +823,73 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); - iov_start_pos = pConvertor->current_iov_pos; - iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; - if (iov_end_pos > ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - - while (iov_start_pos < iov_end_pos && !buffer_isfull) { - - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - 
cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; - cuda_iov_nb_bytes_list_h_current = cached_cuda_iov_nb_bytes_list_h + pConvertor->current_cuda_iov_pos; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov pos %d\n", pConvertor->current_cuda_iov_pos);); - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); - #if defined (OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - for (i = iov_start_pos; i < iov_end_pos && !buffer_isfull; i++) { - if (pConvertor->current_iov_partial_length > 0) { - iov_len = pConvertor->current_iov_partial_length; - pConvertor->current_iov_partial_length = 0; - } else { - iov_len = ddt_iov[i].iov_len; - } - if (buffer_size >= iov_len) { - length_per_iovec = iov_len; - } else { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - length_per_iovec = buffer_size / orig_alignment * orig_alignment; - buffer_isfull = 1; - pConvertor->current_iov_partial_length = iov_len - length_per_iovec; - pConvertor->current_iov_pos = i; - } - buffer_size -= length_per_iovec; - unpacked_wo_cache += length_per_iovec; - destination = (size_t)(ddt_iov[i].iov_base) + (ddt_iov[i].iov_len - iov_len) + destination_base; + for (i = 0; i < ddt_iov_count; i++) { + length_per_iovec = ddt_iov[i].iov_len; + destination = (size_t)(ddt_iov[i].iov_base) + destination_base; - alignment = ALIGNMENT_DOUBLE; + alignment = ALIGNMENT_DOUBLE; - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack 
description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; - } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; - } + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; + } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], 
cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - } + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; + } - /* handle residue */ - if (residue_desc != 0) { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + /* handle residue */ + if (residue_desc != 0) { + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cuda_iov_nb_bytes_list_h_current[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - source += cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, dst %p, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_contig_buf_h_current[nb_blocks_used], cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); - nb_blocks_used ++; - } - } - - if 
(!buffer_isfull) { - pConvertor->current_iov_pos = i; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + nb_blocks_used ++; } - + } + + cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice); + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cached cuda iov is prepared in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); #endif - - cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, 0, 0); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; - pConvertor->current_cuda_iov_pos += 
nb_blocks_used; - - iov_start_pos = iov_end_pos; - iov_end_pos = iov_start_pos + IOV_PIPELINE_SIZE; - if (iov_end_pos >= ddt_iov_count) { - iov_end_pos = ddt_iov_count; - } - /* finished */ - if (pConvertor->current_iov_pos == ddt_iov_count) { - pConvertor->current_count ++; - opal_ddt_set_cuda_iov_cached(pConvertor, pConvertor->current_cuda_iov_pos); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", pConvertor->current_cuda_iov_pos);); - } - DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov start pos %d end pos %d, submit to CUDA stream %d\n", iov_start_pos, iov_end_pos, cuda_streams->current_stream_id); ); - - } } - total_unpacked += unpacked_wo_cache; - pConvertor->bConverted += unpacked_wo_cache; -#if 1 + + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + /* now we use cached cuda iov */ if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { + opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; @@ -959,14 +898,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif + printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; - unpacked_w_cache += cuda_iov_partial_length_start; + total_unpacked += 
cuda_iov_partial_length_start; buffer_size -= cuda_iov_partial_length_start; pConvertor->current_iov_partial_length = 0; cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; @@ -978,13 +916,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; source += cached_cuda_iov_nb_bytes_list_h[i]; - unpacked_w_cache += cached_cuda_iov_nb_bytes_list_h[i]; + total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; } else { if (buffer_size > 0) { cuda_iov_partial_length_end = buffer_size; - unpacked_w_cache += cuda_iov_partial_length_end; + total_unpacked += cuda_iov_partial_length_end; cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; source += cuda_iov_partial_length_end; pConvertor->current_iov_partial_length = cached_cuda_iov_nb_bytes_list_h[i] - cuda_iov_partial_length_end; @@ -1000,27 +938,16 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - if (pConvertor->current_iov_partial_length > 0) { - pConvertor->current_cuda_iov_pos += nb_blocks_used - 1; - } else { - pConvertor->current_cuda_iov_pos += nb_blocks_used; - } cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 
cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); - opal_cuda_check_error(cuda_err); - iov_pipeline_block_id ++; - iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); } -#endif for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); } - total_unpacked += unpacked_w_cache; - pConvertor->bConverted += unpacked_w_cache; + pConvertor->bConverted += total_unpacked; DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); iov[0].iov_len = total_unpacked; diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index e879e5c0192..d961ef34e4e 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -895,14 +895,14 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk } #if defined (TEST_CHAR) - mat_char = (unsigned char *)ptemp; + /* mat_char = (unsigned char *)ptemp; for (j = 0; j < max_data; j++) { if (mat_char[j] != 'a') { t_error ++; printf("error %d, %c\n", j, mat_char[j]); } } - printf("total error %d\n", t_error); + printf("total error %d\n", t_error);*/ #endif if( done2 == 0 ) { From 6af665858b9cbd5aa964edf9f9e815a3f8e5445b Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 13 Nov 2015 16:48:09 -0500 Subject: [PATCH 174/190] now cuda iov is {nc_disp, c_disp} --- .../cuda/opal_datatype_cuda_internal.cuh | 8 ++-- .../cuda/opal_datatype_pack_cuda_kernel.cu | 22 ++++++----- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 35 +++++++++-------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 35 +++++++++-------- 
.../cuda/opal_datatype_unpack_cuda_wrapper.cu | 39 +++++++++---------- test/datatype/ddt_benchmark.c | 4 +- 6 files changed, 73 insertions(+), 70 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 528f4e17c91..c038b9001f6 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -59,8 +59,8 @@ typedef struct { } ddt_cuda_iov_dist_non_cached_t; typedef struct { - size_t ptr_offset; - uint32_t nb_bytes; + size_t ncontig_disp; + size_t contig_disp; } ddt_cuda_iov_dist_cached_t; typedef struct { @@ -139,9 +139,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base); +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int 
output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 93fb188ddcd..ddfd68b0e4c 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,13 +88,14 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { - uint32_t i, j, _nb_bytes; - size_t src_offset; - unsigned char *dst; + uint32_t i, j; + size_t _nb_bytes; + size_t src_offset, dst_offset; unsigned char *_source_tmp, *_destination_tmp; uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -110,15 +111,16 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; - dst = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - destination_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + 
current_cuda_iov_pos].contig_disp; _source_tmp = source_base + src_offset; + _destination_tmp = destination_base + dst_offset; /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)dst % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)dst % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -128,7 +130,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; - _destination_tmp = dst + j * alignment; + _destination_tmp = destination_base + dst_offset + j * alignment; #if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) if (alignment == ALIGNMENT_DOUBLE) { *((long *)_destination_tmp) = *((long *)_source_tmp); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 1d14c000977..f1ce6dbda7d 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -965,6 +965,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; + size_t destionation_disp = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -1073,17 
+1074,18 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + j * thread_per_block * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + j * thread_per_block * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (count_desc - j*thread_per_block) * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, 
cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } @@ -1091,18 +1093,21 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* handle residue */ if (residue_desc != 0) { /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = source + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + length_per_iovec / alignment * alignment - source_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } } - cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, 
sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1128,8 +1133,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #endif for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)destination; - destination += cached_cuda_iov_nb_bytes_list_h[i]; total_packed += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used++; @@ -1143,9 +1146,9 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); +// cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - 
opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base, destination_base); pConvertor->current_cuda_iov_pos += nb_blocks_used; } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index f98a8c0b2ea..9cf705ae7e3 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,15 +46,17 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; - size_t dst_offset; + size_t dst_offset, src_offset; unsigned char *src; unsigned char *_source_tmp, *_destination_tmp; - uint32_t _nb_bytes; - uint32_t current_cuda_iov_pos = cuda_iov_pos; - + size_t _nb_bytes; + uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + __shared__ uint32_t nb_tasks; uint32_t copy_count; 
uint8_t alignment; @@ -68,24 +70,23 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src = (unsigned char *)cuda_iov_contig_buf_d[blockIdx.x + i * gridDim.x]; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ptr_offset; - if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - // if (threadIdx.x == 0) printf("cuda_iov_partial_length_start %d", cuda_iov_partial_length_start); - dst_offset = dst_offset + cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes - cuda_iov_partial_length_start; - } - _destination_tmp = destination_base + dst_offset; + src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - source_disp - source_partial_disp; + dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + src_offset = cuda_iov_dist[current_cuda_iov_pos].contig_disp - source_disp; + dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; _nb_bytes = cuda_iov_partial_length_start; } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { _nb_bytes = cuda_iov_partial_length_end; - } else { - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].nb_bytes; } - if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)src % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + + _destination_tmp = destination_base + dst_offset; + _source_tmp = source_base + src_offset; + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % 
ALIGNMENT_DOUBLE == 0) { alignment = ALIGNMENT_DOUBLE; - } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)src % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { alignment = ALIGNMENT_FLOAT; } else { alignment = ALIGNMENT_CHAR; @@ -97,7 +98,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); }*/ if (j < copy_count) { - _source_tmp = src + j * alignment; + _source_tmp = source_base + src_offset + j * alignment; _destination_tmp = destination_base + dst_offset + j * alignment; /* if (threadIdx.x == 0) { printf("_src %p, dst %p, alignment %d, blk %d, j %d, count %d\n", _source_tmp, _destination_tmp, alignment, blockIdx.x, j, copy_count); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 9b807159f08..d45e345b315 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -744,6 +744,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint8_t cuda_iov_is_cached = 0; size_t cuda_iov_partial_length_start = 0; size_t cuda_iov_partial_length_end = 0; + size_t source_disp = 0; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -846,17 +847,18 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); for (j = 0; j < nb_blocks_per_description; j++) { - 
cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + j * thread_per_block * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + j * thread_per_block * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; if ( (j+1) * thread_per_block <= count_desc) { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = thread_per_block * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; } else { - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; } #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; } @@ -864,18 +866,20 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (residue_desc != 0) { /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ptr_offset = destination + 
length_per_iovec / alignment * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].nb_bytes = length_per_iovec - length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + length_per_iovec / alignment * alignment - destination_base; + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cuda_iov_dist_h_current[nb_blocks_used].nb_bytes > 0); + assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = cuda_iov_dist_h_current[nb_blocks_used].nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src_offset %ld, nb_bytes %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ptr_offset, cuda_iov_dist_h_current[nb_blocks_used].nb_bytes); ); + source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; } } - - cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice); + /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ + cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; + cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice); opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); #if 
defined(OPAL_DATATYPE_CUDA_TIMING) @@ -907,15 +911,11 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_unpacked += cuda_iov_partial_length_start; buffer_size -= cuda_iov_partial_length_start; pConvertor->current_iov_partial_length = 0; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - source += cuda_iov_partial_length_start; cuda_iov_start_pos ++; nb_blocks_used ++; } for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - source += cached_cuda_iov_nb_bytes_list_h[i]; total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; @@ -923,9 +923,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (buffer_size > 0) { cuda_iov_partial_length_end = buffer_size; total_unpacked += cuda_iov_partial_length_end; - cuda_iov_contig_buf_h_current[nb_blocks_used] = (uintptr_t)source; - source += cuda_iov_partial_length_end; - pConvertor->current_iov_partial_length = cached_cuda_iov_nb_bytes_list_h[i] - cuda_iov_partial_length_end; nb_blocks_used ++; } buffer_size = 0; @@ -940,7 +937,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); + 
opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); } for (i = 0; i < NB_STREAMS; i++) { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index d961ef34e4e..e879e5c0192 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -895,14 +895,14 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk } #if defined (TEST_CHAR) - /* mat_char = (unsigned char *)ptemp; + mat_char = (unsigned char *)ptemp; for (j = 0; j < max_data; j++) { if (mat_char[j] != 'a') { t_error ++; printf("error %d, %c\n", j, mat_char[j]); } } - printf("total error %d\n", t_error);*/ + printf("total error %d\n", t_error); #endif if( done2 == 0 ) { From 63e148e1a50d6d2a844d5bfbc612533e02bbb510 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 13 Nov 2015 18:33:48 -0500 Subject: [PATCH 175/190] clean up kernel, put variables uses multiple times into register --- .../datatype/cuda/opal_datatype_pack_cuda_kernel.cu | 8 +++++--- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 13 +++++++------ .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 1 - 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index ddfd68b0e4c..92a96d1cb2b 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -91,11 +91,12 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, j; - 
size_t _nb_bytes; + uint32_t _nb_bytes; size_t src_offset, dst_offset; unsigned char *_source_tmp, *_destination_tmp; uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -111,9 +112,10 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di __syncthreads(); for (i = 0; i < nb_tasks; i++) { + contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - destination_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; + dst_offset = contig_disp - destination_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; _source_tmp = source_base + src_offset; _destination_tmp = destination_base + dst_offset; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 9cf705ae7e3..f2c337ea682 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -50,12 +50,12 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ { uint32_t i, j; size_t dst_offset, src_offset; - unsigned char *src; unsigned char *_source_tmp, *_destination_tmp; - size_t _nb_bytes; + uint32_t _nb_bytes; uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; - size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - 
cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + size_t contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -70,12 +70,13 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ __syncthreads(); for (i = 0; i < nb_tasks; i++) { - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp - source_disp - source_partial_disp; + contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ + src_offset = contig_disp - source_disp - source_partial_disp; dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; + _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - src_offset = cuda_iov_dist[current_cuda_iov_pos].contig_disp - source_disp; + src_offset = contig_disp - source_disp; dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; _nb_bytes = cuda_iov_partial_length_start; } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index d45e345b315..3d4b8c95ff0 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -935,7 +935,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( 
opal_convertor_ total_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); } From c75393f9a5253f7f0053a6f97189f187935b57ae Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Fri, 13 Nov 2015 22:18:41 -0500 Subject: [PATCH 176/190] cached cuda iov is working for count > 1 another checkpoint now convertor->count > 1 is woring --- opal/datatype/cuda/opal_datatype_cuda.cu | 12 ++++ .../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 27 +++++--- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 43 +++++++------ .../cuda/opal_datatype_unpack_cuda_kernel.cu | 33 +++++++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 63 ++++++++++--------- 6 files changed, 119 insertions(+), 63 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index e3fe580cf76..dcacedca00e 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -357,12 +357,24 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d { int i; size_t iov_size = 0; + size_t ddt_size; + convertor->current_iov_partial_length = 0; + convertor->current_cuda_iov_pos = 0; + if 
(ddt_offset == 0) { + return; + } + opal_datatype_type_size(convertor->pDesc, &ddt_size); + ddt_offset = ddt_offset % ddt_size; for(i = 0; i < cuda_iov_count; i++) { iov_size += cached_cuda_iov_nb_bytes_list_h[i]; if (iov_size > ddt_offset) { convertor->current_iov_partial_length = iov_size - ddt_offset; convertor->current_cuda_iov_pos = i; break; + } else if (iov_size == ddt_offset){ + convertor->current_iov_partial_length = 0; + convertor->current_cuda_iov_pos = i+1; + break; } } } diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index c038b9001f6..84e6ce059b3 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -139,9 +139,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t 
cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); void opal_cuda_output(int output_id, const char *format, ...); diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 92a96d1cb2b..2564fe1393c 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -88,7 +88,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } -__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) { uint32_t i, j; uint32_t _nb_bytes; @@ -97,6 +97,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; __shared__ uint32_t nb_tasks; uint32_t copy_count; @@ -107,15 +110,20 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di if (blockIdx.x < (nb_blocks_used % gridDim.x)) { nb_tasks ++; } - // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); + // printf("cuda_iov_count %d, ddt_extent %d, current_count 
%d\n", cuda_iov_count, ddt_extent, current_count); + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); } __syncthreads(); for (i = 0; i < nb_tasks; i++) { - contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ - src_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; - dst_offset = contig_disp - destination_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + dst_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - destination_disp; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; _source_tmp = source_base + src_offset; _destination_tmp = destination_base + dst_offset; @@ -128,7 +136,12 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di alignment = ALIGNMENT_CHAR; } copy_count = _nb_bytes / alignment; - + /* + if (threadIdx.x == 0 && nb_tasks != 0) { + printf("pack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu 
b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index f1ce6dbda7d..fc9181e902b 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -966,6 +966,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t cached_cuda_iov_count = 0; uint8_t cuda_iov_is_cached = 0; size_t destionation_disp = 0; + opal_datatype_count_t convertor_current_count; + OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -1117,20 +1119,19 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #endif } + /* now we use cached cuda iov */ cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + convertor_current_count = pConvertor->current_count; - /* now we use cached cuda iov */ - if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { - cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; - cuda_iov_end_pos = cached_cuda_iov_count; - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { total_packed += 
cached_cuda_iov_nb_bytes_list_h[i]; @@ -1141,16 +1142,22 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* break; } } + if (!buffer_isfull) { + pConvertor->current_count ++; + cuda_iov_start_pos = 0; + cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; + } + } #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif -// cudaMemcpyAsync(cuda_iov_contig_buf_d_current, cuda_iov_contig_buf_h_current, sizeof(uintptr_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, source_base, destination_base); - pConvertor->current_cuda_iov_pos += nb_blocks_used; - } + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, source_base, destination_base); + 
pConvertor->current_cuda_iov_pos += nb_blocks_used; + pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; for (i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index f2c337ea682..f6ee8e0bfc4 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,7 +46,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } -__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uintptr_t* cuda_iov_contig_buf_d, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; size_t dst_offset, src_offset; @@ -54,8 +54,11 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ uint32_t _nb_bytes; uint32_t current_cuda_iov_pos = cuda_iov_pos; size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; - size_t source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + size_t source_partial_disp = 0; size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; __shared__ uint32_t nb_tasks; uint32_t 
copy_count; @@ -66,17 +69,26 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (blockIdx.x < nb_blocks_used % gridDim.x) { nb_tasks ++; } + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d, ddt_size %d\n", cuda_iov_count, ddt_extent, current_count, ddt_size); } __syncthreads(); + if (cuda_iov_partial_length_start != 0) { + source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + } + for (i = 0; i < nb_tasks; i++) { - contig_disp = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].contig_disp; /* this variable is used multiple times, so put in in register */ - src_offset = contig_disp - source_disp - source_partial_disp; - dst_offset = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos].ncontig_disp; - _nb_bytes = cuda_iov_dist[blockIdx.x + i * gridDim.x + current_cuda_iov_pos + 1].contig_disp - contig_disp; + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp - source_partial_disp; + dst_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { - src_offset = contig_disp - source_disp; + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp; dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; _nb_bytes = cuda_iov_partial_length_start; } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && 
cuda_iov_partial_length_end != 0) { @@ -93,7 +105,12 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ alignment = ALIGNMENT_CHAR; } copy_count = _nb_bytes / alignment; - + /* + if (threadIdx.x == 0 && nb_tasks != 0) { + printf("unpack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ for (j = threadIdx.x; j < copy_count; j += blockDim.x) { /* if (threadIdx.x == 0) { if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 3d4b8c95ff0..604e68b1d89 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -745,6 +745,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ size_t cuda_iov_partial_length_start = 0; size_t cuda_iov_partial_length_end = 0; size_t source_disp = 0; + opal_datatype_count_t convertor_current_count; + OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) TIMER_DATA_TYPE start, end, start_total, end_total; @@ -888,32 +890,31 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cached cuda iov is prepared in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); #endif } - - cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; /* now we use cached cuda iov */ - if( pConvertor->bConverted != pConvertor->local_size && !buffer_isfull) { - opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); - cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; - cuda_iov_end_pos = cached_cuda_iov_count; - nb_blocks_used = 0; - 
cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_contig_buf_h_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_h; - cuda_iov_contig_buf_d_current = cuda_iov_pipeline_block->cuda_iov_contig_buf_d; - cuda_iov_dist_d_current = cached_cuda_iov_dist_d + pConvertor->current_cuda_iov_pos; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + convertor_current_count = pConvertor->current_count; + + printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); + if (pConvertor->current_iov_partial_length > 0) { + cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; + total_unpacked += cuda_iov_partial_length_start; + buffer_size -= cuda_iov_partial_length_start; + pConvertor->current_iov_partial_length = 0; + cuda_iov_start_pos ++; + nb_blocks_used ++; + } + #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME(start); #endif - printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); - if (pConvertor->current_iov_partial_length > 0) { - cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; - total_unpacked += cuda_iov_partial_length_start; - buffer_size -= cuda_iov_partial_length_start; - pConvertor->current_iov_partial_length = 0; - cuda_iov_start_pos ++; - nb_blocks_used ++; - } + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = 
cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; @@ -930,14 +931,20 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ break; } } + if (!buffer_isfull) { + pConvertor->current_count ++; + cuda_iov_start_pos = 0; + cuda_iov_end_pos = cached_cuda_iov_count; + } + } #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); - opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cuda_iov_contig_buf_d_current, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - } + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); for 
(i = 0; i < NB_STREAMS; i++) { cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); From 11d4a5bc811df32ce5ae718537e898eed892bff2 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 16:02:11 -0500 Subject: [PATCH 177/190] move the cuda iov caching into a seperate function --- opal/datatype/cuda/opal_datatype_cuda.cu | 79 +++++++++++++ opal/datatype/cuda/opal_datatype_cuda.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 110 +++--------------- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 107 ++--------------- 4 files changed, 106 insertions(+), 194 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index dcacedca00e..7ba18f297fa 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -327,6 +327,85 @@ void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } +/* cached_cuda_iov_d is not ready until explicitlt sync with cuda stream 0 +*/ +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t thread_per_block, nb_blocks_used; + size_t length_per_iovec; + uint8_t alignment; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; + ddt_cuda_iov_dist_cached_t *cuda_iov_h = NULL; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t ncontig_disp_base; + size_t contig_disp = 0; + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; + cuda_iov_h = 
cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + + for (i = 0; i < ddt_iov_count; i++) { + length_per_iovec = ddt_iov[i].iov_len; + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; + } else { + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + 
cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + nb_blocks_used ++; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + } + } + /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ + cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cudaMemcpyAsync(cached_cuda_iov_d, cuda_iov_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + *cuda_iov_count = nb_blocks_used; + return OPAL_SUCCESS; +} + void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 96a045f66cd..4a71ab37d63 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -131,6 +131,8 @@ void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t 
*cuda_iov_count); + } -#endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ +#endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index fc9181e902b..ddc2ec08a89 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -932,40 +932,21 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* uint32_t* out_size, size_t* max_data ) { - uint32_t i, j; - uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t length, buffer_size, length_per_iovec; - unsigned char *destination, *destination_base, *source_base, *source; + size_t buffer_size; + unsigned char *destination, *destination_base, *source_base; size_t total_packed; - int32_t complete_flag = 0; uint8_t buffer_isfull = 0, transfer_required, free_required; - uint32_t convertor_flags; -// dt_elem_desc_t* description; -// dt_elem_desc_t* pElem; -// dt_stack_t* pStack; - uint8_t alignment, orig_alignment; -// int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; - uintptr_t *cuda_iov_contig_buf_h_current = NULL; - uintptr_t *cuda_iov_contig_buf_d_current = NULL; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; - const struct iovec *ddt_iov = NULL; - uint32_t ddt_iov_count = 0; - size_t iov_len = 0; - uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; + uint32_t cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = 
NULL; uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; - uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; - uint8_t cuda_iov_is_cached = 0; - size_t destionation_disp = 0; opal_datatype_count_t convertor_current_count; OPAL_PTRDIFF_TYPE ddt_extent; @@ -973,14 +954,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* TIMER_DATA_TYPE start, end, start_total, end_total; long total_time, move_time; #endif - - /*description = pConvertor->use_desc->desc; - pStack = pConvertor->pStack + pConvertor->stack_pos; - pElem = &(description[pStack->index]); - printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); - */ - -// assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { @@ -1021,7 +994,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* total_packed = 0; cuda_streams->current_stream_id = 0; - // orig_stack_index = pStack->index; destination_base = destination; #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1032,14 +1004,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* GET_TIME(start); #endif - opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); - assert(ddt_iov != NULL); opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1053,69 +1022,20 @@ int32_t 
opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); - #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - - for (i = 0; i < ddt_iov_count; i++) { - length_per_iovec = ddt_iov[i].iov_len; - source = (size_t)(ddt_iov[i].iov_base) + source_base; - - /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + j * thread_per_block * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; - if ( (j+1) * thread_per_block <= count_desc) { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; - } else { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, 
ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } - - /* handle residue */ - if (residue_desc != 0) { - /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = source + length_per_iovec / alignment * alignment - source_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - destionation_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); - } + if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); + } else { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Pack cache cuda iov is failed\n");); + return OPAL_ERROR; } - /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = destionation_disp; - cudaMemcpyAsync(cached_cuda_iov_dist_d, 
cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); - opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack cached cuda iov is prepared in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack cuda iov is cached in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used); ); #endif } @@ -1124,7 +1044,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; convertor_current_count = pConvertor->current_count; @@ -1154,14 +1074,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); 
opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, source_base, destination_base); pConvertor->current_cuda_iov_pos += nb_blocks_used; pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); pConvertor->bConverted += total_packed; DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 604e68b1d89..8b727e9252d 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -708,43 +708,24 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ uint32_t* out_size, size_t* max_data ) { - uint32_t i, j; - uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t length, buffer_size, length_per_iovec; + size_t buffer_size; unsigned char *source, *source_base, *destination_base, *destination; size_t total_unpacked; - int32_t complete_flag = 0; uint8_t buffer_isfull = 0; uint8_t free_required = 0; - uint32_t convertor_flags; -// dt_elem_desc_t* description; -// dt_elem_desc_t* pElem; -// dt_stack_t* pStack; - uint8_t alignment, orig_alignment; -// int32_t orig_stack_index; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current = NULL; - ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current = NULL; - uintptr_t *cuda_iov_contig_buf_h_current = NULL; - uintptr_t *cuda_iov_contig_buf_d_current = NULL; 
ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; - const struct iovec *ddt_iov = NULL; - uint32_t ddt_iov_count = 0; - size_t iov_len = 0; - uint32_t iov_start_pos, iov_end_pos, cuda_iov_start_pos, cuda_iov_end_pos; + uint32_t cuda_iov_start_pos, cuda_iov_end_pos; ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; - uint32_t *cuda_iov_nb_bytes_list_h_current = NULL; uint32_t cached_cuda_iov_count = 0; - uint8_t cuda_iov_is_cached = 0; size_t cuda_iov_partial_length_start = 0; size_t cuda_iov_partial_length_end = 0; - size_t source_disp = 0; opal_datatype_count_t convertor_current_count; OPAL_PTRDIFF_TYPE ddt_extent; @@ -757,12 +738,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ GET_TIME(start_total); #endif -/* description = pConvertor->use_desc->desc; - pStack = pConvertor->pStack + pConvertor->stack_pos; - pElem = &(description[pStack->index]); - printf("size elem %d, size %lu\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); -*/ - #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -791,9 +766,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ move_time = ELAPSED_TIME( start, end ); DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); #endif - -// cuda_err = cudaEventRecord(current_cuda_device->memcpy_event, current_cuda_device->cuda_streams->opal_cuda_stream[0]); -// opal_cuda_check_error(cuda_err); #if defined (OPAL_DATATYPE_CUDA_TIMING) @@ -802,17 +774,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ buffer_size = iov[0].iov_len; total_unpacked = 0; cuda_streams->current_stream_id = 0; - convertor_flags = pConvertor->flags; -// 
orig_stack_index = pStack->index; source_base = source; - opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); - assert(ddt_iov != NULL); opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; assert(cached_cuda_iov_dist_d != NULL); cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; assert(cached_cuda_iov_nb_bytes_list_h != NULL); - DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack iov count %d, submit to CUDA stream %d\n", ddt_iov_count, cuda_streams->current_stream_id); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -826,68 +793,17 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ /* cuda iov is not cached, start to cache iov */ if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { - nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; - cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; - cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); - opal_cuda_check_error(cuda_err); - - #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - - for (i = 0; i < ddt_iov_count; i++) { - length_per_iovec = ddt_iov[i].iov_len; - destination = (size_t)(ddt_iov[i].iov_base) + destination_base; - - alignment = ALIGNMENT_DOUBLE; - - count_desc = length_per_iovec / alignment; - residue_desc = length_per_iovec % alignment; - nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; - DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); - for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + j * thread_per_block * alignment - destination_base; - 
cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; - if ( (j+1) * thread_per_block <= count_desc) { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; - } else { - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (thread_per_block - ((j+1)*thread_per_block - count_desc)) * alignment; - } -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - } - - /* handle residue */ - if (residue_desc != 0) { - /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ - orig_alignment = ALIGNMENT_CHAR; - cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp = destination + length_per_iovec / alignment * alignment - destination_base; - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; - cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; -#if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); -#endif /* OPAL_DATATYPE_CUDA_DEBUG */ - source_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); - nb_blocks_used ++; - } + if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + 
opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); } - /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ - cuda_iov_dist_h_current[nb_blocks_used].contig_disp = source_disp; - cudaMemcpy(cached_cuda_iov_dist_d, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice); - opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cached cuda iov is prepared in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cuda iov is cached in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); #endif } @@ -897,11 +813,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; - cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; convertor_current_count = pConvertor->current_count; - printf("[00000] partial_length %ld, pos %d\n", pConvertor->current_iov_partial_length, pConvertor->current_cuda_iov_pos); if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; total_unpacked += cuda_iov_partial_length_start; @@ -943,12 +858,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached 
cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); - DT_CUDA_DEBUG ( opal_cuda_output(2, "kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); pConvertor->bConverted += total_unpacked; DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); From 1e29fc0b4e84becac0a36fca459aca45776ca2cb Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 16:30:27 -0500 Subject: [PATCH 178/190] these two variables are useless now --- opal/datatype/cuda/opal_datatype_cuda.cu | 4 ---- opal/datatype/cuda/opal_datatype_cuda_internal.cuh | 2 -- 2 files changed, 6 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 7ba18f297fa..298c5b9199b 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -224,8 +224,6 @@ int32_t opal_ddt_cuda_kernel_init(void) cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), 
sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_h)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_contig_buf_d)), sizeof(uintptr_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); @@ -266,8 +264,6 @@ int32_t opal_ddt_cuda_kernel_fini(void) cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_contig_buf_h); - cudaFree(cuda_iov_pipeline_block->cuda_iov_contig_buf_d); cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 84e6ce059b3..38720b098fa 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -74,8 +74,6 @@ typedef struct { ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_h; ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_d; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; - uintptr_t *cuda_iov_contig_buf_h; - uintptr_t *cuda_iov_contig_buf_d; cudaStream_t *cuda_stream; int32_t cuda_stream_id; cudaEvent_t cuda_event; From 1bac78c1a3a39070843a33fa5f7208c1f0c121fa Mon Sep 17 00:00:00 2001 From: eddy16112 Date: 
Mon, 16 Nov 2015 16:49:55 -0500 Subject: [PATCH 179/190] fix a bug for ib, current count of convertor should be set in set_cuda_iov_position --- opal/datatype/cuda/opal_datatype_cuda.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 298c5b9199b..c49b7d34a26 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -435,10 +435,12 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d size_t ddt_size; convertor->current_iov_partial_length = 0; convertor->current_cuda_iov_pos = 0; + convertor->current_count = 0; if (ddt_offset == 0) { return; } opal_datatype_type_size(convertor->pDesc, &ddt_size); + convertor->current_count = ddt_offset / ddt_size; ddt_offset = ddt_offset % ddt_size; for(i = 0; i < cuda_iov_count; i++) { iov_size += cached_cuda_iov_nb_bytes_list_h[i]; From 686c90ec13e5beda94366b8d1a94b5e30ef4c7e6 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Mon, 16 Nov 2015 21:00:09 -0500 Subject: [PATCH 180/190] cleanup, move cudamalloc into cache cuda iov --- opal/datatype/cuda/opal_datatype_cuda.cu | 61 +++++++++++++------ opal/datatype/cuda/opal_datatype_cuda.cuh | 2 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 14 ++--- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 13 ++-- 4 files changed, 56 insertions(+), 34 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index c49b7d34a26..db75e52e739 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -283,15 +283,13 @@ void* opal_ddt_cached_cuda_iov_init(uint32_t size) { #if OPAL_DATATYPE_CUDA_IOV_CACHE ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)malloc(sizeof(ddt_cuda_iov_total_cached_t)); - ddt_cuda_iov_dist_cached_t *tmp_cuda_iov_d = NULL; - cudaMalloc((void **)(&tmp_cuda_iov_d), sizeof(ddt_cuda_iov_dist_cached_t) 
* size); uint32_t *tmp_nb_bytes = (uint32_t *)malloc(sizeof(uint32_t) * size); - if (tmp != NULL && tmp_cuda_iov_d != NULL && tmp_nb_bytes != NULL) { - tmp->cuda_iov_dist_d = tmp_cuda_iov_d; + if (tmp != NULL && tmp_nb_bytes != NULL) { + tmp->cuda_iov_dist_d = NULL; tmp->cuda_iov_count = size; tmp->cuda_iov_is_cached = 0; tmp->nb_bytes_h = tmp_nb_bytes; - DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, cuda_iov_d %p, nb_bytes_h %p, size %d.\n", tmp, tmp_cuda_iov_d, tmp_nb_bytes, size); ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, nb_bytes_h %p, size %d.\n", tmp, tmp_nb_bytes, size); ); return tmp; } else { DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist_cached for ddt is failed.\n"); ); @@ -325,7 +323,7 @@ void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) /* cached_cuda_iov_d is not ready until explicitlt sync with cuda stream 0 */ -int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count) +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count) { uint32_t i, j; uint32_t count_desc, nb_blocks_per_description, residue_desc; @@ -333,12 +331,17 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ size_t length_per_iovec; uint8_t alignment; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; - ddt_cuda_iov_dist_cached_t *cuda_iov_h = NULL; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t *cached_cuda_iov_dist_d = NULL; + ddt_cuda_iov_dist_cached_t *cuda_iov_dist_h = NULL; cudaStream_t *cuda_stream_iov = NULL; const struct iovec *ddt_iov = NULL; uint32_t ddt_iov_count = 0; size_t ncontig_disp_base; size_t contig_disp = 0; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + + opal_datatype_t *datatype = 
(opal_datatype_t *)pConvertor->pDesc; opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); if (ddt_iov == NULL) { @@ -346,10 +349,18 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ return OPAL_ERROR; } + + cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)opal_ddt_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); + if (cached_cuda_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not init cuda iov\n");); + return OPAL_ERROR; + } + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; - cuda_iov_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_iov_dist_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + thread_per_block = CUDA_WARP_SIZE * 5; for (i = 0; i < ddt_iov_count; i++) { length_per_iovec = ddt_iov[i].iov_len; @@ -363,8 +374,8 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); for (j = 0; j < nb_blocks_per_description; j++) { - cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; - cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; if ( (j+1) * thread_per_block <= count_desc) { cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; } else { @@ -374,21 +385,21 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 
0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); } /* handle residue */ if (residue_desc != 0) { - cuda_iov_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; - cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; + cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; - DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_h[nb_blocks_used].ncontig_disp, cuda_iov_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); 
nb_blocks_used ++; #if defined (OPAL_DATATYPE_CUDA_DEBUG) assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); @@ -396,8 +407,15 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_ } } /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ - cuda_iov_h[nb_blocks_used].contig_disp = contig_disp; - cudaMemcpyAsync(cached_cuda_iov_d, cuda_iov_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; + cudaMalloc((void **)(&cached_cuda_iov_dist_d), sizeof(ddt_cuda_iov_dist_cached_t) * (nb_blocks_used+1)); + if (cached_cuda_iov_dist_d == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not malloc cuda iov in GPU\n");); + return OPAL_ERROR; + } + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + cached_cuda_iov->cuda_iov_dist_d = cached_cuda_iov_dist_d; + datatype->cached_cuda_iov = cached_cuda_iov; *cuda_iov_count = nb_blocks_used; return OPAL_SUCCESS; } @@ -406,9 +424,10 @@ void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_i { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; if (datatype->cached_cuda_iov == NULL) { - datatype->cached_cuda_iov = opal_ddt_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); - } - *cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + *cached_cuda_iov = NULL; + } else { + *cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + } } void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) @@ -423,7 +442,9 @@ void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t c uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; 
- assert(datatype->cached_cuda_iov != NULL); + if (datatype->cached_cuda_iov == NULL) { + return 0; + } ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; return tmp->cuda_iov_is_cached; } diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index 4a71ab37d63..8ad9b3ec658 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -131,7 +131,7 @@ void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); -int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, ddt_cuda_iov_dist_cached_t *cached_cuda_iov_d, uint32_t *cached_cuda_iov_nb_bytes_list_h, uint32_t *cuda_iov_count); +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count); } diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index ddc2ec08a89..c98d540e54e 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -1003,12 +1003,6 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - - opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); - cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; - assert(cached_cuda_iov_dist_d != NULL); - cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; - assert(cached_cuda_iov_nb_bytes_list_h != NULL); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1025,7 +1019,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - if 
(opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + if (opal_ddt_cache_cuda_iov(pConvertor, &nb_blocks_used) == OPAL_SUCCESS) { opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); } else { @@ -1040,6 +1034,12 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* } /* now we use cached cuda iov */ + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; cuda_iov_end_pos = cached_cuda_iov_count; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 8b727e9252d..6cda6f08cc4 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -775,11 +775,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ total_unpacked = 0; cuda_streams->current_stream_id = 0; source_base = source; - opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); - cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; - assert(cached_cuda_iov_dist_d != NULL); - cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; - assert(cached_cuda_iov_nb_bytes_list_h != NULL); #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -796,7 +791,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - if (opal_ddt_cache_cuda_iov(pConvertor, cached_cuda_iov_dist_d, 
cached_cuda_iov_nb_bytes_list_h, &nb_blocks_used) == OPAL_SUCCESS) { + if (opal_ddt_cache_cuda_iov(pConvertor, &nb_blocks_used) == OPAL_SUCCESS) { opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); } @@ -808,6 +803,12 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ } /* now we use cached cuda iov */ + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; From 85dad6c87ec26158d889ec9932588c5c7d3a1b40 Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 17 Nov 2015 02:53:38 -0500 Subject: [PATCH 181/190] rearrange varibles --- opal/datatype/cuda/opal_datatype_cuda.cu | 2 +- opal/datatype/opal_datatype.h | 3 --- opal/datatype/opal_datatype_create.c | 2 +- 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index db75e52e739..b8836c902a4 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -415,7 +415,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov } cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); cached_cuda_iov->cuda_iov_dist_d = cached_cuda_iov_dist_d; - datatype->cached_cuda_iov = cached_cuda_iov; + datatype->cached_cuda_iov = (unsigned char*)cached_cuda_iov; *cuda_iov_count = 
nb_blocks_used; return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index 1337f2cc57c..a3a6898dd89 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -131,9 +131,6 @@ struct opal_datatype_t { int iov_count; size_t max_data; /* size: 416, cachelines: 7, members: 18 */ -#if OPAL_CUDA_SUPPORT - void * cached_cuda_iov; -#endif /* OPAL_CUDA_SUPPORT */ /* last cacheline: 32 bytes */ struct iovec* cached_iovec; diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index 44c0e3020b6..e57a7d6c668 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -102,7 +102,7 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) #if OPAL_CUDA_SUPPORT /* free cuda iov */ if (opal_datatype_cuda_kernel_support == 1 && datatype->cached_cuda_iov != NULL) { - opal_cached_cuda_iov_fini(datatype->cached_cuda_iov); + opal_cached_cuda_iov_fini((void*)datatype->cached_cuda_iov); datatype->cached_cuda_iov = NULL; } #endif /* OPAL_CUDA_SUPPORT */ From 4c6c0e4a9ec6d5772ef2a4620b508bcfac2d93aa Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 17 Nov 2015 18:13:00 -0500 Subject: [PATCH 182/190] if cuda_iov is not big enough, use realloc. 
However, cudaMallocHost does not work with realloc, so use malloc instead --- opal/datatype/cuda/opal_datatype_cuda.cu | 35 +++++++++++++++++++++--- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index b8836c902a4..7c1d50bcfb8 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -223,7 +223,10 @@ int32_t opal_ddt_cuda_kernel_init(void) cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + if (j == 0) { + // cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + } cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); @@ -263,7 +266,8 @@ int32_t opal_ddt_cuda_kernel_fini(void) if (cuda_iov_pipeline_block != NULL) { cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + 
free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; @@ -321,6 +325,22 @@ void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) #endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ } +static inline int32_t opal_ddt_cached_cuda_iov_isfull(ddt_cuda_iov_total_cached_t *cached_cuda_iov, ddt_cuda_iov_dist_cached_t **cuda_iov_dist_h, uint32_t nb_blocks_used) +{ + if (nb_blocks_used < cached_cuda_iov->cuda_iov_count) { + return 0; + } else { +realloc_cuda_iov: + cached_cuda_iov->nb_bytes_h = (uint32_t *)realloc(cached_cuda_iov->nb_bytes_h, sizeof(uint32_t)*cached_cuda_iov->cuda_iov_count*2); + assert(cached_cuda_iov->nb_bytes_h != NULL); + cached_cuda_iov->cuda_iov_count *= 2; + if (nb_blocks_used >= cached_cuda_iov->cuda_iov_count) { + goto realloc_cuda_iov; + } + return 1; + } +} + /* cached_cuda_iov_d is not ready until explicitlt sync with cuda stream 0 */ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count) @@ -373,6 +393,13 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov residue_desc = length_per_iovec % alignment; nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + if (opal_ddt_cached_cuda_iov_isfull(cached_cuda_iov, &(cuda_iov_pipeline_block->cuda_iov_dist_cached_h), nb_blocks_used + nb_blocks_per_description + 1)) { + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + cuda_iov_dist_h = (ddt_cuda_iov_dist_cached_t *)realloc(cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*cached_cuda_iov->cuda_iov_count); + assert(cuda_iov_dist_h != NULL); + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = cuda_iov_dist_h; + } 
+ for (j = 0; j < nb_blocks_per_description; j++) { cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; @@ -387,7 +414,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + // assert (nb_blocks_used < NUM_CUDA_IOV_PER_DDT); } /* handle residue */ @@ -402,7 +429,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); nb_blocks_used ++; #if defined (OPAL_DATATYPE_CUDA_DEBUG) - assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + //assert (nb_blocks_used < NUM_CUDA_IOV_PER_DDT); #endif /* OPAL_DATATYPE_CUDA_DEBUG */ } } From 2120eddd0fa5395c26b89202b229d3445cb5167f Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Wed, 18 Nov 2015 15:26:31 -0500 Subject: [PATCH 183/190] make sure check pointer is not NULL before free it --- opal/datatype/cuda/opal_datatype_cuda.cu | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 7c1d50bcfb8..0e6a90b6582 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -226,6 +226,8 @@ 
int32_t opal_ddt_cuda_kernel_init(void) if (j == 0) { // cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + } else { + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; } cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); cuda_iov_pipeline_block->cuda_stream_id = 0; @@ -264,10 +266,19 @@ int32_t opal_ddt_cuda_kernel_fini(void) cudaStreamDestroy(cuda_devices[i].cuda_streams->opal_cuda_stream[j]); cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block[j]; if (cuda_iov_pipeline_block != NULL) { - cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); - cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); - //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); - free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h != NULL) { + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); + cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h = NULL; + } + if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d != NULL) { + cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); + cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d = NULL; + } + if (cuda_iov_pipeline_block->cuda_iov_dist_cached_h != NULL) { + //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; + } cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; cuda_iov_pipeline_block->cuda_stream_id = -1; From 98fc62cff1ca5a08d5e8e48ac678eddcc7fc5b5c Mon Sep 17 00:00:00 2001 From: eddy16112 Date: Tue, 24 Nov 2015 20:18:17 -0500 Subject: [PATCH 184/190] rewrite non 
cached iov, make it unified with cached iov checkpoint, rewrite non-cached version fix for non cached iov fix the non cached iov, set position should be put at first move ddt iov to cuda iov into a function merge iov cached and non-cached for non cached iov, if there is no enough cuda iov space, break --- opal/datatype/cuda/opal_datatype_cuda.cu | 113 ++++++- opal/datatype/cuda/opal_datatype_cuda.cuh | 28 +- .../cuda/opal_datatype_cuda_internal.cuh | 4 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 297 ++++++++++++------ .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 191 ++++++----- 5 files changed, 430 insertions(+), 203 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 0e6a90b6582..2c76a327197 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -221,10 +221,9 @@ int32_t opal_ddt_cuda_kernel_init(void) for (j = 0; j < NB_STREAMS; j++) { cudaStreamCreate(&(cuda_streams->opal_cuda_stream[j])); cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); - cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); - cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_non_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); if (j == 0) { - // cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); 
cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); } else { cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; @@ -275,7 +274,6 @@ int32_t opal_ddt_cuda_kernel_fini(void) cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d = NULL; } if (cuda_iov_pipeline_block->cuda_iov_dist_cached_h != NULL) { - //cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; } @@ -458,6 +456,85 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov return OPAL_SUCCESS; } +uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, size_t *buffer_size, uint32_t *nb_blocks_used, size_t *total_converted, size_t *contig_disp_out, uint32_t *current_ddt_iov_pos) +{ + size_t ncontig_disp_base; + size_t contig_disp = 0; + size_t current_cuda_iov_length = 0; + uint8_t buffer_isfull = 0; + uint8_t alignment; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t thread_per_block; + size_t length_per_iovec; + uint32_t i, j; + + thread_per_block = CUDA_WARP_SIZE * 5; + + for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base) + ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; + length_per_iovec = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + length_per_iovec = ddt_iov[i].iov_len; + } + if (*buffer_size < length_per_iovec) { + pConvertor->current_iov_pos = i; + pConvertor->current_iov_partial_length = length_per_iovec - *buffer_size; + length_per_iovec = 
*buffer_size; + buffer_isfull = 1; + } + *buffer_size -= length_per_iovec; + *total_converted += length_per_iovec; + + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + if ((*nb_blocks_used + nb_blocks_per_description + 1) > (CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK)) { + break; + } + DT_CUDA_DEBUG ( opal_cuda_output(10, "DDT IOV to CUDA IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + current_cuda_iov_length = thread_per_block * alignment; + } else { + current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "DDT IOV to CUDA IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[*nb_blocks_used].contig_disp, current_cuda_iov_length); ); + (*nb_blocks_used) ++; + assert (*nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; +#if defined 
(OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "DDT IOV to CUDA IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[*nb_blocks_used].contig_disp, current_cuda_iov_length); ); + (*nb_blocks_used) ++; + assert (*nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + *contig_disp_out = contig_disp; + *current_ddt_iov_pos = i; + return buffer_isfull; + +} + void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov) { opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; @@ -515,6 +592,34 @@ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t d } } +void opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count) +{ + int i; + size_t iov_size = 0; + size_t ddt_size; + convertor->current_iov_partial_length = 0; + convertor->current_iov_pos = 0; + convertor->current_count = 0; + if (ddt_offset == 0) { + return; + } + opal_datatype_type_size(convertor->pDesc, &ddt_size); + convertor->current_count = ddt_offset / ddt_size; + ddt_offset = ddt_offset % ddt_size; + for(i = 0; i < ddt_iov_count; i++) { + iov_size += ddt_iov[i].iov_len; + if (iov_size > ddt_offset) { + convertor->current_iov_partial_length = iov_size - ddt_offset; + convertor->current_iov_pos = i; + break; + } else if (iov_size == ddt_offset){ + convertor->current_iov_partial_length = 0; + convertor->current_iov_pos = i+1; + break; + } + } +} + void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) { #if 0 diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh 
b/opal/datatype/cuda/opal_datatype_cuda.cuh index 8ad9b3ec658..c33ff606bd9 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -29,25 +29,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon uint32_t* out_size, size_t* max_data ); -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); - -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked); -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed); -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ); +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked); void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, uint32_t* COUNT, @@ -131,8 +119,12 @@ void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_ void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); +void 
opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count); + int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count); +uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, size_t *buffer_size, uint32_t *nb_blocks_used, size_t *total_packed, size_t *contig_disp_out, uint32_t *current_ddt_iov_pos); + } -#endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ \ No newline at end of file +#endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 38720b098fa..72edcb3d8a3 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -71,8 +71,8 @@ typedef struct { } ddt_cuda_iov_total_cached_t; typedef struct { - ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_h; - ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_non_cached_d; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_h; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_d; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; cudaStream_t *cuda_stream; int32_t cuda_stream_id; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index c98d540e54e..0137601bf70 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -664,9 +664,102 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConve uint32_t* out_size, size_t* max_data ) { - return opal_ddt_generic_simple_pack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); + size_t buffer_size; + unsigned char *destination; + size_t total_packed; + 
uint8_t transfer_required, free_required; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + + // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (iov[0].iov_len == 0) { + buffer_size = DT_CUDA_BUFFER_SIZE; + } else { + buffer_size = iov[0].iov_len; + } + + if (iov[0].iov_base == NULL) { + iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char *)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; + free_required = 1; + } else { + destination = (unsigned char *)iov[0].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + } + transfer_required = 1; + free_required = 1; + destination = pConvertor->gpu_buffer_ptr; + } + } + + total_packed = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + + /* start pack */ + if (cuda_iov_cache_enabled) { + opal_ddt_generic_simple_pack_function_cuda_iov_cached(pConvertor, destination, buffer_size, &total_packed); + } else { + opal_ddt_generic_simple_pack_function_cuda_iov_non_cached(pConvertor, destination, buffer_size, &total_packed); + } + + pConvertor->bConverted += total_packed; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, 
cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); +#endif + + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; } +#if 0 + int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -927,17 +1020,111 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto return 0; } -int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +#endif + + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed) { uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size; - unsigned char *destination, *destination_base, *source_base; - size_t total_packed; - uint8_t buffer_isfull = 0, transfer_required, free_required; + unsigned char *destination_base, *source_base; + uint8_t buffer_isfull = 0; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + 
ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t contig_disp = 0; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos, current_ddt_iov_pos; + OPAL_PTRDIFF_TYPE ddt_extent; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end; + long total_time; +#endif + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV non cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + cuda_streams->current_stream_id = 0; + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + source_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + destination_base = destination; + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + nb_blocks_used = 0; + ddt_iov_start_pos = pConvertor->current_iov_pos; + ddt_iov_end_pos = ddt_iov_start_pos + IOV_PIPELINE_SIZE; + if (ddt_iov_end_pos > ddt_iov_count) { + ddt_iov_end_pos = ddt_iov_count; + } + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); 
+#endif + + buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, total_packed, &contig_disp, ¤t_ddt_iov_pos); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, source_base, destination_base); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + destination_base += contig_disp; + + if (!buffer_isfull) { + pConvertor->current_iov_pos = current_ddt_iov_pos; + if (current_ddt_iov_pos == ddt_iov_count) { + pConvertor->current_count ++; + pConvertor->current_iov_pos = 0; + source_base += ddt_extent; + } + } + + } + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + return OPAL_SUCCESS; +} + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed) +{ + uint32_t i; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + unsigned char *destination_base, *source_base; + uint8_t buffer_isfull = 0; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; @@ -951,65 
+1138,14 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time, move_time; + TIMER_DATA_TYPE start, end; + long total_time; #endif - - // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); - if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { - if (iov[0].iov_len == 0) { - buffer_size = DT_CUDA_BUFFER_SIZE; - } else { - buffer_size = iov[0].iov_len; - } - - if (iov[0].iov_base == NULL) { - iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); - destination = (unsigned char *)iov[0].iov_base; - pConvertor->gpu_buffer_ptr = destination; - free_required = 1; - } else { - destination = (unsigned char *)iov[0].iov_base; - free_required = 0; - } - transfer_required = 0; - } else { - buffer_size = iov[0].iov_len; - if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { - pConvertor->gpu_buffer_ptr = NULL; - transfer_required = 0; - free_required = 0; - cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); - } else { - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); - } - transfer_required = 1; - free_required = 1; - destination = pConvertor->gpu_buffer_ptr; - } - } DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - total_packed = 0; cuda_streams->current_stream_id = 0; destination_base = destination; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); -#endif - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); -#endif - 
thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; source_base = (unsigned char*)pConvertor->pBaseBuf; @@ -1054,7 +1190,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - total_packed += cached_cuda_iov_nb_bytes_list_h[i]; + *total_packed += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used++; } else { @@ -1080,41 +1216,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); - - pConvertor->bConverted += total_packed; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); -#endif - if (transfer_required) { - cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); - } -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - move_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); -#endif - - iov[0].iov_len = total_packed; - *max_data = total_packed; - *out_size = 1; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end_total ); - total_time = ELAPSED_TIME( start_total, end_total ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); -#endif - - if( pConvertor->bConverted == pConvertor->local_size ) { - pConvertor->flags |= CONVERTOR_COMPLETED; - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - 
opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); - pConvertor->gpu_buffer_ptr = NULL; - } - return 1; - } - return 0; + return OPAL_SUCCESS; } void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 6cda6f08cc4..bb54dfeeb0a 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -376,7 +376,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon uint32_t* out_size, size_t* max_data ) { -<<<<<<< HEAD size_t buffer_size; unsigned char *source; size_t total_unpacked; @@ -454,11 +453,6 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pCon } #if 0 -======= - return opal_ddt_generic_simple_unpack_function_cuda_iov_cached(pConvertor, iov, out_size, max_data); -} - ->>>>>>> cached iov is working for count = 1 int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, @@ -703,85 +697,139 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver return 0; } -int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, - struct iovec* iov, - uint32_t* out_size, - size_t* max_data ) +#endif + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked) { uint32_t i; uint32_t nb_blocks, thread_per_block, nb_blocks_used; - size_t buffer_size; - unsigned char *source, *source_base, *destination_base, *destination; - size_t total_unpacked; + unsigned char *source_base, *destination_base; uint8_t buffer_isfull = 0; - uint8_t free_required = 0; cudaError_t cuda_err; ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; - ddt_cuda_iov_pipeline_block_t 
*cuda_iov_pipeline_block = NULL; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; cudaStream_t *cuda_stream_iov = NULL; - uint32_t cuda_iov_start_pos, cuda_iov_end_pos; - ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; - ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; - uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; - uint32_t cached_cuda_iov_count = 0; - size_t cuda_iov_partial_length_start = 0; - size_t cuda_iov_partial_length_end = 0; - opal_datatype_count_t convertor_current_count; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t contig_disp = 0; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos, current_ddt_iov_pos; OPAL_PTRDIFF_TYPE ddt_extent; #if defined(OPAL_DATATYPE_CUDA_TIMING) - TIMER_DATA_TYPE start, end, start_total, end_total; - long total_time, move_time; + TIMER_DATA_TYPE start, end; + long total_time; #endif + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV non cached, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, buffer_size); ); + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + cuda_streams->current_stream_id = 0; + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + source_base = source; + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + opal_ddt_set_ddt_iov_position(pConvertor, pConvertor->bConverted, ddt_iov, ddt_iov_count); + destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start_total); + while( pConvertor->current_count < pConvertor->count && 
!buffer_isfull) { + + nb_blocks_used = 0; + ddt_iov_start_pos = pConvertor->current_iov_pos; + ddt_iov_end_pos = ddt_iov_start_pos + IOV_PIPELINE_SIZE; + if (ddt_iov_end_pos > ddt_iov_count) { + ddt_iov_end_pos = ddt_iov_count; + } + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); #endif + buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, total_unpacked, &contig_disp, ¤t_ddt_iov_pos); + #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME(start); + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); #endif - if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { - source = (unsigned char*)iov[0].iov_base; - free_required = 0; - } else { - if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { - cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); - pConvertor->gpu_buffer_ptr = NULL; - free_required = 0; - } else { - if (pConvertor->gpu_buffer_ptr == NULL) { - pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + 
opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, destination_base, source_base, 0, 0); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + source_base += contig_disp; + if (!buffer_isfull) { + pConvertor->current_iov_pos = current_ddt_iov_pos; + if (current_ddt_iov_pos == ddt_iov_count) { + pConvertor->current_count ++; + pConvertor->current_iov_pos = 0; + destination_base += ddt_extent; } - source = pConvertor->gpu_buffer_ptr; - cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); - free_required = 1; } } - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV cached, GPU base %p, unpack from buffer %p, total size %ld\n", - pConvertor->pBaseBuf, source, iov[0].iov_len); ); + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + return OPAL_SUCCESS; +} + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked) +{ + uint32_t i; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + unsigned char *source_base, *destination_base; + uint8_t buffer_isfull = 0; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; + cudaStream_t *cuda_stream_iov = NULL; + uint32_t cuda_iov_start_pos, cuda_iov_end_pos; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + uint32_t cached_cuda_iov_count = 0; + size_t cuda_iov_partial_length_start = 0; + size_t cuda_iov_partial_length_end = 0; + opal_datatype_count_t convertor_current_count; + OPAL_PTRDIFF_TYPE 
ddt_extent; + #if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - move_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); + TIMER_DATA_TYPE start, end; + long total_time; #endif - + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV cached, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, buffer_size); ); #if defined (OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - buffer_size = iov[0].iov_len; - total_unpacked = 0; + cuda_streams->current_stream_id = 0; source_base = source; - -#if defined (OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end ); - total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); -#endif - thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; destination_base = (unsigned char*)pConvertor->pBaseBuf; @@ -820,7 +868,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ if (pConvertor->current_iov_partial_length > 0) { cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; - total_unpacked += cuda_iov_partial_length_start; + *total_unpacked += cuda_iov_partial_length_start; buffer_size -= cuda_iov_partial_length_start; pConvertor->current_iov_partial_length = 0; cuda_iov_start_pos ++; @@ -833,13 +881,13 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { - total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; + *total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; nb_blocks_used ++; } else { if (buffer_size > 0) { cuda_iov_partial_length_end = buffer_size; - total_unpacked 
+= cuda_iov_partial_length_end; + *total_unpacked += cuda_iov_partial_length_end; nb_blocks_used ++; } buffer_size = 0; @@ -864,28 +912,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); - pConvertor->bConverted += total_unpacked; - DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); - - iov[0].iov_len = total_unpacked; - *max_data = total_unpacked; - *out_size = 1; - -#if defined(OPAL_DATATYPE_CUDA_TIMING) - GET_TIME( end_total ); - total_time = ELAPSED_TIME( start_total, end_total ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); -#endif - - if( pConvertor->bConverted == pConvertor->local_size ) { - pConvertor->flags |= CONVERTOR_COMPLETED; - if (pConvertor->gpu_buffer_ptr != NULL && free_required) { - opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); - pConvertor->gpu_buffer_ptr = NULL; - } - return 1; - } - return 0; + return OPAL_SUCCESS; } void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, From eb143dc6a56c07a0fbfa30f09c82a882808e8a13 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Fri, 5 Feb 2016 12:36:36 -0800 Subject: [PATCH 185/190] apply loop unroll on packing kernels --- opal/datatype/cuda/opal_datatype_cuda.cu | 6 +- .../cuda/opal_datatype_cuda_internal.cuh | 6 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 512 +++++++++++++++++- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 17 +- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 13 +- test/datatype/Makefile.am | 2 +- test/datatype/ddt_benchmark.c | 125 +++-- test/datatype/ddt_lib.c | 8 + test/datatype/ddt_lib.h | 4 +- 9 files changed, 635 insertions(+), 58 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 2c76a327197..372edefa96a 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ 
b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -358,7 +358,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov uint32_t count_desc, nb_blocks_per_description, residue_desc; uint32_t thread_per_block, nb_blocks_used; size_t length_per_iovec; - uint8_t alignment; + uint32_t alignment; ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; ddt_cuda_iov_dist_cached_t *cached_cuda_iov_dist_d = NULL; @@ -389,14 +389,14 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; cuda_iov_dist_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - thread_per_block = CUDA_WARP_SIZE * 5; + thread_per_block = CUDA_WARP_SIZE * 32; for (i = 0; i < ddt_iov_count; i++) { length_per_iovec = ddt_iov[i].iov_len; ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ - alignment = ALIGNMENT_DOUBLE; + alignment = ALIGNMENT_DOUBLE * 1; count_desc = length_per_iovec / alignment; residue_desc = length_per_iovec % alignment; diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index 72edcb3d8a3..e6268fadc05 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -17,7 +17,7 @@ #define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H 0 #define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 #define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 -#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL 1 +#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL 0 #define OPAL_DATATYPE_CUDA_IOV_CACHE 1 @@ -40,6 +40,10 @@ #define ALIGNMENT_CHAR 1 #define NUM_CUDA_IOV_PER_DDT 150000 #define IOV_PIPELINE_SIZE 1000 +#define KERNEL_UNROLL 16 +#define UNROLL_16 16 +#define UNROLL_8 8 +#define UNROLL_4 4 
#define TIMER_DATA_TYPE struct timeval #define GET_TIME(TV) gettimeofday( &(TV), NULL ) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 2564fe1393c..79138a72f9a 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -5,6 +5,7 @@ #include #include +#if 1 __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, @@ -13,17 +14,17 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, { uint32_t _i, tid, num_threads; uint32_t gap, nb_elements; - double *_source_tmp, *_destination_tmp, *_src_disp_tmp;; + uint64_t *_source_tmp, *_destination_tmp, *_src_disp_tmp;; tid = threadIdx.x + blockIdx.x * blockDim.x; num_threads = gridDim.x * blockDim.x; gap = (extent - size) / 8; nb_elements = size / 8; - _src_disp_tmp = (double*)source; - _destination_tmp = (double*)destination; + _src_disp_tmp = (uint64_t*)source; + _destination_tmp = (uint64_t*)destination; _destination_tmp += tid; - +#if 0 for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; #if defined (OPAL_DATATYPE_CUDA_DEBUG) @@ -41,8 +42,225 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, #endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ _destination_tmp += num_threads; } +#else + for (_i = tid; _i < copy_loops*nb_elements; _i+=16*num_threads) { + uint64_t val[16]; + uint32_t _j; + uint32_t u; + uint64_t *mysrc = _src_disp_tmp + tid; + + #pragma unroll + for (u = 0; u < 16; u++) { + _j = _i + u * num_threads; + val[u] = *(mysrc + _j/num_threads*num_threads + _j/nb_elements * gap); + } + + #pragma unroll + for (u = 0; u < 16; u++) { + *_destination_tmp = val[u]; + _destination_tmp += num_threads; + } +/* + _j = _i; + val[0] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[1] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[2] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[3] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[4] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[5] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[6] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[7] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[8] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[9] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[10] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[11] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[12] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[13] = *(_src_disp_tmp + tid + 
_j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[14] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[15] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + *_destination_tmp = val[0]; + _destination_tmp += num_threads; + *_destination_tmp = val[1]; + _destination_tmp += num_threads; + *_destination_tmp = val[2]; + _destination_tmp += num_threads; + *_destination_tmp = val[3]; + _destination_tmp += num_threads; + *_destination_tmp = val[4]; + _destination_tmp += num_threads; + *_destination_tmp = val[5]; + _destination_tmp += num_threads; + *_destination_tmp = val[6]; + _destination_tmp += num_threads; + *_destination_tmp = val[7]; + _destination_tmp += num_threads; + *_destination_tmp = val[8]; + _destination_tmp += num_threads; + *_destination_tmp = val[9]; + _destination_tmp += num_threads; + *_destination_tmp = val[10]; + _destination_tmp += num_threads; + *_destination_tmp = val[11]; + _destination_tmp += num_threads; + *_destination_tmp = val[12]; + _destination_tmp += num_threads; + *_destination_tmp = val[13]; + _destination_tmp += num_threads; + *_destination_tmp = val[14]; + _destination_tmp += num_threads; + *_destination_tmp = val[15]; + _destination_tmp += num_threads; +*/ + } +#endif +} + +#else + +#define SEG_ADD(s) \ + l += s; \ + while (l >= lines) { \ + l -= lines; \ + c += width; \ + } + +__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines, + size_t nb_size, + OPAL_PTRDIFF_TYPE nb_extent, + unsigned char * b_source, + unsigned char * b_destination ) +{ + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; + uint32_t num_threads = gridDim.x * blockDim.x; + + //size_t lines = (size_t)lines; + size_t size = nb_size / 8; + size_t extent = nb_extent / 8; + uint64_t * source = (uint64_t *) b_source; + uint64_t *destination = (uint64_t *) b_destination; + uint64_t val[KERNEL_UNROLL]; + + int col = 0; + for (int 
width = 32; width > 0 && col < size; width >>= 1) { + while (size-col >= width) { + const int warp_id = tid / width; + const int warp_tid = tid & (width-1); + const int warp_nb = num_threads / width; + const int c = col + warp_tid; + int l = warp_id * KERNEL_UNROLL; + uint64_t *src = source + c; + uint64_t *dst = destination + c; + for (int b=0; b= width) { \ + col -= width; \ + off += ext - width; \ + } + +#define ELEMSIZE 32 + +__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t +copy_loops, +size_t size, +OPAL_PTRDIFF_TYPE extent, +unsigned char * source, +unsigned char * destination ) +{ + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x ; + uint32_t num_threads = gridDim.x * blockDim.x; + + int col = 0; + int off = 0; + + COLOFF_INC(tid, size/ELEMSIZE, extent/ELEMSIZE); + + if (ELEMSIZE % 8 == 0) { + volatile uint64_t * __restrict__ dst = (uint64_t*)destination + +tid * ELEMSIZE/8; + for (int offset = tid; offset < copy_loops*size/ELEMSIZE; +offset+=num_threads) { + const volatile uint64_t * __restrict__ src = (uint64_t*)source + off * ELEMSIZE/8; +#if 1 + uint64_t val[ELEMSIZE/8]; + #pragma unroll + for (int i = 0; i < ELEMSIZE/8; i++) { + val[i] = src[i]; + } + #pragma unroll + for (int i = 0; i < ELEMSIZE/8; i++) { + dst[i] = val[i]; + } +#else + #pragma unroll + for (int i = 0; i < ELEMSIZE/8; i++) { + dst[i] = __ldg(src+i); + } +#endif + dst += num_threads*ELEMSIZE/8; + COLOFF_INC(num_threads, size/ELEMSIZE, extent/ELEMSIZE); + } + } +} +*/ +#endif + + __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used) { uint32_t i, _copy_count; @@ -88,6 +306,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_io } } +#if 0 __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, 
unsigned char* source_base, unsigned char* destination_base) { uint32_t i, j; @@ -141,7 +360,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di printf("pack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); } __syncthreads(); - */ + */ for (j = threadIdx.x; j < copy_count; j += blockDim.x) { if (j < copy_count) { _source_tmp = source_base + src_offset + j * alignment; @@ -159,3 +378,286 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di } } } + +#else +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +{ + uint32_t i, j; + uint32_t _nb_bytes; + size_t src_offset, dst_offset; + unsigned char *_source_tmp, *_destination_tmp; + uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; + + __shared__ uint32_t nb_tasks_per_block; + __shared__ uint32_t WARP_SIZE; + __shared__ uint32_t nb_warp_per_block; + uint32_t copy_count; + uint8_t alignment; + uint64_t tmp_var_64[KERNEL_UNROLL]; + uint32_t tmp_var_32[KERNEL_UNROLL]; + unsigned char tmp_var_8[KERNEL_UNROLL]; + uint32_t u, k; + uint32_t copy_count_16, copy_count_8, copy_count_left; + + if (threadIdx.x == 0) { + nb_tasks_per_block = nb_blocks_used / gridDim.x; + if (blockIdx.x < (nb_blocks_used % gridDim.x)) { + nb_tasks_per_block ++; + } + if (nb_tasks_per_block >= 4) { + WARP_SIZE = 32; + } else if (nb_tasks_per_block == 1) { + WARP_SIZE = blockDim.x; + } else { + WARP_SIZE = 64; + } + nb_warp_per_block = 
blockDim.x / WARP_SIZE; + // nb_warp_per_block = 1; + // if (nb_tasks_per_block == ) + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d\n", cuda_iov_count, ddt_extent, current_count); + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); + } + __syncthreads(); + + const uint32_t warp_id_per_block = threadIdx.x / WARP_SIZE; + const uint32_t tid_per_warp = threadIdx.x & (WARP_SIZE - 1); + // uint32_t warp_id_per_block = 0; + // uint32_t tid_per_warp = threadIdx.x; + + for (i = warp_id_per_block; i < nb_tasks_per_block; i+= nb_warp_per_block) { + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + dst_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - destination_disp; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; + + _source_tmp = source_base + src_offset; + _destination_tmp = destination_base + dst_offset; + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + + // alignment = ALIGNMENT_DOUBLE; + copy_count = _nb_bytes / alignment; + /* + if (threadIdx.x == 0 && nb_tasks != 0) { + printf("pack block %d, src_offset %ld, dst_offset %ld, count 
%d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ + /* if (threadIdx.x == 0){ + printf("bytes %d, copy count %d, alignment %d, task %d, nb_block_used %d\n", _nb_bytes, copy_count, alignment, i, nb_blocks_used); + } */ + if (alignment == ALIGNMENT_DOUBLE) { + uint64_t *_source_base_64, *_destination_base_64; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_64 = (uint64_t *)(source_base + src_offset); + _destination_base_64 = (uint64_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_16; + _destination_base_64 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_8; + _destination_base_64 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_64[u] = *(_source_base_64 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + 
if (j < copy_count_left) { + *(_destination_base_64 + j) = tmp_var_64[u]; + } else { + break; + } + } + } + } else if (alignment == ALIGNMENT_FLOAT) { + uint32_t *_source_base_32, *_destination_base_32; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_32 = (uint32_t *)(source_base + src_offset); + _destination_base_32 = (uint32_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_16; + _destination_base_32 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_8; + _destination_base_32 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_32[u] = *(_source_base_32 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_32 + j) = tmp_var_32[u]; + } else { + break; + } + } + } + } else { + unsigned char *_source_base_8, *_destination_base_8; + + copy_count_16 = 
copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_8 = (unsigned char *)(source_base + src_offset); + _destination_base_8 = (unsigned char *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_16; + _destination_base_8 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_8; + _destination_base_8 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_8[u] = *(_source_base_8 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_8 + j) = tmp_var_8[u]; + } else { + break; + } + } + } + } + } +} +#endif diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 0137601bf70..dd23aa853ed 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -463,7 +463,7 @@ 
void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); #else - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + pack_contiguous_loop_cuda_kernel_global<<<1, 8*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -1056,7 +1056,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto } cuda_streams->current_stream_id = 0; - thread_per_block = CUDA_WARP_SIZE * 5; + thread_per_block = CUDA_WARP_SIZE * 4; nb_blocks = 256; opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); source_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; @@ -1146,8 +1146,8 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_streams->current_stream_id = 0; destination_base = destination; - thread_per_block = CUDA_WARP_SIZE * 5; - nb_blocks = 256; + thread_per_block = CUDA_WARP_SIZE * 8; + nb_blocks = 1; source_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start to cache iov */ @@ -1211,12 +1211,19 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif 
opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, source_base, destination_base); pConvertor->current_cuda_iov_pos += nb_blocks_used; pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); - +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack kernel %ld microsec\n", total_time); ); +#endif return OPAL_SUCCESS; } diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index bb54dfeeb0a..f4e89accefe 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -830,7 +830,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cuda_streams->current_stream_id = 0; source_base = source; - thread_per_block = CUDA_WARP_SIZE * 5; + thread_per_block = CUDA_WARP_SIZE * 4; nb_blocks = 256; destination_base = (unsigned char*)pConvertor->pBaseBuf; @@ -908,10 +908,19 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); - 
+#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack kernel %ld microsec\n", total_time); ); +#endif + return OPAL_SUCCESS; } diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index de658c503cb..3dd69732ba8 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -35,7 +35,7 @@ ddt_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/ ddt_benchmark_SOURCES = ddt_benchmark.c ddt_lib.c ddt_lib.h ddt_benchmark_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) ddt_benchmark_CFLAGS = -I/mnt/sw/cuda/include -g -O0 -ddt_benchmark_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/mnt/sw/cuda/lib64 -lcudart +ddt_benchmark_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/shared/apps/cuda/CUDA-v7.5.18/lib64 -lcudart #ddt_test_old_SOURCES = ddt_test_old.c ddt_lib.c ddt_lib.h #ddt_test_old_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index e879e5c0192..1ce768900db 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -198,24 +198,27 @@ static void fill_vectors(double* vp, int itera, int contig, int gap) for (i = (itera-1)*gap; i < (itera-1)*gap+contig; i++) { vp[i] = 1.1; } - - // printf("vector generated:\n"); - // for (i = 0; i < (itera-1)*gap+contig; i++) { - // printf("%1.f ", vp[i]); - // } - printf("\n"); + /* + printf("vector generated:\n"); + for (i = 0; i < (itera-1)*gap+contig; i++) { + printf("%1.f ", vp[i]); + if ((i+1) % gap == 0) printf("\n"); + } + printf("\n");*/ } static void verify_vectors(double *vp, int itera, int contig, int gap) { int i, j; int error = 0; + int count = 0; for (i = 0; i < itera-1; i++) { for (j = i*gap; j < (i+1)*gap; j++) { if (j >= i*gap && j < i*gap+contig) { if (vp[j] != 1.1) { error ++; } + count 
++; } } } @@ -223,15 +226,19 @@ static void verify_vectors(double *vp, int itera, int contig, int gap) if (vp[i] != 1.1) { error ++; } + count ++; } - // printf("vector received:\n"); - // for (i = 0; i < (itera-1)*gap+contig; i++) { - // printf("%1.f ", vp[i]); - // } - if (error != 0) { - printf("%d error is found\n", error); +/* + printf("vector received:\n"); + for (i = 0; i < (itera-1)*gap+contig; i++) { + printf("%1.f ", vp[i]); + if ((i+1) % gap == 0) printf("\n"); + } + */ + if (error != 0) { + printf("%d errors out of %d\n", error, count); } else { - printf("no error is found\n"); + printf("no errors out of %d\n", count); } } @@ -249,9 +256,10 @@ vector_ddt( ompi_datatype_t* send_type, int send_count, TIMER_DATA_TYPE start, end, unpack_start, unpack_end; long total_time, unpack_time = 0, push_time = 0, pop_time = 0, pack_time = 0; size_t slength, rlength; + int shift_n = 0; - rlength = compute_buffer_length(recv_type, recv_count); - slength = compute_buffer_length(send_type, send_count); + rlength = compute_buffer_length(recv_type, recv_count) + sizeof(double)*shift_n; + slength = compute_buffer_length(send_type, send_count) + sizeof(double)*shift_n; cudaSetDevice(0); @@ -261,6 +269,7 @@ vector_ddt( ompi_datatype_t* send_type, int send_count, exit(-1); } cudaMemset(psrc, 0, slength); + psrc += sizeof(double)*shift_n; printf("cudamalloc psrc %p\n", psrc); error = cudaMalloc((void **)&pdst, rlength); @@ -269,6 +278,7 @@ vector_ddt( ompi_datatype_t* send_type, int send_count, exit(-1); } cudaMemset(pdst, 0, rlength); + pdst += sizeof(double)*shift_n; printf("cudamalloc pdst %p\n", pdst); // error = cudaHostAlloc((void **)&ptemp, chunk, cudaHostAllocMapped); @@ -279,6 +289,7 @@ vector_ddt( ompi_datatype_t* send_type, int send_count, exit(-1); } memset(ptemp, 0, chunk); + ptemp += sizeof(double)*shift_n; printf("cudamallochost ptemp %p\n", ptemp); @@ -290,6 +301,10 @@ vector_ddt( ompi_datatype_t* send_type, int send_count, memset(psrc_host, 0, slength); 
memset(pdst_host, 0, rlength); + pdst_host += sizeof(double)*shift_n; + psrc_host += sizeof(double)*shift_n; + slength -= sizeof(double)*shift_n; + rlength -= sizeof(double)*shift_n; if (itera > 0) { fill_vectors((double *)psrc_host, itera, contig, gap); } @@ -708,6 +723,14 @@ static void fill_upper_matrix(void *matt, int msize) blklens[i] = msize - i; displs[i] = i*msize + i; } + /*int ct = 0; + for (i = 0; i < msize; i++) { + blklens[i] = msize - ct*160; + displs[i] = i*msize + ct*160; + if (i % 160 == 0 && i != 0) { + ct++; + } + }*/ for (i = 0; i < msize; i++) { start = displs[i]; end = start + blklens[i]; @@ -722,13 +745,14 @@ static void fill_upper_matrix(void *matt, int msize) free(blklens); free(displs); - // printf("matrix generate\n"); - // for (i = 0; i < msize; i++) { - // for (j = 0; j < msize; j++) { - // printf(" %1.f ", mat[i*msize+j]); - // } - // printf("\n"); - // } + /* + printf("matrix generate\n"); + for (i = 0; i < msize; i++) { + for (j = 0; j < msize; j++) { + printf(" %1.f ", mat[i*msize+j]); + } + printf("\n"); + }*/ } static void verify_mat_result(void *matt, int msize) @@ -752,6 +776,14 @@ static void verify_mat_result(void *matt, int msize) blklens[i] = msize - i; displs[i] = i*msize + i; } + /*int ct = 0; + for (i = 0; i < msize; i++) { + blklens[i] = msize - ct*160; + displs[i] = i*msize + ct*160; + if (i % 160 == 0 && i != 0) { + ct++; + } + }*/ for (i = 0; i < msize; i++) { start = displs[i]; end = start + blklens[i]; @@ -767,15 +799,15 @@ static void verify_mat_result(void *matt, int msize) } free(blklens); free(displs); - - // printf("matrix received\n"); - // for (i = 0; i < msize; i++) { - // for (j = 0; j < msize; j++) { - // printf(" %1.f ", mat[i*msize+j]); - // } - // printf("\n"); - // } - + /* + printf("matrix received\n"); + for (i = 0; i < msize; i++) { + for (j = 0; j < msize; j++) { + printf(" %1.f ", mat[i*msize+j]); + } + printf("\n"); + } + */ if (error != 0) { printf("error is found %d\n", error); } else { @@ 
-795,8 +827,9 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk long total_time, unpack_time = 0; int j, t_error = 0; unsigned char *mat_char; + int shift_n = 0; - dt_length = compute_buffer_length(pdt, count); + dt_length = compute_buffer_length(pdt, count) + sizeof(double) * shift_n; printf("length %lu\n", dt_length); #if defined (DDT_TEST_CUDA) @@ -809,6 +842,7 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk printf("CUDA error: %s\n", cudaGetErrorString(error)); exit(-1); } + psrc += sizeof(double) * shift_n; cudaMemset(psrc, 0, dt_length); printf("cudamalloc psrc %p\n", psrc); @@ -817,6 +851,7 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk printf("CUDA error: %s\n", cudaGetErrorString(error)); exit(-1); } + pdst += sizeof(double) * shift_n; cudaMemset(pdst, 0, dt_length); printf("cudamalloc pdst %p\n", pdst); @@ -825,6 +860,7 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk printf("CUDA error: %s\n", cudaGetErrorString(error)); exit(-1); } + ptemp += sizeof(double) * shift_n; memset(ptemp, 0, chunk); printf("cudamallochost ptemp %p\n", ptemp); @@ -833,6 +869,7 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk printf("CUDA error: %s\n", cudaGetErrorString(error)); exit(-1); } + phost += sizeof(double) * shift_n; memset(phost, 0, dt_length); printf("cudamallochost phost %p\n", phost); #else @@ -845,6 +882,7 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk #endif #if defined (DDT_TEST_CUDA) + dt_length -= sizeof(double) * shift_n; if (msize > 0) { fill_upper_matrix(phost, msize); } @@ -904,6 +942,11 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk } printf("total error %d\n", t_error); #endif + /* double *mat_d = (double *)ptemp; + for (j = 0; j < max_data/sizeof(double); j++) { + printf("%1.f ", mat_d[j]); + }*/ + // 
printf("max data %d, ptemp %p \n", max_data, ptemp); if( done2 == 0 ) { GET_TIME( unpack_start ); @@ -936,6 +979,10 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk if( NULL != recv_convertor ) OBJ_RELEASE( recv_convertor ); #if defined (DDT_TEST_CUDA) + psrc -= sizeof(double) * shift_n; + pdst -= sizeof(double) * shift_n; + ptemp -= sizeof(double) * shift_n; + phost -= sizeof(double) * shift_n; if( NULL != pdst ) cudaFree( pdst ); if( NULL != psrc ) cudaFree( psrc ); if( NULL != ptemp ) cudaFreeHost( ptemp ); @@ -1224,12 +1271,12 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 2000; mat_size <= 2000; mat_size +=500) { + for (mat_size = 1000; mat_size <= 4000; mat_size +=1000) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 1; i <= 1; i++) { - local_copy_with_convertor(pdt, 1, 40000000, mat_size); + for (i = 1; i <= 5; i++) { + // local_copy_with_convertor(pdt, 1, 200000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -1292,13 +1339,13 @@ int main( int argc, char* argv[] ) } - for (blk_len = 1000; blk_len <= 1000; blk_len += 2) { + for (blk_len = 4000; blk_len <= 4000; blk_len += 2000) { printf( ">>--------------------------------------------<<\n" ); printf( "Vector data-type (1024 times %d double stride 512)\n", blk_len ); - pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len*2); + pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { - for (i = 0; i < 1; i++) { - // vector_ddt( pdt, 1, pdt, 1, 2000000 , 1000, blk_len, blk_len*2); + for (i = 0; i < 4; i++) { + vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } diff --git a/test/datatype/ddt_lib.c 
b/test/datatype/ddt_lib.c index 321a5c4be88..a96ec085ddd 100644 --- a/test/datatype/ddt_lib.c +++ b/test/datatype/ddt_lib.c @@ -363,6 +363,14 @@ ompi_datatype_t* upper_matrix( unsigned int mat_size ) disp[i] = i * mat_size + i; blocklen[i] = mat_size - i; } + /*int ct = 0; + for (i = 0; i < mat_size; i++) { + blocklen[i] = mat_size - ct*160; + disp[i] = i*mat_size + ct*160; + if (i % 160 == 0 && i != 0) { + ct++; + } + }*/ #if defined (TEST_DOUBLE) ompi_datatype_create_indexed( mat_size, blocklen, disp, &ompi_mpi_double.dt, &upper ); diff --git a/test/datatype/ddt_lib.h b/test/datatype/ddt_lib.h index ef462ce0f31..0f6bbc2cb37 100644 --- a/test/datatype/ddt_lib.h +++ b/test/datatype/ddt_lib.h @@ -34,9 +34,9 @@ #define DUMP_DATA_AFTER_COMMIT 0x00000001 #define CHECK_PACK_UNPACK 0x00000002 -//#define TEST_DOUBLE +#define TEST_DOUBLE //#define TEST_FLOAT -#define TEST_CHAR +//#define TEST_CHAR extern uint32_t outputFlags; From b45b646983c26c7d075606eb6bc3d9be8b5e7fdb Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Mon, 22 Feb 2016 17:13:30 -0800 Subject: [PATCH 186/190] apply unroll to unpack --- opal/datatype/cuda/opal_datatype_cuda.cu | 2 +- .../cuda/opal_datatype_pack_cuda_kernel.cu | 12 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 7 +- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 288 ++++++++++++++++++ .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 5 +- test/datatype/ddt_benchmark.c | 2 +- 6 files changed, 303 insertions(+), 13 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 372edefa96a..7d12a5d80db 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -389,7 +389,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; cuda_iov_dist_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - 
thread_per_block = CUDA_WARP_SIZE * 32; + thread_per_block = CUDA_WARP_SIZE * 64; for (i = 0; i < ddt_iov_count; i++) { length_per_iovec = ddt_iov[i].iov_len; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 79138a72f9a..81e7f7c4dcd 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -5,7 +5,7 @@ #include #include -#if 1 +#if 0 __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, @@ -43,20 +43,20 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, _destination_tmp += num_threads; } #else - for (_i = tid; _i < copy_loops*nb_elements; _i+=16*num_threads) { + for (_i = tid; _i < copy_loops*nb_elements; _i+=8*num_threads) { uint64_t val[16]; uint32_t _j; uint32_t u; uint64_t *mysrc = _src_disp_tmp + tid; #pragma unroll - for (u = 0; u < 16; u++) { + for (u = 0; u < 8; u++) { _j = _i + u * num_threads; val[u] = *(mysrc + _j/num_threads*num_threads + _j/nb_elements * gap); } #pragma unroll - for (u = 0; u < 16; u++) { + for (u = 0; u < 8; u++) { *_destination_tmp = val[u]; _destination_tmp += num_threads; } @@ -184,7 +184,7 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines, for (int b=0; bsize, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); #else - pack_contiguous_loop_cuda_kernel_global<<<1, 8*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + pack_contiguous_loop_cuda_kernel_global<<<32, 8*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -1056,7 +1056,7 @@ int32_t 
opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto } cuda_streams->current_stream_id = 0; - thread_per_block = CUDA_WARP_SIZE * 4; + thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); source_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; @@ -1095,6 +1095,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); opal_generic_simple_pack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, source_base, destination_base); + //cudaStreamSynchronize(*cuda_stream_iov); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -1147,7 +1148,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_streams->current_stream_id = 0; destination_base = destination; thread_per_block = CUDA_WARP_SIZE * 8; - nb_blocks = 1; + nb_blocks = 4; source_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start to cache iov */ diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index f6ee8e0bfc4..4774abf5f38 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -46,6 +46,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_ } } +#if 0 __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t 
cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) { uint32_t i, j; @@ -136,6 +137,293 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ } } +#else +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +{ + uint32_t i, j; + size_t dst_offset, src_offset; + unsigned char *_source_tmp, *_destination_tmp; + uint32_t _nb_bytes; + uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t source_partial_disp = 0; + size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; + + __shared__ uint32_t nb_tasks_per_block; + __shared__ uint32_t WARP_SIZE; + __shared__ uint32_t nb_warp_per_block; + uint32_t copy_count; + uint8_t alignment; + uint64_t tmp_var_64[KERNEL_UNROLL]; + uint32_t tmp_var_32[KERNEL_UNROLL]; + unsigned char tmp_var_8[KERNEL_UNROLL]; + uint32_t u, k; + uint32_t copy_count_16, copy_count_8, copy_count_left; + + if (threadIdx.x == 0) { + nb_tasks_per_block = nb_blocks_used / gridDim.x; + if (blockIdx.x < nb_blocks_used % gridDim.x) { + nb_tasks_per_block ++; + } + if (nb_tasks_per_block >= 4) { + WARP_SIZE = 32; + } else if (nb_tasks_per_block == 1) { + WARP_SIZE = blockDim.x; + } else { + WARP_SIZE = 64; + } + nb_warp_per_block = blockDim.x / WARP_SIZE; + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d, ddt_size %d\n", cuda_iov_count, ddt_extent, current_count, ddt_size); + } + __syncthreads(); + + const uint32_t warp_id_per_block = threadIdx.x / WARP_SIZE; + const uint32_t tid_per_warp = threadIdx.x & (WARP_SIZE - 1); + + if 
(cuda_iov_partial_length_start != 0) { + source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + } + + for (i = warp_id_per_block; i < nb_tasks_per_block; i+= nb_warp_per_block) { + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp - source_partial_disp; + dst_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; + + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp; + dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; + _nb_bytes = cuda_iov_partial_length_start; + } else if (i == nb_tasks_per_block-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { + _nb_bytes = cuda_iov_partial_length_end; + } + + _destination_tmp = destination_base + dst_offset; + _source_tmp = source_base + src_offset; + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + + copy_count = _nb_bytes / alignment; + /* + if (threadIdx.x == 0 && nb_tasks != 0) { + printf("unpack block %d, src_offset %ld, dst_offset 
%ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ + if (alignment == ALIGNMENT_DOUBLE) { + uint64_t *_source_base_64, *_destination_base_64; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_64 = (uint64_t *)(source_base + src_offset); + _destination_base_64 = (uint64_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_16; + _destination_base_64 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_8; + _destination_base_64 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_64[u] = *(_source_base_64 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_64 + j) = tmp_var_64[u]; + } else { + break; + } + } + } + } else if (alignment == ALIGNMENT_FLOAT) { + 
uint32_t *_source_base_32, *_destination_base_32; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_32 = (uint32_t *)(source_base + src_offset); + _destination_base_32 = (uint32_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_16; + _destination_base_32 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_8; + _destination_base_32 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_32[u] = *(_source_base_32 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_32 + j) = tmp_var_32[u]; + } else { + break; + } + } + } + } else { + unsigned char *_source_base_8, *_destination_base_8; + + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_8 = (unsigned char *)(source_base + src_offset); + _destination_base_8 = 
(unsigned char *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_16; + _destination_base_8 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_8; + _destination_base_8 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_8[u] = *(_source_base_8 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_8 + j) = tmp_var_8[u]; + } else { + break; + } + } + } + } + } +} + +#endif + __global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, size_t size, OPAL_PTRDIFF_TYPE extent, diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index f4e89accefe..7e30f114d06 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -774,6 +774,7 @@ int32_t 
opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, destination_base, source_base, 0, 0); + //cudaStreamSynchronize(*cuda_stream_iov); cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); opal_cuda_check_error(cuda_err); iov_pipeline_block_id ++; @@ -830,8 +831,8 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cuda_streams->current_stream_id = 0; source_base = source; - thread_per_block = CUDA_WARP_SIZE * 4; - nb_blocks = 256; + thread_per_block = CUDA_WARP_SIZE * 8; + nb_blocks = 2; destination_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start to cache iov */ diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 1ce768900db..8b3c7ce7981 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1276,7 +1276,7 @@ int main( int argc, char* argv[] ) printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 5; i++) { - // local_copy_with_convertor(pdt, 1, 200000000, mat_size); + // local_copy_with_convertor(pdt, 1, 200000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); From 40375542ac3c5ad23d30ba63c790927e693a5e9f Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Tue, 23 Feb 2016 15:48:40 -0800 Subject: [PATCH 187/190] fix a cuda event bug. cudaStreamWaitEvent is not blocking call. 
fix cuda stream --- opal/datatype/cuda/opal_datatype_cuda.cu | 43 ++++++++++----- opal/datatype/cuda/opal_datatype_cuda.cuh | 4 ++ .../cuda/opal_datatype_cuda_internal.cuh | 10 ++-- .../cuda/opal_datatype_pack_cuda_kernel.cu | 4 +- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 52 +++++++++---------- .../cuda/opal_datatype_unpack_cuda_kernel.cu | 4 +- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 42 +++++++-------- opal/datatype/opal_datatype_cuda.c | 23 ++++++++ opal/datatype/opal_datatype_cuda.h | 4 ++ opal/mca/btl/smcuda/btl_smcuda.c | 3 +- opal/mca/btl/smcuda/btl_smcuda_component.c | 9 +++- test/datatype/ddt_benchmark.c | 6 +-- 12 files changed, 128 insertions(+), 76 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu index 7d12a5d80db..0a15fe3ab2b 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cu +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -217,9 +217,16 @@ int32_t opal_ddt_cuda_kernel_init(void) /* init cuda stream */ ddt_cuda_stream_t *cuda_streams = (ddt_cuda_stream_t *)malloc(sizeof(ddt_cuda_stream_t)); - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; for (j = 0; j < NB_STREAMS; j++) { - cudaStreamCreate(&(cuda_streams->opal_cuda_stream[j])); + cudaStreamCreate(&(cuda_streams->ddt_cuda_stream[j])); + } + cuda_streams->current_stream_id = 0; + cuda_devices[i].cuda_streams = cuda_streams; + cudaEventCreate(&(cuda_devices[i].memcpy_event), cudaEventDisableTiming); + + /* init iov pipeline blocks */ + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; + for (j = 0; j < NB_PIPELINE_BLOCKS; j++) { cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); cudaMalloc((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), 
sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); @@ -228,14 +235,11 @@ int32_t opal_ddt_cuda_kernel_init(void) } else { cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; } - cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); - cuda_iov_pipeline_block->cuda_stream_id = 0; - cudaEventCreate(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); + // cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); + // cuda_iov_pipeline_block->cuda_stream_id = 0; + cudaEventCreateWithFlags(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); cuda_devices[i].cuda_iov_pipeline_block[j] = cuda_iov_pipeline_block; } - cuda_streams->current_stream_id = 0; - cuda_devices[i].cuda_streams = cuda_streams; - cudaEventCreate(&(cuda_devices[i].memcpy_event), cudaEventDisableTiming); } current_cuda_device = &(cuda_devices[0]); @@ -262,7 +266,7 @@ int32_t opal_ddt_cuda_kernel_fini(void) /* destory cuda stream and iov*/ ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; for (j = 0; j < NB_STREAMS; j++) { - cudaStreamDestroy(cuda_devices[i].cuda_streams->opal_cuda_stream[j]); + cudaStreamDestroy(cuda_devices[i].cuda_streams->ddt_cuda_stream[j]); cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block[j]; if (cuda_iov_pipeline_block != NULL) { if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h != NULL) { @@ -279,7 +283,6 @@ int32_t opal_ddt_cuda_kernel_fini(void) } cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); cuda_iov_pipeline_block->cuda_stream = NULL; - cuda_iov_pipeline_block->cuda_stream_id = -1; free(cuda_iov_pipeline_block); cuda_iov_pipeline_block = NULL; } @@ -369,6 +372,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov size_t ncontig_disp_base; size_t contig_disp = 0; uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; 
opal_datatype_t *datatype = (opal_datatype_t *)pConvertor->pDesc; @@ -387,6 +391,7 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; + cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); cuda_iov_dist_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; thread_per_block = CUDA_WARP_SIZE * 64; @@ -735,13 +740,25 @@ void opal_cuda_check_error(cudaError_t err) void opal_ddt_cuda_d2dcpy_async(void* dst, const void* src, size_t count) { - cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->opal_cuda_stream[0]); + cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->ddt_cuda_stream[current_cuda_device->cuda_streams->current_stream_id]); } void opal_ddt_cuda_d2dcpy(void* dst, const void* src, size_t count) { - cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->opal_cuda_stream[0]); - cudaStreamSynchronize(current_cuda_device->cuda_streams->opal_cuda_stream[0]); + cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->ddt_cuda_stream[current_cuda_device->cuda_streams->current_stream_id]); + cudaStreamSynchronize(current_cuda_device->cuda_streams->ddt_cuda_stream[current_cuda_device->cuda_streams->current_stream_id]); +} + +void opal_ddt_cuda_set_cuda_stream() +{ + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id & (NB_STREAMS-1); +} + +int32_t opal_ddt_cuda_get_cuda_stream() +{ + return current_cuda_device->cuda_streams->current_stream_id; } void opal_dump_cuda_list(ddt_cuda_list_t *list) 
diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh index c33ff606bd9..cab006e0f3f 100644 --- a/opal/datatype/cuda/opal_datatype_cuda.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -125,6 +125,10 @@ int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, size_t *buffer_size, uint32_t *nb_blocks_used, size_t *total_packed, size_t *contig_disp_out, uint32_t *current_ddt_iov_pos); +void opal_ddt_cuda_set_cuda_stream(); + +int32_t opal_ddt_cuda_get_cuda_stream(); + } #endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh index e6268fadc05..31be1def712 100644 --- a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -30,7 +30,8 @@ #define THREAD_PER_BLOCK 32 #define CUDA_WARP_SIZE 32 #define TASK_PER_THREAD 2 -#define NB_STREAMS 8 +#define NB_STREAMS 4 +#define NB_PIPELINE_BLOCKS 4 #define CUDA_NB_IOV 1024*20 #define CUDA_IOV_LEN 1024*1204 #define CUDA_MAX_NB_BLOCKS 1024 @@ -51,8 +52,8 @@ typedef struct { - cudaStream_t opal_cuda_stream[NB_STREAMS]; - uint32_t current_stream_id; + cudaStream_t ddt_cuda_stream[NB_STREAMS]; + int32_t current_stream_id; } ddt_cuda_stream_t; typedef struct { @@ -79,7 +80,6 @@ typedef struct { ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_d; ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; cudaStream_t *cuda_stream; - int32_t cuda_stream_id; cudaEvent_t cuda_event; } ddt_cuda_iov_pipeline_block_t; @@ -104,7 +104,7 @@ typedef struct { size_t buffer_free_size; size_t buffer_used_size; ddt_cuda_stream_t *cuda_streams; - ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block[NB_STREAMS]; + 
ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block[NB_PIPELINE_BLOCKS]; cudaEvent_t memcpy_event; } ddt_cuda_device_t; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 81e7f7c4dcd..929d1f7de88 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -412,9 +412,9 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di if (nb_tasks_per_block >= 4) { WARP_SIZE = 32; } else if (nb_tasks_per_block == 1) { - WARP_SIZE = blockDim.x; + WARP_SIZE = 32;//blockDim.x; } else { - WARP_SIZE = 64; + WARP_SIZE = 32; } nb_warp_per_block = blockDim.x / WARP_SIZE; // nb_warp_per_block = 1; diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu index 534c3372d60..882c26a72b4 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -193,7 +193,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_vector(opal_convertor_t* pCon total_packed += iov[iov_count].iov_len; // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]); } #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); @@ -461,9 +461,9 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; // printf("extent %ld, size %ld, count %ld\n", _loop->extent, _end_loop->size, _copy_loops); #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, 
_end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); #else - pack_contiguous_loop_cuda_kernel_global<<<32, 8*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + pack_contiguous_loop_cuda_kernel_global<<<32, 8*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -473,7 +473,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -525,9 +525,9 @@ void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, pipeline_blocks = 4; cuda_streams->current_stream_id = 0; _copy_loops_per_pipeline = (_copy_loops + pipeline_blocks -1 )/ pipeline_blocks; - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops_per_pipeline, _end_loop->size, _loop->extent, _source, _destination_dev); + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops_per_pipeline, _end_loop->size, _loop->extent, _source, _destination_dev); for (i = 1; i <= pipeline_blocks; i++) { - cudaMemcpyAsync(_destination_host, _destination_dev, _end_loop->size * _copy_loops_per_pipeline, cudaMemcpyDeviceToHost, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); + cudaMemcpyAsync(_destination_host, _destination_dev, _end_loop->size * _copy_loops_per_pipeline, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); cuda_streams->current_stream_id ++; cuda_streams->current_stream_id = 
cuda_streams->current_stream_id % NB_STREAMS; _source += _loop->extent * _copy_loops_per_pipeline; @@ -536,9 +536,9 @@ void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, if (i == pipeline_blocks) { _copy_loops_per_pipeline = _copy_loops - _copy_loops_per_pipeline * (pipeline_blocks - 1); } - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops_per_pipeline, _end_loop->size, _loop->extent, _source, _destination_dev); + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops_per_pipeline, _end_loop->size, _loop->extent, _source, _destination_dev); } - cudaMemcpyAsync(_destination_host, _destination_dev, _end_loop->size * _copy_loops_per_pipeline, cudaMemcpyDeviceToHost, cuda_streams->opal_cuda_stream[cuda_streams->current_stream_id]); + cudaMemcpyAsync(_destination_host, _destination_dev, _end_loop->size * _copy_loops_per_pipeline, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -584,7 +584,7 @@ void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, GET_TIME(start); #endif - cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->opal_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[0]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -593,7 +593,7 @@ void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + 
cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -638,9 +638,9 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, printf("can not get dev mem, %s\n", cuda_err); } #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); + cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); #else - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -650,7 +650,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1055,16 +1055,14 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto return OPAL_ERROR; } - cuda_streams->current_stream_id = 0; + // cuda_streams->current_stream_id = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); source_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; destination_base = destination; - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + 
cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { @@ -1075,10 +1073,11 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto ddt_iov_end_pos = ddt_iov_count; } cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + cuda_err = cudaEventSynchronize(cuda_iov_pipeline_block->cuda_event); opal_cuda_check_error(cuda_err); #if defined(OPAL_DATATYPE_CUDA_TIMING) @@ -1090,7 +1089,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); #endif cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); @@ -1113,9 +1112,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_converto } - for (i = 0; i < NB_STREAMS; i++) { - 
cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); return OPAL_SUCCESS; } @@ -1145,10 +1142,10 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); - cuda_streams->current_stream_id = 0; + // cuda_streams->current_stream_id = 0; destination_base = destination; thread_per_block = CUDA_WARP_SIZE * 8; - nb_blocks = 4; + nb_blocks = 16; source_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start to cache iov */ @@ -1182,6 +1179,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* cuda_iov_end_pos = cached_cuda_iov_count; nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; + cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; convertor_current_count = pConvertor->current_count; @@ -1208,7 +1206,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack kernel launched src_base %p, 
dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); @@ -1219,7 +1217,7 @@ int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor->current_cuda_iov_pos += nb_blocks_used; pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -1265,7 +1263,7 @@ void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, // DBGPRINT("num_blocks %d, thread %d\n", nb_blocks, tasks_per_block); // DBGPRINT( "GPU pack 1. memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); - pack_contiguous_loop_cuda_kernel_global<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); + pack_contiguous_loop_cuda_kernel_global<<ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); cuda_streams->current_stream_id ++; cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index 4774abf5f38..fb533d4cfc8 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -171,9 +171,9 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (nb_tasks_per_block >= 4) { WARP_SIZE = 32; } else if (nb_tasks_per_block == 1) { - WARP_SIZE = blockDim.x; + WARP_SIZE = 32;//blockDim.x; } else { - WARP_SIZE = 64; + WARP_SIZE = 32; } nb_warp_per_block = blockDim.x / WARP_SIZE; // printf("cuda_iov_count %d, 
ddt_extent %d, current_count %d, ddt_size %d\n", cuda_iov_count, ddt_extent, current_count, ddt_size); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 7e30f114d06..703e52280b5 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -179,7 +179,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* p } complete_conversion: for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]); } *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ @@ -732,7 +732,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver return OPAL_ERROR; } - cuda_streams->current_stream_id = 0; + // cuda_streams->current_stream_id = 0; thread_per_block = CUDA_WARP_SIZE * 5; nb_blocks = 256; source_base = source; @@ -741,7 +741,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]); } while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { @@ -753,10 +753,11 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver ddt_iov_end_pos = ddt_iov_count; } cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; 
cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; - cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + cuda_err = cudaEventSynchronize(cuda_iov_pipeline_block->cuda_event); opal_cuda_check_error(cuda_err); @@ -769,7 +770,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); #endif cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); @@ -790,9 +791,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_conver } } - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); return OPAL_SUCCESS; } @@ -829,10 +828,10 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ GET_TIME(start); #endif - cuda_streams->current_stream_id = 0; + // cuda_streams->current_stream_id = 0; source_base = source; thread_per_block = CUDA_WARP_SIZE * 8; - nb_blocks = 2; + nb_blocks = 64; destination_base = (unsigned char*)pConvertor->pBaseBuf; /* cuda iov is not cached, start to cache iov */ @@ -864,6 +863,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ cuda_iov_end_pos 
= cached_cuda_iov_count; nb_blocks_used = 0; cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; + cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; convertor_current_count = pConvertor->current_count; @@ -905,7 +905,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); - DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); #endif opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); @@ -915,7 +915,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_ #endif opal_generic_simple_unpack_cuda_iov_cached_kernel<<>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -955,9 +955,9 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // tasks_per_block = THREAD_PER_BLOCK * 
TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); #else - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -967,7 +967,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -1002,7 +1002,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->opal_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->ddt_cuda_stream[0]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -1011,7 +1011,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - 
cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1057,9 +1057,9 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, printf("can not get dev mem, %s\n", cuda_err); } #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->opal_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); #else - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->opal_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -1069,7 +1069,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->opal_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); // cudaHostUnregister(_source); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1115,7 +1115,7 @@ void unpack_predefined_data_cuda( dt_elem_desc_t* ELEM, // DBGPRINT("num_blocks %d, thread %d\n", nb_blocks, tasks_per_block); // DBGPRINT( "GPU pack 1. 
memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); - unpack_contiguous_loop_cuda_kernel_global<<opal_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); + unpack_contiguous_loop_cuda_kernel_global<<ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); cuda_streams->current_stream_id ++; cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index c65e635a506..2aa73454724 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -247,6 +247,8 @@ int32_t opal_cuda_kernel_support_init(void) OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy_async ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy ); OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cached_cuda_iov_fini ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_set_cuda_stream ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_get_cuda_stream ); if (OPAL_SUCCESS != cuda_kernel_table.opal_ddt_cuda_kernel_init_p()) { return OPAL_ERROR; @@ -273,6 +275,8 @@ int32_t opal_cuda_kernel_support_fini(void) cuda_kernel_table.opal_ddt_cuda_d2dcpy_async_p = NULL; cuda_kernel_table.opal_ddt_cuda_d2dcpy_p = NULL; cuda_kernel_table.opal_ddt_cached_cuda_iov_fini_p = NULL; + cuda_kernel_table.opal_ddt_cuda_set_cuda_stream_p = NULL; + cuda_kernel_table.opal_ddt_cuda_get_cuda_stream_p = NULL; dlclose(opal_datatype_cuda_kernel_handle); opal_datatype_cuda_kernel_handle = NULL; @@ -372,3 +376,22 @@ void opal_cached_cuda_iov_fini(void 
*cached_cuda_iov) } } +void opal_cuda_set_cuda_stream(void) +{ + if (cuda_kernel_table.opal_ddt_cuda_set_cuda_stream_p != NULL) { + cuda_kernel_table.opal_ddt_cuda_set_cuda_stream_p(); + } else { + opal_output(0, "opal_ddt_cuda_set_cuda_stream function pointer is NULL\n"); + } +} + +int32_t opal_cuda_get_cuda_stream(void) +{ + if (cuda_kernel_table.opal_ddt_cuda_get_cuda_stream_p != NULL) { + return cuda_kernel_table.opal_ddt_cuda_get_cuda_stream_p(); + } else { + opal_output(0, "opal_ddt_cuda_get_cuda_stream function pointer is NULL\n"); + return -2; + } +} + diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index 7b613470ab0..cb82e93add3 100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -29,6 +29,8 @@ struct opal_datatype_cuda_kernel_function_table { void (*opal_ddt_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); void (*opal_ddt_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); void (*opal_ddt_cached_cuda_iov_fini_p)(void *cached_cuda_iov); + void (*opal_ddt_cuda_set_cuda_stream_p)(void); + int32_t (*opal_ddt_cuda_get_cuda_stream_p)(void); int32_t (*opal_ddt_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t (*opal_ddt_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); @@ -57,5 +59,7 @@ void opal_cuda_d2dcpy(void* dst, const void* src, size_t count); void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count); void* opal_cached_cuda_iov_init(void); void opal_cached_cuda_iov_fini(void *cached_cuda_iov); +void opal_cuda_set_cuda_stream(void); +int32_t opal_cuda_get_cuda_stream(void); #endif diff --git a/opal/mca/btl/smcuda/btl_smcuda.c 
b/opal/mca/btl/smcuda/btl_smcuda.c index eeafea57fb6..7e1441fd8e1 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -1186,11 +1186,12 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, struct iovec iov; uint32_t iov_count = 1; size_t max_data; + opal_cuda_set_cuda_stream(); if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && remote_device != local_device) { unpack_convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(size, 0); opal_cuda_d2dcpy_async(unpack_convertor->gpu_buffer_ptr, remote_memory_address, size); iov.iov_base = unpack_convertor->gpu_buffer_ptr; - opal_output(0, "start D2D copy src %p, dst %p, size %lu\n", remote_memory_address, unpack_convertor->gpu_buffer_ptr, size); + opal_output(0, "start D2D copy src %p, dst %p, size %lu, stream id %d\n", remote_memory_address, unpack_convertor->gpu_buffer_ptr, size, opal_cuda_get_cuda_stream()); } else { iov.iov_base = unpack_convertor->gpu_buffer_ptr; } diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index c4a299ef84a..f8bcb5eb865 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -895,17 +895,19 @@ static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, convertor->flags |= CONVERTOR_CUDA; unsigned char *local_address = my_cuda_dt_clone->current_unpack_convertor_pBaseBuf; remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; - opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld\n", local_address, remote_address, packed_size); + opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld, stream id %d\n", local_address, remote_address, packed_size, opal_cuda_get_cuda_stream()); + opal_cuda_set_cuda_stream(); mca_common_cuda_memp2pcpy(local_address, (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); 
my_cuda_dt_clone->current_unpack_convertor_pBaseBuf += packed_size; } else { /* unpack */ convertor->flags |= CONVERTOR_CUDA; + opal_cuda_set_cuda_stream(); if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(packed_size, 0); remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; opal_cuda_d2dcpy_async(convertor->gpu_buffer_ptr, remote_address, packed_size); iov.iov_base = convertor->gpu_buffer_ptr; - opal_output(0, "unpack, start D2D copy src %p, dst %p, size %lu\n", remote_address, convertor->gpu_buffer_ptr, packed_size); + opal_output(0, "unpack, start D2D copy src %p, dst %p, size %lu, stream id %d\n", remote_address, convertor->gpu_buffer_ptr, packed_size, opal_cuda_get_cuda_stream()); } else { iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; } @@ -968,6 +970,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, struct iovec iov; iov.iov_base = convertor->gpu_buffer_ptr + seq * mca_btl_smcuda_component.cuda_ddt_pipeline_size; iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; + opal_cuda_set_cuda_stream(); rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); packed_size = max_data; send_msg.packed_size = packed_size; @@ -985,6 +988,7 @@ static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; seq = 0; while (rv_dt != 1 && convertor->gpu_buffer_size > 0) { + opal_cuda_set_cuda_stream(); rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); iov.iov_base = (void*)((unsigned char*)iov.iov_base + mca_btl_smcuda_component.cuda_ddt_pipeline_size); convertor->gpu_buffer_size -= mca_btl_smcuda_component.cuda_ddt_pipeline_size; @@ -1041,6 +1045,7 @@ static void btl_smcuda_datatype_put(mca_btl_base_module_t* btl, size_t max_data = 0; iov.iov_len = convertor->local_size; 
iov.iov_base = convertor->gpu_buffer_ptr; + opal_cuda_set_cuda_stream(); rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); assert(rv_dt == 1); send_msg.lindex = lindex; diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index 8b3c7ce7981..afc33e1075e 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1271,12 +1271,12 @@ int main( int argc, char* argv[] ) printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); int mat_size = 500; - for (mat_size = 1000; mat_size <= 4000; mat_size +=1000) { + for (mat_size = 4000; mat_size <= 4000; mat_size +=1000) { pdt = upper_matrix(mat_size); printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 5; i++) { - // local_copy_with_convertor(pdt, 1, 200000000, mat_size); + local_copy_with_convertor(pdt, 1, 200000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -1345,7 +1345,7 @@ int main( int argc, char* argv[] ) pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 4; i++) { - vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2); + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); } } From e6c765e41d502a11b754e803aabc96ca29c6de18 Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Fri, 26 Feb 2016 13:41:42 -0800 Subject: [PATCH 188/190] new vector kernel --- .../cuda/opal_datatype_pack_cuda_kernel.cu | 213 ++++++++++++++---- .../cuda/opal_datatype_pack_cuda_wrapper.cu | 20 +- .../cuda/opal_datatype_unpack_cuda_wrapper.cu | 20 +- opal/datatype/opal_datatype_pack.c | 2 +- opal/datatype/opal_datatype_unpack.c | 2 +- test/datatype/ddt_benchmark.c | 6 +- 6 files changed, 186 insertions(+), 77 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu 
b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 929d1f7de88..0f887753bf5 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -148,62 +148,175 @@ __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, #else -#define SEG_ADD(s) \ - l += s; \ - while (l >= lines) { \ - l -= lines; \ - c += width; \ - } - -__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines, - size_t nb_size, - OPAL_PTRDIFF_TYPE nb_extent, - unsigned char * b_source, - unsigned char * b_destination ) +__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ) { - uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; - uint32_t num_threads = gridDim.x * blockDim.x; - - //size_t lines = (size_t)lines; - size_t size = nb_size / 8; - size_t extent = nb_extent / 8; - uint64_t * source = (uint64_t *) b_source; - uint64_t *destination = (uint64_t *) b_destination; - uint64_t val[KERNEL_UNROLL]; + uint32_t i, u, tid, num_threads, warp_id, tid_per_warp, nb_warps, nb_warps_x, nb_warps_y, pos_x, pos_y, size_last_y, size_last_x; + uint32_t size_nb, extent_nb; + uint64_t *_source_tmp, *_destination_tmp, *source_64, *destination_64, *_source_left_tmp, *_destination_left_tmp; + uint64_t val[UNROLL_16]; - int col = 0; - for (int width = 32; width > 0 && col < size; width >>= 1) { - while (size-col >= width) { - const int warp_id = tid / width; - const int warp_tid = tid & (width-1); - const int warp_nb = num_threads / width; - const int c = col + warp_tid; - int l = warp_id * KERNEL_UNROLL; - uint64_t *src = source + c; - uint64_t *dst = destination + c; - for (int b=0; b= lines) { \ +// l -= lines; \ +// c += width; \ +// } +// +// __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines, +// size_t nb_size, +// OPAL_PTRDIFF_TYPE nb_extent, +// unsigned 
char * b_source, +// unsigned char * b_destination ) +// { +// uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; +// uint32_t num_threads = gridDim.x * blockDim.x; +// +// //size_t lines = (size_t)lines; +// size_t size = nb_size / 8; +// size_t extent = nb_extent / 8; +// uint64_t * source = (uint64_t *) b_source; +// uint64_t *destination = (uint64_t *) b_destination; +// uint64_t val[KERNEL_UNROLL]; +// +// int col = 0; +// for (int width = 32; width > 0 && col < size; width >>= 1) { +// while (size-col >= width) { +// const int warp_id = tid / width; +// const int warp_tid = tid & (width-1); +// const int warp_nb = num_threads / width; +// const int c = col + warp_tid; +// int l = warp_id * KERNEL_UNROLL; +// uint64_t *src = source + c; +// uint64_t *dst = destination + c; +// for (int b=0; bddt_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif @@ -461,9 +459,9 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; // printf("extent %ld, size %ld, count %ld\n", _loop->extent, _end_loop->size, _copy_loops); #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #else - pack_contiguous_loop_cuda_kernel_global<<<32, 8*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + pack_contiguous_loop_cuda_kernel_global<<<16, 8*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, 
_source, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -473,7 +471,7 @@ void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -584,7 +582,7 @@ void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, GET_TIME(start); #endif - cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -593,7 +591,7 @@ void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -638,9 +636,9 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, printf("can not get dev mem, %s\n", cuda_err); } #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); + cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #else - pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, 
cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -650,7 +648,7 @@ void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu index 703e52280b5..9be53d2d5a7 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -178,9 +178,7 @@ int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* p total_unpacked += iov[iov_count].iov_len; } complete_conversion: - for (i = 0; i < NB_STREAMS; i++) { - cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]); - } + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); *max_data = total_unpacked; pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ *out_size = iov_count; @@ -955,9 +953,9 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _loop->extent, _source, 
_end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #else - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -967,7 +965,7 @@ void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); @@ -1002,7 +1000,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME(start); #endif - cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->ddt_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; @@ -1011,7 +1009,7 @@ void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); @@ -1057,9 +1055,9 @@ void 
unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, printf("can not get dev mem, %s\n", cuda_err); } #if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL - cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[0]); + cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); #else - unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[0]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); #endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ #if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) @@ -1069,7 +1067,7 @@ void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, *(COUNT) -= _copy_loops; #endif - cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[0]); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); // cudaHostUnregister(_source); #if defined(OPAL_DATATYPE_CUDA_TIMING) GET_TIME( end ); diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index c8985db7913..1ae08565b73 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -416,7 +416,7 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); - return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); + //return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return 
opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index 5f51b3f828b..815f7b1e4bf 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -610,7 +610,7 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); - return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); + //return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c index afc33e1075e..de3f43a8759 100644 --- a/test/datatype/ddt_benchmark.c +++ b/test/datatype/ddt_benchmark.c @@ -1276,7 +1276,7 @@ int main( int argc, char* argv[] ) printf("----matrix size %d-----\n", mat_size); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 1; i <= 5; i++) { - local_copy_with_convertor(pdt, 1, 200000000, mat_size); + // local_copy_with_convertor(pdt, 1, 200000000, mat_size); } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); @@ -1339,13 +1339,13 @@ int main( int argc, char* argv[] ) } - for (blk_len = 4000; blk_len <= 4000; blk_len += 2000) { + for (blk_len = 1000; blk_len <= 4000; blk_len += 2000) { printf( ">>--------------------------------------------<<\n" ); printf( "Vector data-type (1024 times %d double stride 512)\n", blk_len ); pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); if( outputFlags & CHECK_PACK_UNPACK ) { for (i = 0; i < 4; i++) { - // vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2); + vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2); // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, 
blk_len+128); } } From 2b0048f470b4d7871ba64b028b6bca634245e86b Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Fri, 26 Feb 2016 15:46:50 -0800 Subject: [PATCH 189/190] fix a if CUDA_41 error --- ompi/mca/pml/ob1/pml_ob1_cuda.c | 1 - opal/datatype/opal_datatype_pack.c | 2 +- opal/datatype/opal_datatype_unpack.c | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 68e97d77c4e..020a9f21bcd 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -70,7 +70,6 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, size_t size) { int rc; int32_t local_device = 0; -#if OPAL_CUDA_SUPPORT_41 sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 1ae08565b73..c8985db7913 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -416,7 +416,7 @@ opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); - //return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); + return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index 815f7b1e4bf..5f51b3f828b 100644 --- a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -610,7 +610,7 @@ opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, pos_desc = pStack->index; pElem = &(description[pos_desc]); - //return opal_generic_simple_unpack_function_cuda_iov( 
pConvertor, iov, out_size, max_data); + return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); } else { From d22e54aee75029ea5330ef1475c4cb05c40968dc Mon Sep 17 00:00:00 2001 From: Wei Wu Date: Tue, 1 Mar 2016 11:04:31 -0800 Subject: [PATCH 190/190] clean up a if --- opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu | 10 ++-------- opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu | 10 ++-------- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu index 0f887753bf5..10fb2356cad 100644 --- a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -522,13 +522,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di if (blockIdx.x < (nb_blocks_used % gridDim.x)) { nb_tasks_per_block ++; } - if (nb_tasks_per_block >= 4) { - WARP_SIZE = 32; - } else if (nb_tasks_per_block == 1) { - WARP_SIZE = 32;//blockDim.x; - } else { - WARP_SIZE = 32; - } + WARP_SIZE = 32; nb_warp_per_block = blockDim.x / WARP_SIZE; // nb_warp_per_block = 1; // if (nb_tasks_per_block == ) @@ -563,7 +557,7 @@ __global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_di alignment = ALIGNMENT_CHAR; } - // alignment = ALIGNMENT_DOUBLE; + //alignment = ALIGNMENT_DOUBLE; copy_count = _nb_bytes / alignment; /* if (threadIdx.x == 0 && nb_tasks != 0) { diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu index fb533d4cfc8..38365013994 100644 --- a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -168,13 +168,7 @@ __global__ void 
opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ if (blockIdx.x < nb_blocks_used % gridDim.x) { nb_tasks_per_block ++; } - if (nb_tasks_per_block >= 4) { - WARP_SIZE = 32; - } else if (nb_tasks_per_block == 1) { - WARP_SIZE = 32;//blockDim.x; - } else { - WARP_SIZE = 32; - } + WARP_SIZE = 32; nb_warp_per_block = blockDim.x / WARP_SIZE; // printf("cuda_iov_count %d, ddt_extent %d, current_count %d, ddt_size %d\n", cuda_iov_count, ddt_extent, current_count, ddt_size); } @@ -214,7 +208,7 @@ __global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_ } else { alignment = ALIGNMENT_CHAR; } - + //alignment = ALIGNMENT_DOUBLE; copy_count = _nb_bytes / alignment; /* if (threadIdx.x == 0 && nb_tasks != 0) {