diff --git a/configure.ac b/configure.ac index 7bb7cbe8eb7..3e87ec6276e 100644 --- a/configure.ac +++ b/configure.ac @@ -1357,6 +1357,10 @@ m4_ifdef([project_oshmem], opal_show_subtitle "Final output" +if test "$OPAL_cuda_support" != "0"; then + AC_CONFIG_FILES([opal/datatype/cuda/Makefile]) +fi + AC_CONFIG_FILES([ Makefile diff --git a/ompi/datatype/ompi_datatype.h b/ompi/datatype/ompi_datatype.h index 17e1632e07d..9ff0719867c 100644 --- a/ompi/datatype/ompi_datatype.h +++ b/ompi/datatype/ompi_datatype.h @@ -94,7 +94,7 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_datatype_t); /* Using set constant for padding of the DATATYPE handles because the size of * base structure is very close to being the same no matter the bitness. */ -#define PREDEFINED_DATATYPE_PAD (512) +#define PREDEFINED_DATATYPE_PAD (1024) struct ompi_predefined_datatype_t { struct ompi_datatype_t dt; diff --git a/ompi/mca/pml/ob1/pml_ob1_cuda.c b/ompi/mca/pml/ob1/pml_ob1_cuda.c index 12ad396363d..020a9f21bcd 100644 --- a/ompi/mca/pml/ob1/pml_ob1_cuda.c +++ b/ompi/mca/pml/ob1/pml_ob1_cuda.c @@ -37,11 +37,24 @@ #include "ompi/mca/bml/base/base.h" #include "ompi/memchecker.h" +#include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/mca/btl/smcuda/btl_smcuda.h" + +#define CUDA_DDT_WITH_RDMA 1 + size_t mca_pml_ob1_rdma_cuda_btls( mca_bml_base_endpoint_t* bml_endpoint, unsigned char* base, size_t size, mca_pml_ob1_com_btl_t* rdma_btls); + +int mca_pml_ob1_rdma_cuda_btl_register_data( + mca_pml_ob1_com_btl_t* rdma_btls, + uint32_t num_btls_used, + struct opal_convertor_t *pack_convertor, uint8_t pack_required, int32_t gpu_device); + +size_t mca_pml_ob1_rdma_cuda_avail(mca_bml_base_endpoint_t* bml_endpoint); int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl); @@ -56,16 +69,18 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, mca_bml_base_btl_t* bml_btl, size_t size) { int rc; -#if OPAL_CUDA_GDR_SUPPORT 
- /* With some BTLs, switch to RNDV from RGET at large messages */ - if ((sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA) && - (sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) { - return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); - } -#endif /* OPAL_CUDA_GDR_SUPPORT */ + int32_t local_device = 0; sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; + struct opal_convertor_t *convertor = &(sendreq->req_send.req_base.req_convertor); if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == false) { +#if OPAL_CUDA_GDR_SUPPORT + /* With some BTLs, switch to RNDV from RGET at large messages */ + if ((sendreq->req_send.req_bytes_packed > (bml_btl->btl->btl_cuda_rdma_limit - sizeof(mca_pml_ob1_hdr_t)))) { + sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; + return mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); + } +#endif /* OPAL_CUDA_GDR_SUPPORT */ unsigned char *base; opal_convertor_get_current_pointer( &sendreq->req_send.req_base.req_convertor, (void**)&base ); /* Set flag back */ @@ -75,6 +90,13 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, base, sendreq->req_send.req_bytes_packed, sendreq->req_rdma))) { + + rc = mca_common_cuda_get_device(&local_device); + if (rc != 0) { + opal_output(0, "Failed to get the GPU device ID, rc= %d\n", rc); + return rc; + } + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, 0, local_device); rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, sendreq->req_send.req_bytes_packed); if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { @@ -92,7 +114,48 @@ int mca_pml_ob1_send_request_start_cuda(mca_pml_ob1_send_request_t* sendreq, /* Do not send anything with first rendezvous message as copying GPU * memory into RNDV message is expensive. 
*/ sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; - rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); + if ((mca_pml_ob1_rdma_cuda_avail(sendreq->req_endpoint) != 0) && + (opal_datatype_cuda_kernel_support == 1) && + (bml_btl->btl->btl_cuda_ddt_allow_rdma == 1)) { + unsigned char *base; + size_t buffer_size = 0; + if (convertor->local_size > bml_btl->btl->btl_cuda_ddt_pipeline_size) { + buffer_size = bml_btl->btl->btl_cuda_ddt_pipeline_size * bml_btl->btl->btl_cuda_ddt_pipeline_depth; + } else { + buffer_size = convertor->local_size; + } + base = opal_cuda_malloc_gpu_buffer(buffer_size, 0); + convertor->gpu_buffer_ptr = base; + convertor->gpu_buffer_size = buffer_size; + sendreq->req_send.req_bytes_packed = convertor->local_size; + opal_output(0, "malloc GPU BUFFER %p for pack, local size %lu, pipeline size %lu, depth %d\n", base, convertor->local_size, bml_btl->btl->btl_cuda_ddt_pipeline_size, bml_btl->btl->btl_cuda_ddt_pipeline_depth); + if( 0 != (sendreq->req_rdma_cnt = (uint32_t)mca_pml_ob1_rdma_cuda_btls( + sendreq->req_endpoint, + base, + sendreq->req_send.req_bytes_packed, + sendreq->req_rdma))) { + + rc = mca_common_cuda_get_device(&local_device); + if (rc != 0) { + opal_output(0, "Failed to get the GPU device ID, rc=%d\n", rc); + return rc; + } + mca_pml_ob1_rdma_cuda_btl_register_data(sendreq->req_rdma, sendreq->req_rdma_cnt, convertor, 1, local_device); + + rc = mca_pml_ob1_send_request_start_rdma(sendreq, bml_btl, + sendreq->req_send.req_bytes_packed); + + if( OPAL_UNLIKELY(OMPI_SUCCESS != rc) ) { + mca_pml_ob1_free_rdma_resources(sendreq); + } + } else { + rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); + } + + + } else { + rc = mca_pml_ob1_send_request_start_rndv(sendreq, bml_btl, 0, 0); + } } return rc; } @@ -152,6 +215,61 @@ size_t mca_pml_ob1_rdma_cuda_btls( return num_btls_used; } +int mca_pml_ob1_rdma_cuda_btl_register_data( + mca_pml_ob1_com_btl_t* rdma_btls, + uint32_t num_btls_used, + struct 
opal_convertor_t *pack_convertor, uint8_t pack_required, int32_t gpu_device) +{ + uint32_t i; + for (i = 0; i < num_btls_used; i++) { + mca_btl_base_registration_handle_t *handle = rdma_btls[i].btl_reg; + mca_mpool_common_cuda_reg_t *cuda_reg = (mca_mpool_common_cuda_reg_t *) + ((intptr_t) handle - offsetof (mca_mpool_common_cuda_reg_t, data)); + // printf("base %p\n", cuda_reg->base.base); + // for (j = 0; j < MAX_IPC_EVENT_HANDLE; j++) { + // mca_common_cuda_geteventhandle(&convertor->pipeline_event[j], j, (mca_mpool_base_registration_t *)cuda_reg); + // // printf("event %lu, j %d\n", convertor->pipeline_event[j], j); + // } + cuda_reg->data.pack_required = pack_required; + cuda_reg->data.gpu_device = gpu_device; + cuda_reg->data.pack_convertor = pack_convertor; + + } + return 0; +} + +size_t mca_pml_ob1_rdma_cuda_avail(mca_bml_base_endpoint_t* bml_endpoint) +{ + int num_btls = mca_bml_base_btl_array_get_size(&bml_endpoint->btl_send); + double weight_total = 0; + int num_btls_used = 0, n; + + /* shortcut when there are no rdma capable btls */ + if(num_btls == 0) { + return 0; + } + + /* check to see if memory is registered */ + for(n = 0; n < num_btls && num_btls_used < mca_pml_ob1.max_rdma_per_request; + n++) { + mca_bml_base_btl_t* bml_btl = + mca_bml_base_btl_array_get_index(&bml_endpoint->btl_send, n); + + if (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET) { + weight_total += bml_btl->btl_weight; + num_btls_used++; + } + } + + /* if we don't use leave_pinned and all BTLs that already have this memory + * * registered amount to less then half of available bandwidth - fall back to + * * pipeline protocol */ + if(0 == num_btls_used || (!mca_pml_ob1.leave_pinned && weight_total < 0.5)) + return 0; + + return num_btls_used; +} + int mca_pml_ob1_cuda_need_buffers(void * rreq, mca_btl_base_module_t* btl) { diff --git a/ompi/mca/pml/ob1/pml_ob1_recvreq.c b/ompi/mca/pml/ob1/pml_ob1_recvreq.c index b7646890d03..15cfe8560ba 100644 --- a/ompi/mca/pml/ob1/pml_ob1_recvreq.c 
+++ b/ompi/mca/pml/ob1/pml_ob1_recvreq.c @@ -649,8 +649,11 @@ void mca_pml_ob1_recv_request_progress_rget( mca_pml_ob1_recv_request_t* recvreq if (mca_pml_ob1_cuda_need_buffers(recvreq, btl)) #endif /* OPAL_CUDA_SUPPORT */ { - mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0); - return; + /* need more careful check here */ + if (! (recvreq->req_recv.req_base.req_convertor.flags & CONVERTOR_CUDA)) { + mca_pml_ob1_recv_request_ack(recvreq, &hdr->hdr_rndv, 0); + return; + } } } diff --git a/ompi/mca/pml/ob1/pml_ob1_sendreq.c b/ompi/mca/pml/ob1/pml_ob1_sendreq.c index f1f2744b2e3..78b7188cdbb 100644 --- a/ompi/mca/pml/ob1/pml_ob1_sendreq.c +++ b/ompi/mca/pml/ob1/pml_ob1_sendreq.c @@ -675,10 +675,26 @@ int mca_pml_ob1_send_request_start_rdma( mca_pml_ob1_send_request_t* sendreq, MCA_PML_OB1_HDR_FLAGS_PIN); } +#if OPAL_CUDA_SUPPORT + if ( (sendreq->req_send.req_base.req_convertor.flags & CONVERTOR_CUDA)) { + sendreq->req_send.req_base.req_convertor.flags &= ~CONVERTOR_CUDA; + if (opal_convertor_need_buffers(&sendreq->req_send.req_base.req_convertor) == true) { + data_ptr = sendreq->req_send.req_base.req_convertor.gpu_buffer_ptr; + printf("START RMDA data_ptr %p\n", data_ptr); + } else { + opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr); + } + /* Set flag back */ + sendreq->req_send.req_base.req_convertor.flags |= CONVERTOR_CUDA; + } else { + opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr); + } +#else /* at this time ob1 does not support non-contiguous gets. 
the convertor represents a * contiguous block of memory */ opal_convertor_get_current_pointer (&sendreq->req_send.req_base.req_convertor, &data_ptr); - +#endif + local_handle = sendreq->req_rdma[0].btl_reg; /* allocate an rdma fragment to keep track of the request size for use in the fin message */ diff --git a/opal/datatype/Makefile.am b/opal/datatype/Makefile.am index 6002a739f20..ca64cf29237 100644 --- a/opal/datatype/Makefile.am +++ b/opal/datatype/Makefile.am @@ -63,7 +63,7 @@ libdatatype_la_SOURCES = \ opal_datatype_pack.c \ opal_datatype_position.c \ opal_datatype_resize.c \ - opal_datatype_unpack.c + opal_datatype_unpack.c libdatatype_la_LIBADD = libdatatype_reliable.la diff --git a/opal/datatype/cuda/Makefile.in b/opal/datatype/cuda/Makefile.in new file mode 100644 index 00000000000..ea0af09c6d0 --- /dev/null +++ b/opal/datatype/cuda/Makefile.in @@ -0,0 +1,60 @@ +@SET_MAKE@ + +AM_CPPFLAGS = @common_cuda_CPPFLAGS@ +srcdir = @srcdir@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ +VPATH = @srcdir@ + +NVCC = nvcc +ARCH = @AR@ +ARCHFLAGS = cr +STLIB ?= opal_datatype_cuda_kernel.a +DYLIB ?= opal_datatype_cuda_kernel.so +EXTLIB = -L$(top_builddir)/opal/datatype/.libs -ldatatype -L$(top_builddir)/opal/.libs -lopen-pal -L/usr/local/cuda/lib -lcuda +subdir = opal/datatype/cuda + +CC = nvcc +CFLAGS = -I$(top_builddir)/opal/include -I$(top_srcdir)/opal/include -I$(top_builddir) -I$(top_srcdir) -gencode arch=compute_35,code=sm_35 --compiler-options '-fPIC @CFLAGS@' +LDFLAGS = -shared --compiler-options '-fPIC @LDFLAGS@' + +SRC := \ + opal_datatype_cuda.cu \ + opal_datatype_pack_cuda_kernel.cu \ + opal_datatype_pack_cuda_wrapper.cu \ + opal_datatype_unpack_cuda_kernel.cu \ + opal_datatype_unpack_cuda_wrapper.cu + +OBJ := $(SRC:.cu=.o) + +.PHONY: all clean cleanall + +all: Makefile $(STLIB) $(DYLIB) + +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' 
in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(STLIB): $(OBJ) + $(ARCH) $(ARCHFLAGS) $@ $(OBJ) + @RANLIB@ $@ + +$(DYLIB): $(OBJ) + $(NVCC) $(LDFLAGS) $(EXTLIB) -o $(DYLIB) $(OBJ) + +%.o: %.cu + $(NVCC) $(CFLAGS) $(EXTLIB) $(INC) -c $< -o $@ + +install: $(DYLIB) + cp -f $(DYLIB) @OMPI_WRAPPER_LIBDIR@/ + +clean: + rm -f $(OBJ) + +cleanall: clean + rm -f $(STLIB) $(DYLIB) diff --git a/opal/datatype/cuda/opal_datatype_cuda.cu b/opal/datatype/cuda/opal_datatype_cuda.cu new file mode 100644 index 00000000000..0a15fe3ab2b --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_cuda.cu @@ -0,0 +1,773 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + +#include "opal_datatype_cuda_internal.cuh" +#include "opal_datatype_cuda.cuh" +#include +#include +#include +#include +#include + + +ddt_cuda_list_t *cuda_free_list; +ddt_cuda_device_t *cuda_devices; +ddt_cuda_device_t *current_cuda_device; +struct iovec cuda_iov[CUDA_NB_IOV]; +uint32_t cuda_iov_count; +uint32_t cuda_iov_cache_enabled; + +//uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; + + +static inline ddt_cuda_buffer_t* obj_ddt_cuda_buffer_new() +{ + ddt_cuda_buffer_t *p = (ddt_cuda_buffer_t *)malloc(sizeof(ddt_cuda_buffer_t)); + p->next = NULL; + p->prev = NULL; + p->size = 0; + p->gpu_addr = NULL; + return p; +} + +static inline void obj_ddt_cuda_buffer_chop(ddt_cuda_buffer_t *p) +{ + p->next = NULL; + p->prev = NULL; +} + +static inline void obj_ddt_cuda_buffer_reset(ddt_cuda_buffer_t *p) +{ + p->size = 0; + p->gpu_addr = NULL; +} + +static ddt_cuda_list_t* init_cuda_free_list() +{ + ddt_cuda_list_t *list = NULL; + ddt_cuda_buffer_t *p, *prev; + int i; + list = (ddt_cuda_list_t *)malloc(sizeof(ddt_cuda_list_t)); 
+ p = obj_ddt_cuda_buffer_new(); + list->head = p; + prev = p; + for (i = 1; i < DT_CUDA_FREE_LIST_SIZE; i++) { + p = obj_ddt_cuda_buffer_new(); + prev->next = p; + p->prev = prev; + prev = p; + } + list->tail = p; + list->nb_elements = DT_CUDA_FREE_LIST_SIZE; + return list; +} + +static inline ddt_cuda_buffer_t* cuda_list_pop_tail(ddt_cuda_list_t *list) +{ + ddt_cuda_buffer_t *p = NULL; + p = list->tail; + if (p == NULL) { + return p; + } else { + list->nb_elements --; + if (list->head == p) { + list->head = NULL; + list->tail = NULL; + } else { + list->tail = p->prev; + p->prev->next = NULL; + obj_ddt_cuda_buffer_chop(p); + } + return p; + } +} + +static inline void cuda_list_push_head(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item) +{ + ddt_cuda_buffer_t * orig_head = list->head; + assert(item->next == NULL && item->prev == NULL); + list->head = item; + item->next = orig_head; + if (orig_head == NULL) { + list->tail = item; + } else { + orig_head->prev = item; + } + list->nb_elements ++; +} + +static inline void cuda_list_push_tail(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item) +{ + ddt_cuda_buffer_t * orig_tail = list->tail; + assert(item->next == NULL && item->prev == NULL); + list->tail = item; + item->prev = orig_tail; + if (orig_tail == NULL) { + list->head = item; + } else { + orig_tail->next = item; + } + list->nb_elements ++; +} + +static inline void cuda_list_delete(ddt_cuda_list_t *list, ddt_cuda_buffer_t *item) +{ + if (item->prev == NULL && item->next == NULL) { + list->head = NULL; + list->tail = NULL; + }else if (item->prev == NULL && item->next != NULL) { + list->head = item->next; + item->next->prev = NULL; + } else if (item->next == NULL && item->prev != NULL) { + list->tail = item->prev; + item->prev->next = NULL; + } else { + item->prev->next = item->next; + item->next->prev = item->prev; + } + list->nb_elements --; + obj_ddt_cuda_buffer_chop(item); +} + +static inline void cuda_list_insert_before(ddt_cuda_list_t *list, ddt_cuda_buffer_t 
*item, ddt_cuda_buffer_t *next) +{ + assert(item->next == NULL && item->prev == NULL); + item->next = next; + item->prev = next->prev; + next->prev = item; + if (list->head == next) { + list->head = item; + } + list->nb_elements ++; +} + +/** + * Collapse the list of free buffers by mergining consecutive buffers. As the property of this list + * is continously maintained, we only have to parse it up to the newest inserted elements. + */ +static inline void cuda_list_item_merge_by_addr(ddt_cuda_list_t *list, ddt_cuda_buffer_t* last) +{ + ddt_cuda_buffer_t *current = list->head; + ddt_cuda_buffer_t *next = NULL; + void* stop_addr = last->gpu_addr; + + while(1) { /* loop forever, the exit conditions are inside */ + if( NULL == (next = current->next) ) return; + if ((current->gpu_addr + current->size) == next->gpu_addr) { + current->size += next->size; + cuda_list_delete(list, next); + free(next); /* release the element, and try to continue merging */ + continue; + } + current = current->next; + if( NULL == current ) return; + if( current->gpu_addr > stop_addr ) return; + } +} + +void opal_cuda_output(int output_id, const char *format, ...) +{ + if (output_id >= 0 && output_id <= OPAL_DATATYPE_CUDA_DEBUG_LEVEL) { + va_list arglist; + fprintf( stderr, "[Debug %d]: ", output_id ); + va_start(arglist, format); + vfprintf(stderr, format, arglist); + va_end(arglist); + } +} + +int32_t opal_ddt_cuda_kernel_init(void) +{ + uint32_t i, j; + int device; + cudaError res; + + res = cudaGetDevice(&device); + if( cudaSuccess != res ) { + opal_cuda_output(0, "Cannot retrieve the device being used. 
Drop CUDA support!\n"); + return OPAL_ERROR; + } + + cuda_free_list = init_cuda_free_list(); + + /* init device */ + cuda_devices = (ddt_cuda_device_t *)malloc(sizeof(ddt_cuda_device_t)*NB_GPUS); + for (i = 0; i < NB_GPUS; i++) { + unsigned char *gpu_ptr = NULL; + if (cudaMalloc((void **)(&gpu_ptr), sizeof(char)*DT_CUDA_BUFFER_SIZE) != cudaSuccess) { + DT_CUDA_DEBUG( opal_cuda_output( 0, "cudaMalloc is failed in GPU %d\n", i); ); + return OPAL_ERROR; + } + DT_CUDA_DEBUG ( opal_cuda_output(2, "DDT engine cudaMalloc buffer %p in GPU %d\n", gpu_ptr, i);); + cudaMemset(gpu_ptr, 0, sizeof(char)*DT_CUDA_BUFFER_SIZE); + cuda_devices[i].gpu_buffer = gpu_ptr; + + cuda_devices[i].buffer_free_size = DT_CUDA_BUFFER_SIZE; + ddt_cuda_buffer_t *p = obj_ddt_cuda_buffer_new(); + p->size = DT_CUDA_BUFFER_SIZE; + p->gpu_addr = gpu_ptr; + cuda_devices[i].buffer_free.head = p; + cuda_devices[i].buffer_free.tail = cuda_devices[i].buffer_free.head; + cuda_devices[i].buffer_free.nb_elements = 1; + + cuda_devices[i].buffer_used.head = NULL; + cuda_devices[i].buffer_used.tail = NULL; + cuda_devices[i].buffer_used_size = 0; + cuda_devices[i].buffer_used.nb_elements = 0; + + /* init cuda stream */ + ddt_cuda_stream_t *cuda_streams = (ddt_cuda_stream_t *)malloc(sizeof(ddt_cuda_stream_t)); + for (j = 0; j < NB_STREAMS; j++) { + cudaStreamCreate(&(cuda_streams->ddt_cuda_stream[j])); + } + cuda_streams->current_stream_id = 0; + cuda_devices[i].cuda_streams = cuda_streams; + cudaEventCreate(&(cuda_devices[i].memcpy_event), cudaEventDisableTiming); + + /* init iov pipeline blocks */ + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; + for (j = 0; j < NB_PIPELINE_BLOCKS; j++) { + cuda_iov_pipeline_block = (ddt_cuda_iov_pipeline_block_t *)malloc(sizeof(ddt_cuda_iov_pipeline_block_t)); + cudaMallocHost((void **)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + cudaMalloc((void 
**)(&(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d)), sizeof(ddt_cuda_iov_dist_cached_t) * CUDA_MAX_NB_BLOCKS * CUDA_IOV_MAX_TASK_PER_BLOCK); + if (j == 0) { + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = (ddt_cuda_iov_dist_cached_t *)malloc(sizeof(ddt_cuda_iov_dist_cached_t) * NUM_CUDA_IOV_PER_DDT); + } else { + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; + } + // cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->opal_cuda_stream[0]); + // cuda_iov_pipeline_block->cuda_stream_id = 0; + cudaEventCreateWithFlags(&(cuda_iov_pipeline_block->cuda_event), cudaEventDisableTiming); + cuda_devices[i].cuda_iov_pipeline_block[j] = cuda_iov_pipeline_block; + } + } + current_cuda_device = &(cuda_devices[0]); + + /* init cuda_iov */ + cuda_iov_cache_enabled = 1; + cuda_iov_count = CUDA_NB_IOV; + + // /* init size for double, float, char */ + // ALIGNMENT_DOUBLE = sizeof(double); + // ALIGNMENT_FLOAT = sizeof(float); + // ALIGNMENT_CHAR = sizeof(char); + + cudaDeviceSynchronize(); + return OPAL_SUCCESS; +} + +int32_t opal_ddt_cuda_kernel_fini(void) +{ + uint32_t i, j; + + for (i = 0; i < NB_GPUS; i++) { + /* free gpu buffer */ + cudaFree(cuda_devices[i].gpu_buffer); + /* destory cuda stream and iov*/ + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; + for (j = 0; j < NB_STREAMS; j++) { + cudaStreamDestroy(cuda_devices[i].cuda_streams->ddt_cuda_stream[j]); + cuda_iov_pipeline_block = cuda_devices[i].cuda_iov_pipeline_block[j]; + if (cuda_iov_pipeline_block != NULL) { + if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h != NULL) { + cudaFreeHost(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h); + cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h = NULL; + } + if (cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d != NULL) { + cudaFree(cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d); + cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d = NULL; + } + if (cuda_iov_pipeline_block->cuda_iov_dist_cached_h != NULL) { + 
free(cuda_iov_pipeline_block->cuda_iov_dist_cached_h); + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = NULL; + } + cudaEventDestroy(cuda_iov_pipeline_block->cuda_event); + cuda_iov_pipeline_block->cuda_stream = NULL; + free(cuda_iov_pipeline_block); + cuda_iov_pipeline_block = NULL; + } + } + free(cuda_devices[i].cuda_streams); + cuda_devices[i].cuda_streams = NULL; + cudaEventDestroy(cuda_devices[i].memcpy_event); + } + current_cuda_device = NULL; + return OPAL_SUCCESS; +} + +void* opal_ddt_cached_cuda_iov_init(uint32_t size) +{ +#if OPAL_DATATYPE_CUDA_IOV_CACHE + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)malloc(sizeof(ddt_cuda_iov_total_cached_t)); + uint32_t *tmp_nb_bytes = (uint32_t *)malloc(sizeof(uint32_t) * size); + if (tmp != NULL && tmp_nb_bytes != NULL) { + tmp->cuda_iov_dist_d = NULL; + tmp->cuda_iov_count = size; + tmp->cuda_iov_is_cached = 0; + tmp->nb_bytes_h = tmp_nb_bytes; + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc cuda_iov_dist_cached for ddt is successed, cached cuda iov %p, nb_bytes_h %p, size %d.\n", tmp, tmp_nb_bytes, size); ); + return tmp; + } else { + DT_CUDA_DEBUG( opal_cuda_output( 0, "Malloc cuda_iov_dist_cached for ddt is failed.\n"); ); + return NULL; + } +#else + DT_CUDA_DEBUG( opal_cuda_output( 2, "cuda iov cache is not enabled.\n"); ); + return NULL; +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ +} + +void opal_ddt_cached_cuda_iov_fini(void* cached_cuda_iov) +{ +#if OPAL_DATATYPE_CUDA_IOV_CACHE + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *) cached_cuda_iov; + if (tmp != NULL) { + DT_CUDA_DEBUG( opal_cuda_output( 2, "Free cuda_iov_dist for ddt is successed %p.\n", tmp); ); + if (tmp->cuda_iov_dist_d != NULL) { + cudaFree(tmp->cuda_iov_dist_d); + tmp->cuda_iov_dist_d = NULL; + } + if (tmp->nb_bytes_h != NULL) { + free(tmp->nb_bytes_h); + tmp->nb_bytes_h = NULL; + } + free(tmp); + tmp = NULL; + } +#endif /* OPAL_DATATYPE_CUDA_IOV_CACHE */ +} + +static inline int32_t 
opal_ddt_cached_cuda_iov_isfull(ddt_cuda_iov_total_cached_t *cached_cuda_iov, ddt_cuda_iov_dist_cached_t **cuda_iov_dist_h, uint32_t nb_blocks_used) +{ + if (nb_blocks_used < cached_cuda_iov->cuda_iov_count) { + return 0; + } else { +realloc_cuda_iov: + cached_cuda_iov->nb_bytes_h = (uint32_t *)realloc(cached_cuda_iov->nb_bytes_h, sizeof(uint32_t)*cached_cuda_iov->cuda_iov_count*2); + assert(cached_cuda_iov->nb_bytes_h != NULL); + cached_cuda_iov->cuda_iov_count *= 2; + if (nb_blocks_used >= cached_cuda_iov->cuda_iov_count) { + goto realloc_cuda_iov; + } + return 1; + } +} + +/* cached_cuda_iov_d is not ready until explicitlt sync with cuda stream 0 +*/ +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t thread_per_block, nb_blocks_used; + size_t length_per_iovec; + uint32_t alignment; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t *cached_cuda_iov_dist_d = NULL; + ddt_cuda_iov_dist_cached_t *cuda_iov_dist_h = NULL; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t ncontig_disp_base; + size_t contig_disp = 0; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + + opal_datatype_t *datatype = (opal_datatype_t *)pConvertor->pDesc; + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + + cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)opal_ddt_cached_cuda_iov_init(NUM_CUDA_IOV_PER_DDT); + if (cached_cuda_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not init cuda iov\n");); + return OPAL_ERROR; + } + cached_cuda_iov_nb_bytes_list_h = 
cached_cuda_iov->nb_bytes_h; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; + cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + cuda_iov_dist_h = cuda_iov_pipeline_block->cuda_iov_dist_cached_h; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + thread_per_block = CUDA_WARP_SIZE * 64; + + for (i = 0; i < ddt_iov_count; i++) { + length_per_iovec = ddt_iov[i].iov_len; + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + alignment = ALIGNMENT_DOUBLE * 1; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Cache cuda IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + if (opal_ddt_cached_cuda_iov_isfull(cached_cuda_iov, &(cuda_iov_pipeline_block->cuda_iov_dist_cached_h), nb_blocks_used + nb_blocks_per_description + 1)) { + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + cuda_iov_dist_h = (ddt_cuda_iov_dist_cached_t *)realloc(cuda_iov_dist_h, sizeof(ddt_cuda_iov_dist_cached_t)*cached_cuda_iov->cuda_iov_count); + assert(cuda_iov_dist_h != NULL); + cuda_iov_pipeline_block->cuda_iov_dist_cached_h = cuda_iov_dist_h; + } + + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = thread_per_block * alignment; + } else { + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = (count_desc - j*thread_per_block) * alignment; + } +#if defined 
(OPAL_DATATYPE_CUDA_DEBUG) + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + nb_blocks_used ++; + // assert (nb_blocks_used < NUM_CUDA_IOV_PER_DDT); + } + + /* handle residue */ + if (residue_desc != 0) { + cuda_iov_dist_h[nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; + cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] = length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cached_cuda_iov_nb_bytes_list_h[nb_blocks_used] > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]; + DT_CUDA_DEBUG( opal_cuda_output(12, "Cache cuda IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h[nb_blocks_used].ncontig_disp, cuda_iov_dist_h[nb_blocks_used].contig_disp, cached_cuda_iov_nb_bytes_list_h[nb_blocks_used]); ); + nb_blocks_used ++; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + //assert (nb_blocks_used < NUM_CUDA_IOV_PER_DDT); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + } + } + /* use additional entry to store the size of entire contiguous buffer needed for one ddt */ + cuda_iov_dist_h[nb_blocks_used].contig_disp = contig_disp; + cudaMalloc((void **)(&cached_cuda_iov_dist_d), sizeof(ddt_cuda_iov_dist_cached_t) * (nb_blocks_used+1)); + if (cached_cuda_iov_dist_d == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not malloc cuda iov in GPU\n");); + return OPAL_ERROR; + } + cudaMemcpyAsync(cached_cuda_iov_dist_d, cuda_iov_dist_h, 
sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + cached_cuda_iov->cuda_iov_dist_d = cached_cuda_iov_dist_d; + datatype->cached_cuda_iov = (unsigned char*)cached_cuda_iov; + *cuda_iov_count = nb_blocks_used; + return OPAL_SUCCESS; +} + +uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, size_t *buffer_size, uint32_t *nb_blocks_used, size_t *total_converted, size_t *contig_disp_out, uint32_t *current_ddt_iov_pos) +{ + size_t ncontig_disp_base; + size_t contig_disp = 0; + size_t current_cuda_iov_length = 0; + uint8_t buffer_isfull = 0; + uint8_t alignment; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t thread_per_block; + size_t length_per_iovec; + uint32_t i, j; + + thread_per_block = CUDA_WARP_SIZE * 5; + + for (i = ddt_iov_start_pos; i < ddt_iov_end_pos && !buffer_isfull; i++) { + if (pConvertor->current_iov_partial_length > 0) { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base) + ddt_iov[i].iov_len - pConvertor->current_iov_partial_length; + length_per_iovec = pConvertor->current_iov_partial_length; + pConvertor->current_iov_partial_length = 0; + } else { + ncontig_disp_base = (size_t)(ddt_iov[i].iov_base); + length_per_iovec = ddt_iov[i].iov_len; + } + if (*buffer_size < length_per_iovec) { + pConvertor->current_iov_pos = i; + pConvertor->current_iov_partial_length = length_per_iovec - *buffer_size; + length_per_iovec = *buffer_size; + buffer_isfull = 1; + } + *buffer_size -= length_per_iovec; + *total_converted += length_per_iovec; + + alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + if ((*nb_blocks_used + nb_blocks_per_description + 1) > 
(CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK)) { + break; + } + DT_CUDA_DEBUG ( opal_cuda_output(10, "DDT IOV to CUDA IOV description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + j * thread_per_block * alignment; + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + if ( (j+1) * thread_per_block <= count_desc) { + current_cuda_iov_length = thread_per_block * alignment; + } else { + current_cuda_iov_length = (count_desc - j*thread_per_block) * alignment; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "DDT IOV to CUDA IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[*nb_blocks_used].contig_disp, current_cuda_iov_length); ); + (*nb_blocks_used) ++; + assert (*nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp = ncontig_disp_base + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + current_cuda_iov_length= length_per_iovec - length_per_iovec / alignment * alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(current_cuda_iov_length > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + contig_disp += current_cuda_iov_length; + DT_CUDA_DEBUG( opal_cuda_output(12, "DDT IOV to CUDA IOV \tblock %d, ncontig_disp %ld, contig_disp %ld, nb_bytes %ld\n", nb_blocks_used, cuda_iov_dist_h_current[*nb_blocks_used].ncontig_disp, cuda_iov_dist_h_current[*nb_blocks_used].contig_disp, 
current_cuda_iov_length); ); + (*nb_blocks_used) ++; + assert (*nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + } + cuda_iov_dist_h_current[*nb_blocks_used].contig_disp = contig_disp; + *contig_disp_out = contig_disp; + *current_ddt_iov_pos = i; + return buffer_isfull; + +} + +void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov) +{ + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + if (datatype->cached_cuda_iov == NULL) { + *cached_cuda_iov = NULL; + } else { + *cached_cuda_iov = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + } +} + +void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) +{ + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + assert(datatype->cached_cuda_iov != NULL); + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + tmp->cuda_iov_count = cuda_iov_count; + tmp->cuda_iov_is_cached = 1; +} + +uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor) +{ + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + if (datatype->cached_cuda_iov == NULL) { + return 0; + } + ddt_cuda_iov_total_cached_t *tmp = (ddt_cuda_iov_total_cached_t *)datatype->cached_cuda_iov; + return tmp->cuda_iov_is_cached; +} + +void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count) +{ + int i; + size_t iov_size = 0; + size_t ddt_size; + convertor->current_iov_partial_length = 0; + convertor->current_cuda_iov_pos = 0; + convertor->current_count = 0; + if (ddt_offset == 0) { + return; + } + opal_datatype_type_size(convertor->pDesc, &ddt_size); + convertor->current_count = ddt_offset / ddt_size; + ddt_offset = ddt_offset % ddt_size; + for(i = 0; i < cuda_iov_count; i++) { + iov_size += cached_cuda_iov_nb_bytes_list_h[i]; + if 
(iov_size > ddt_offset) { + convertor->current_iov_partial_length = iov_size - ddt_offset; + convertor->current_cuda_iov_pos = i; + break; + } else if (iov_size == ddt_offset){ + convertor->current_iov_partial_length = 0; + convertor->current_cuda_iov_pos = i+1; + break; + } + } +} + +void opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count) +{ + int i; + size_t iov_size = 0; + size_t ddt_size; + convertor->current_iov_partial_length = 0; + convertor->current_iov_pos = 0; + convertor->current_count = 0; + if (ddt_offset == 0) { + return; + } + opal_datatype_type_size(convertor->pDesc, &ddt_size); + convertor->current_count = ddt_offset / ddt_size; + ddt_offset = ddt_offset % ddt_size; + for(i = 0; i < ddt_iov_count; i++) { + iov_size += ddt_iov[i].iov_len; + if (iov_size > ddt_offset) { + convertor->current_iov_partial_length = iov_size - ddt_offset; + convertor->current_iov_pos = i; + break; + } else if (iov_size == ddt_offset){ + convertor->current_iov_partial_length = 0; + convertor->current_iov_pos = i+1; + break; + } + } +} + +void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count) +{ +#if 0 + opal_datatype_t *datatype = (opal_datatype_t *)convertor->pDesc; + assert(datatype->cached_cuda_iov_dist != NULL); + if (datatype->cached_cuda_iov_count < cuda_iov_count) { + printf("cuda count %d, new count %d\n", datatype->cached_cuda_iov_count, cuda_iov_count); + // assert(0); + void *old_iov = datatype->cached_cuda_iov_dist; + void *new_iov = opal_ddt_cuda_iov_dist_init(datatype->cached_cuda_iov_count + NUM_CUDA_IOV_PER_DDT); + assert(new_iov != NULL); + cudaMemcpy(new_iov, old_iov, datatype->cached_cuda_iov_count * sizeof(ddt_cuda_iov_dist_cached_t), cudaMemcpyDeviceToDevice); + datatype->cached_cuda_iov_dist = new_iov; + datatype->cached_cuda_iov_count += NUM_CUDA_IOV_PER_DDT; + opal_ddt_cuda_iov_dist_fini(old_iov); + } +#endif +} + 
+int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr) +{ + int res; + CUmemorytype memType; + CUdeviceptr dbuf = (CUdeviceptr)ptr; + res = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, dbuf); + if (res != CUDA_SUCCESS) { + /* If we cannot determine it is device pointer, + * just assume it is not. */ + printf("!!!!!!! %p is not a gpu buffer. Take no-CUDA path!\n", ptr); + return 0; + } + /* Anything but CU_MEMORYTYPE_DEVICE is not a GPU memory */ + return (memType == CU_MEMORYTYPE_DEVICE) ? 1 : 0; +} + +void* opal_ddt_cuda_malloc_gpu_buffer(size_t size, int gpu_id) +{ + int dev_id; + cudaGetDevice(&dev_id); + ddt_cuda_device_t *device = &cuda_devices[gpu_id]; + if (device->buffer_free_size < size) { + DT_CUDA_DEBUG( opal_cuda_output( 0, "No GPU buffer at dev_id %d.\n", dev_id); ); + return NULL; + } + ddt_cuda_buffer_t *ptr = device->buffer_free.head; + while (ptr != NULL) { + if (ptr->size < size) { /* Not enough room in this buffer, check next */ + ptr = ptr->next; + continue; + } + void *addr = ptr->gpu_addr; + ptr->size -= size; + if (ptr->size == 0) { + cuda_list_delete(&device->buffer_free, ptr); + obj_ddt_cuda_buffer_reset(ptr); + /* hold on this ptr object, we will reuse it right away */ + } else { + ptr->gpu_addr += size; + ptr = cuda_list_pop_tail(cuda_free_list); + if( NULL == ptr ) + ptr = obj_ddt_cuda_buffer_new(); + } + assert(NULL != ptr); + ptr->size = size; + ptr->gpu_addr = (unsigned char*)addr; + cuda_list_push_head(&device->buffer_used, ptr); + device->buffer_used_size += size; + device->buffer_free_size -= size; + DT_CUDA_DEBUG( opal_cuda_output( 2, "Malloc GPU buffer %p, dev_id %d.\n", addr, dev_id); ); + return addr; + } + return NULL; +} + +void opal_ddt_cuda_free_gpu_buffer(void *addr, int gpu_id) +{ + ddt_cuda_device_t *device = &cuda_devices[gpu_id]; + ddt_cuda_buffer_t *ptr = device->buffer_used.head; + + /* Find the holder of this GPU allocation */ + for( ; (NULL != ptr) && (ptr->gpu_addr != addr); ptr = ptr->next ); 
+ if (NULL == ptr) { /* we could not find it. something went wrong */ + DT_CUDA_DEBUG( opal_cuda_output( 0, "addr %p is not managed.\n", addr); ); + return; + } + cuda_list_delete(&device->buffer_used, ptr); + /* Insert the element in the list of free buffers ordered by the addr */ + ddt_cuda_buffer_t *ptr_next = device->buffer_free.head; + while (ptr_next != NULL) { + if (ptr_next->gpu_addr > addr) { + break; + } + ptr_next = ptr_next->next; + } + if (ptr_next == NULL) { /* buffer_free is empty, or insert to last one */ + cuda_list_push_tail(&device->buffer_free, ptr); + } else { + cuda_list_insert_before(&device->buffer_free, ptr, ptr_next); + } + size_t size = ptr->size; + cuda_list_item_merge_by_addr(&device->buffer_free, ptr); + device->buffer_free_size += size; + device->buffer_used_size -= size; + DT_CUDA_DEBUG( opal_cuda_output( 2, "Free GPU buffer %p.\n", addr); ); +} + +void opal_cuda_check_error(cudaError_t err) +{ + if (err != cudaSuccess) { + DT_CUDA_DEBUG( opal_cuda_output(0, "CUDA calls error %s\n", cudaGetErrorString(err)); ); + } +} + +void opal_ddt_cuda_d2dcpy_async(void* dst, const void* src, size_t count) +{ + cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->ddt_cuda_stream[current_cuda_device->cuda_streams->current_stream_id]); +} + +void opal_ddt_cuda_d2dcpy(void* dst, const void* src, size_t count) +{ + cudaMemcpyAsync(dst, src, count, cudaMemcpyDeviceToDevice, current_cuda_device->cuda_streams->ddt_cuda_stream[current_cuda_device->cuda_streams->current_stream_id]); + cudaStreamSynchronize(current_cuda_device->cuda_streams->ddt_cuda_stream[current_cuda_device->cuda_streams->current_stream_id]); +} + +void opal_ddt_cuda_set_cuda_stream() +{ + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id & (NB_STREAMS-1); +} + +int32_t opal_ddt_cuda_get_cuda_stream() +{ + return 
current_cuda_device->cuda_streams->current_stream_id; +} + +void opal_dump_cuda_list(ddt_cuda_list_t *list) +{ + ddt_cuda_buffer_t *ptr = NULL; + ptr = list->head; + DT_CUDA_DEBUG( opal_cuda_output( 2, "DUMP cuda list %p, nb_elements %d\n", list, list->nb_elements); ); + while (ptr != NULL) { + DT_CUDA_DEBUG( opal_cuda_output( 2, "\titem addr %p, size %ld.\n", ptr->gpu_addr, ptr->size); ); + ptr = ptr->next; + } +} diff --git a/opal/datatype/cuda/opal_datatype_cuda.cuh b/opal/datatype/cuda/opal_datatype_cuda.cuh new file mode 100644 index 00000000000..cab006e0f3f --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_cuda.cuh @@ -0,0 +1,134 @@ +#ifndef OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED +#define OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED + +extern "C" +{ + +int32_t opal_ddt_cuda_kernel_init(void); + +int32_t opal_ddt_cuda_kernel_fini(void); + + +int32_t opal_ddt_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ); + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed); + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked); + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed); + +int32_t 
opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked); + +void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE, unsigned char* gpu_buffer ); + +void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +void unpack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE); + +void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +void unpack_predefined_data_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ); + +int32_t opal_ddt_cuda_is_gpu_buffer(const void *ptr); + +void* opal_ddt_cuda_malloc_gpu_buffer(size_t size, int gpu_id); + +void opal_ddt_cuda_free_gpu_buffer(void *addr, int gpu_id); + +void opal_ddt_cuda_d2dcpy_async(void* dst, const void* src, size_t count); + +void opal_ddt_cuda_d2dcpy(void* dst, const void* src, size_t count); + +void opal_dump_cuda_list(ddt_cuda_list_t *list); 
+ +void* opal_ddt_cached_cuda_iov_init(void); + +void opal_ddt_cached_cuda_iov_fini(void *cached_cuda_iov); + +void pack_iov_cached(opal_convertor_t* pConvertor, unsigned char *destination); + +void opal_ddt_get_cached_cuda_iov(struct opal_convertor_t *convertor, ddt_cuda_iov_total_cached_t **cached_cuda_iov); + +void opal_ddt_set_cuda_iov_cached(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); + +uint8_t opal_ddt_cuda_iov_is_cached(struct opal_convertor_t *convertor); + +void opal_ddt_check_cuda_iov_is_full(struct opal_convertor_t *convertor, uint32_t cuda_iov_count); + +void opal_ddt_set_cuda_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const uint32_t *cached_cuda_iov_nb_bytes_list_h, const uint32_t cuda_iov_count); + +void opal_ddt_set_ddt_iov_position(struct opal_convertor_t *convertor, size_t ddt_offset, const struct iovec *ddt_iov, const uint32_t ddt_iov_count); + +int32_t opal_ddt_cache_cuda_iov(opal_convertor_t* pConvertor, uint32_t *cuda_iov_count); + +uint8_t opal_ddt_iov_to_cuda_iov(opal_convertor_t* pConvertor, const struct iovec *ddt_iov, ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current, uint32_t ddt_iov_start_pos, uint32_t ddt_iov_end_pos, size_t *buffer_size, uint32_t *nb_blocks_used, size_t *total_packed, size_t *contig_disp_out, uint32_t *current_ddt_iov_pos); + +void opal_ddt_cuda_set_cuda_stream(); + +int32_t opal_ddt_cuda_get_cuda_stream(); + +} + +#endif /* OPAL_DATATYPE_CUDA_H_HAS_BEEN_INCLUDED */ diff --git a/opal/datatype/cuda/opal_datatype_cuda_internal.cuh b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh new file mode 100644 index 00000000000..31be1def712 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_cuda_internal.cuh @@ -0,0 +1,171 @@ +#ifndef OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED +#define OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED + +#include +#include +#include + +//#include "opal_datatype_orig_internal.h" + + +/* OPAL_CUDA */ +// #define OPAL_DATATYPE_CUDA_DRY_RUN 
+#define OPAL_DATATYPE_CUDA_DEBUG 1 +//#define OPAL_DATATYPE_CUDA_KERNEL_TIME +#define OPAL_DATATYPE_CUDA_DEBUG_LEVEL 2 +#define OPAL_DATATYPE_CUDA_TIMING +#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H 0 +#define OPAL_DATATYPE_VECTOR_USE_ZEROCPY 0 +#define OPAL_DATATYPE_VECTOR_USE_PIPELINE 0 +#define OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL 0 +#define OPAL_DATATYPE_CUDA_IOV_CACHE 1 + + + +#define NB_GPUS 1 +#define IOV_ARRAY_SIZE 1 +#define DT_CUDA_BUFFER_SIZE 1024*1024*200 +#define DT_CUDA_FREE_LIST_SIZE 50 + +#define THREAD_PER_BLOCK 32 +#define CUDA_WARP_SIZE 32 +#define TASK_PER_THREAD 2 +#define NB_STREAMS 4 +#define NB_PIPELINE_BLOCKS 4 +#define CUDA_NB_IOV 1024*20 +#define CUDA_IOV_LEN 1024*1204 +#define CUDA_MAX_NB_BLOCKS 1024 +#define CUDA_IOV_MAX_TASK_PER_BLOCK 400 +#define ALIGNMENT_DOUBLE 8 +#define ALIGNMENT_FLOAT 4 +#define ALIGNMENT_CHAR 1 +#define NUM_CUDA_IOV_PER_DDT 150000 +#define IOV_PIPELINE_SIZE 1000 +#define KERNEL_UNROLL 16 +#define UNROLL_16 16 +#define UNROLL_8 8 +#define UNROLL_4 4 + +#define TIMER_DATA_TYPE struct timeval +#define GET_TIME(TV) gettimeofday( &(TV), NULL ) +#define ELAPSED_TIME(TSTART, TEND) (((TEND).tv_sec - (TSTART).tv_sec) * 1000000 + ((TEND).tv_usec - (TSTART).tv_usec)) + + +typedef struct { + cudaStream_t ddt_cuda_stream[NB_STREAMS]; + int32_t current_stream_id; +} ddt_cuda_stream_t; + +typedef struct { + unsigned char* src; + unsigned char* dst; + uint32_t nb_elements; + uint8_t element_alignment; +} ddt_cuda_iov_dist_non_cached_t; + +typedef struct { + size_t ncontig_disp; + size_t contig_disp; +} ddt_cuda_iov_dist_cached_t; + +typedef struct { + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d; + uint32_t cuda_iov_count; + uint32_t* nb_bytes_h; + uint8_t cuda_iov_is_cached; +} ddt_cuda_iov_total_cached_t; + +typedef struct { + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_h; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_non_cached_d; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_cached_h; + cudaStream_t 
*cuda_stream; + cudaEvent_t cuda_event; +} ddt_cuda_iov_pipeline_block_t; + +typedef struct ddt_cuda_buffer{ + unsigned char* gpu_addr; + size_t size; + struct ddt_cuda_buffer *next; + struct ddt_cuda_buffer *prev; +} ddt_cuda_buffer_t; + +typedef struct { + ddt_cuda_buffer_t *head; + ddt_cuda_buffer_t *tail; + size_t nb_elements; +} ddt_cuda_list_t; + +typedef struct { + int device_id; + unsigned char* gpu_buffer; + ddt_cuda_list_t buffer_free; + ddt_cuda_list_t buffer_used; + size_t buffer_free_size; + size_t buffer_used_size; + ddt_cuda_stream_t *cuda_streams; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block[NB_PIPELINE_BLOCKS]; + cudaEvent_t memcpy_event; +} ddt_cuda_device_t; + +extern ddt_cuda_list_t *cuda_free_list; +extern ddt_cuda_device_t *cuda_devices; +extern ddt_cuda_device_t *current_cuda_device; +extern struct iovec cuda_iov[CUDA_NB_IOV]; +extern uint32_t cuda_iov_count; +extern uint32_t cuda_iov_cache_enabled; + +//extern uint8_t ALIGNMENT_DOUBLE, ALIGNMENT_FLOAT, ALIGNMENT_CHAR; + + +#if defined (OPAL_DATATYPE_CUDA_DEBUG) +#define DBGPRINT(fmt, ...) printf(fmt, __VA_ARGS__) +#else +#define DBGPRINT(fmt, ...) 
+#endif + +__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ); + +__global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ); + + +__global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); + +__global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used); + +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base); + +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end); + +void opal_cuda_output(int output_id, const char *format, ...); + +void opal_cuda_check_error(cudaError_t err); + +#if defined (OPAL_DATATYPE_CUDA_DEBUG) +#define DT_CUDA_DEBUG( INST ) if (OPAL_DATATYPE_CUDA_DEBUG) { INST } +#else +#define DT_CUDA_DEBUG( INST ) +#endif + +extern "C" +{ +int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor, size_t* position ); + +int32_t opal_convertor_raw( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* iov_count, + size_t* length ); + +int opal_convertor_raw_cached(struct opal_convertor_t *convertor, + const struct iovec **iov, + uint32_t* iov_count); +} + +#endif /* OPAL_DATATYPE_CUDA_INTERNAL_H_HAS_BEEN_INCLUDED */ 
diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu new file mode 100644 index 00000000000..10fb2356cad --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_kernel.cu @@ -0,0 +1,770 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + +#include "opal_datatype_cuda_internal.cuh" +#include +#include + +#if 0 +__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ) +{ + uint32_t _i, tid, num_threads; + uint32_t gap, nb_elements; + uint64_t *_source_tmp, *_destination_tmp, *_src_disp_tmp;; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + num_threads = gridDim.x * blockDim.x; + + gap = (extent - size) / 8; + nb_elements = size / 8; + _src_disp_tmp = (uint64_t*)source; + _destination_tmp = (uint64_t*)destination; + _destination_tmp += tid; +#if 0 + for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { + _source_tmp = _src_disp_tmp + tid + _i/num_threads*num_threads + _i/nb_elements * gap; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + // if (_i % nb_elements == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => _i %d, actual _i %d, count %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)size, _i/nb_elements, _i, copy_loops ); + // } + // if (_i / nb_elements ==1 && tid == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + // } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + *_destination_tmp = *_source_tmp; +#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ + _destination_tmp += num_threads; + } +#else + for (_i = tid; _i < copy_loops*nb_elements; _i+=8*num_threads) { + uint64_t val[16]; + uint32_t _j; + uint32_t u; + uint64_t *mysrc = _src_disp_tmp + tid; + + #pragma unroll + for (u = 0; u < 8; u++) { + _j = _i + u * num_threads; + val[u] = *(mysrc + _j/num_threads*num_threads + _j/nb_elements * gap); + } + + #pragma unroll + for (u = 0; u < 8; u++) { + *_destination_tmp = val[u]; + _destination_tmp += num_threads; + } +/* + _j = _i; + val[0] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[1] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[2] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[3] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[4] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[5] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[6] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[7] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[8] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[9] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[10] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[11] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[12] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[13] = *(_src_disp_tmp + tid + 
_j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[14] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + _j += num_threads; + val[15] = *(_src_disp_tmp + tid + _j/num_threads*num_threads + _j/nb_elements * gap); + + *_destination_tmp = val[0]; + _destination_tmp += num_threads; + *_destination_tmp = val[1]; + _destination_tmp += num_threads; + *_destination_tmp = val[2]; + _destination_tmp += num_threads; + *_destination_tmp = val[3]; + _destination_tmp += num_threads; + *_destination_tmp = val[4]; + _destination_tmp += num_threads; + *_destination_tmp = val[5]; + _destination_tmp += num_threads; + *_destination_tmp = val[6]; + _destination_tmp += num_threads; + *_destination_tmp = val[7]; + _destination_tmp += num_threads; + *_destination_tmp = val[8]; + _destination_tmp += num_threads; + *_destination_tmp = val[9]; + _destination_tmp += num_threads; + *_destination_tmp = val[10]; + _destination_tmp += num_threads; + *_destination_tmp = val[11]; + _destination_tmp += num_threads; + *_destination_tmp = val[12]; + _destination_tmp += num_threads; + *_destination_tmp = val[13]; + _destination_tmp += num_threads; + *_destination_tmp = val[14]; + _destination_tmp += num_threads; + *_destination_tmp = val[15]; + _destination_tmp += num_threads; +*/ + } +#endif +} + +#else + +__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ) +{ + uint32_t i, u, tid, num_threads, warp_id, tid_per_warp, nb_warps, nb_warps_x, nb_warps_y, pos_x, pos_y, size_last_y, size_last_x; + uint32_t size_nb, extent_nb; + uint64_t *_source_tmp, *_destination_tmp, *source_64, *destination_64, *_source_left_tmp, *_destination_left_tmp; + uint64_t val[UNROLL_16]; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + num_threads = gridDim.x * blockDim.x; + warp_id = tid / CUDA_WARP_SIZE; + tid_per_warp = threadIdx.x & 
(CUDA_WARP_SIZE-1); + nb_warps = num_threads / CUDA_WARP_SIZE; + + extent_nb = extent / 8; + size_nb = size / 8; + source_64 = (uint64_t*)source; + destination_64 = (uint64_t*)destination; + + nb_warps_x = size_nb / CUDA_WARP_SIZE; + size_last_x = size_nb & (CUDA_WARP_SIZE-1); + if ( size_last_x != 0) { + nb_warps_x ++; + } else { + size_last_x = CUDA_WARP_SIZE; + } + nb_warps_y = copy_loops / UNROLL_16; + size_last_y = copy_loops & (UNROLL_16-1); + if ( size_last_y != 0) { + nb_warps_y ++; + } else { + size_last_y = UNROLL_16; + } + // if (threadIdx.x == 0) { + // printf("warp_id %u, nb_warps_x %u, nb_warps_y %u, tid_per_warps %u, nb_warps %u\n", warp_id, nb_warps_x, nb_warps_y, tid_per_warp, nb_warps); + // } + + const uint32_t extent_nb_times_UNROLL_16 = extent_nb * UNROLL_16; + const uint32_t size_nb_times_UNROLL_16 = size_nb * UNROLL_16; + source_64 += tid_per_warp; + destination_64 += tid_per_warp; + + for (i = warp_id; i < (nb_warps_x-1) * (nb_warps_y-1); i += nb_warps) { + pos_x = i / (nb_warps_y-1); + pos_y = i % (nb_warps_y-1); + _source_tmp = source_64 + pos_y * extent_nb_times_UNROLL_16 + pos_x * CUDA_WARP_SIZE; + _destination_tmp = destination_64 + pos_y * size_nb_times_UNROLL_16 + pos_x * CUDA_WARP_SIZE; + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + val[u] = *(_source_tmp + u * extent_nb); + } + #pragma unroll + for (uint32_t u = 0; u < UNROLL_16; u++) { + *(_destination_tmp + u * size_nb) = val[u]; + } + } + if (tid_per_warp < size_last_x) { + pos_x = nb_warps_x - 1; + _source_left_tmp = source_64 + pos_x * CUDA_WARP_SIZE; + _destination_left_tmp = destination_64 + pos_x * CUDA_WARP_SIZE; + for (i = warp_id; i < nb_warps_y-1; i += nb_warps) { + _source_tmp = _source_left_tmp + i * extent_nb_times_UNROLL_16; + _destination_tmp = _destination_left_tmp + i * size_nb_times_UNROLL_16; + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + val[u] = *(_source_tmp + u * extent_nb); + } + #pragma unroll + for (uint32_t u = 0; u < UNROLL_16; u++) { + 
*(_destination_tmp + u * size_nb) = val[u]; + } + } + } + + pos_y = nb_warps_y - 1; + _source_left_tmp = source_64 + pos_y * extent_nb_times_UNROLL_16; + _destination_left_tmp = destination_64 + pos_y * size_nb_times_UNROLL_16; + if (size_last_y == UNROLL_16) { + for (i = warp_id; i < nb_warps_x-1; i += nb_warps) { + _source_tmp = _source_left_tmp + i * CUDA_WARP_SIZE; + _destination_tmp = _destination_left_tmp + i * CUDA_WARP_SIZE; + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + val[u] = *(_source_tmp + u * extent_nb); + } + #pragma unroll + for (uint32_t u = 0; u < UNROLL_16; u++) { + *(_destination_tmp + u * size_nb) = val[u]; + } + } + } else { + for (i = warp_id; i < nb_warps_x-1; i += nb_warps) { + _source_tmp = _source_left_tmp + i * CUDA_WARP_SIZE; + _destination_tmp = _destination_left_tmp + i * CUDA_WARP_SIZE; + for (u = 0; u < size_last_y; u++) { + *(_destination_tmp + u * size_nb) = *(_source_tmp + u * extent_nb); + } + } + } + + if (warp_id == 0 && tid_per_warp < size_last_x) { + _source_tmp = source_64 + (nb_warps_y-1) * extent_nb_times_UNROLL_16 + (nb_warps_x-1) * CUDA_WARP_SIZE; + _destination_tmp = destination_64 + (nb_warps_y-1) * size_nb_times_UNROLL_16 + (nb_warps_x-1) * CUDA_WARP_SIZE; + for (u = 0; u < size_last_y; u++) { + *(_destination_tmp + u * size_nb) = *(_source_tmp + u * extent_nb); + } + } +} + + +// #define SEG_ADD(s) \ +// l += s; \ +// while (l >= lines) { \ +// l -= lines; \ +// c += width; \ +// } +// +// __global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t lines, +// size_t nb_size, +// OPAL_PTRDIFF_TYPE nb_extent, +// unsigned char * b_source, +// unsigned char * b_destination ) +// { +// uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x; +// uint32_t num_threads = gridDim.x * blockDim.x; +// +// //size_t lines = (size_t)lines; +// size_t size = nb_size / 8; +// size_t extent = nb_extent / 8; +// uint64_t * source = (uint64_t *) b_source; +// uint64_t *destination = (uint64_t *) b_destination; +// 
uint64_t val[KERNEL_UNROLL]; +// +// int col = 0; +// for (int width = 32; width > 0 && col < size; width >>= 1) { +// while (size-col >= width) { +// const int warp_id = tid / width; +// const int warp_tid = tid & (width-1); +// const int warp_nb = num_threads / width; +// const int c = col + warp_tid; +// int l = warp_id * KERNEL_UNROLL; +// uint64_t *src = source + c; +// uint64_t *dst = destination + c; +// for (int b=0; b= width) { \ + col -= width; \ + off += ext - width; \ + } + +#define ELEMSIZE 32 + +__global__ void pack_contiguous_loop_cuda_kernel_global( uint32_t +copy_loops, +size_t size, +OPAL_PTRDIFF_TYPE extent, +unsigned char * source, +unsigned char * destination ) +{ + uint32_t tid = threadIdx.x + blockIdx.x * blockDim.x ; + uint32_t num_threads = gridDim.x * blockDim.x; + + int col = 0; + int off = 0; + + COLOFF_INC(tid, size/ELEMSIZE, extent/ELEMSIZE); + + if (ELEMSIZE % 8 == 0) { + volatile uint64_t * __restrict__ dst = (uint64_t*)destination + +tid * ELEMSIZE/8; + for (int offset = tid; offset < copy_loops*size/ELEMSIZE; +offset+=num_threads) { + const volatile uint64_t * __restrict__ src = (uint64_t*)source + off * ELEMSIZE/8; +#if 1 + uint64_t val[ELEMSIZE/8]; + #pragma unroll + for (int i = 0; i < ELEMSIZE/8; i++) { + val[i] = src[i]; + } + #pragma unroll + for (int i = 0; i < ELEMSIZE/8; i++) { + dst[i] = val[i]; + } +#else + #pragma unroll + for (int i = 0; i < ELEMSIZE/8; i++) { + dst[i] = __ldg(src+i); + } +#endif + dst += num_threads*ELEMSIZE/8; + COLOFF_INC(num_threads, size/ELEMSIZE, extent/ELEMSIZE); + } + } +} +*/ +#endif + + +__global__ void opal_generic_simple_pack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used) +{ + uint32_t i, _copy_count; + unsigned char *src, *dst; + uint8_t alignment; + unsigned char *_source_tmp, *_destination_tmp; + + __shared__ uint32_t nb_tasks; + + if (threadIdx.x == 0) { + //printf("iov pack kernel \n"); + nb_tasks = nb_blocks_used / gridDim.x; + if 
(blockIdx.x < (nb_blocks_used % gridDim.x)) { + nb_tasks ++; + } + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); + } + __syncthreads(); + + for (i = 0; i < nb_tasks; i++) { + src = cuda_iov_dist[blockIdx.x + i * gridDim.x].src; + dst = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst; + _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; + alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; + + // if (threadIdx.x == 0) { + // printf("block %d, ali %d, nb_element %d\n", blockIdx.x, cuda_iov_dist[blockIdx.x].element_alignment[i], _copy_count); + // } + + if (threadIdx.x < _copy_count) { + _source_tmp = src + threadIdx.x * alignment; + _destination_tmp = dst + threadIdx.x * alignment; +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + if (alignment == ALIGNMENT_DOUBLE) { + *((long *)_destination_tmp) = *((long *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((int *)_destination_tmp) = *((int *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } +#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ + } + } +} + +#if 0 +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +{ + uint32_t i, j; + uint32_t _nb_bytes; + size_t src_offset, dst_offset; + unsigned char *_source_tmp, *_destination_tmp; + uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; + + __shared__ uint32_t nb_tasks; + uint32_t copy_count; + uint8_t alignment; + + if (threadIdx.x == 0) { + nb_tasks = nb_blocks_used / gridDim.x; + if (blockIdx.x < (nb_blocks_used % gridDim.x)) { + nb_tasks ++; + } + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d\n", cuda_iov_count, ddt_extent, current_count); + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); + } + __syncthreads(); + + for (i = 0; i < nb_tasks; i++) { + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + dst_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - destination_disp; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; + + _source_tmp = source_base + src_offset; + _destination_tmp = destination_base + dst_offset; + /* block size is either multiple of ALIGNMENT_DOUBLE or 
residule */ + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + copy_count = _nb_bytes / alignment; + /* + if (threadIdx.x == 0 && nb_tasks != 0) { + printf("pack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ + for (j = threadIdx.x; j < copy_count; j += blockDim.x) { + if (j < copy_count) { + _source_tmp = source_base + src_offset + j * alignment; + _destination_tmp = destination_base + dst_offset + j * alignment; +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + if (alignment == ALIGNMENT_DOUBLE) { + *((long *)_destination_tmp) = *((long *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((int *)_destination_tmp) = *((int *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } +#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ + } + } + } +} + +#else +__global__ void opal_generic_simple_pack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* source_base, unsigned char* destination_base) +{ + uint32_t i, j; + uint32_t _nb_bytes; + size_t src_offset, dst_offset; + unsigned char *_source_tmp, *_destination_tmp; + uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t destination_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; + + __shared__ uint32_t nb_tasks_per_block; + __shared__ uint32_t WARP_SIZE; + __shared__ uint32_t nb_warp_per_block; + uint32_t copy_count; + uint8_t alignment; + uint64_t tmp_var_64[KERNEL_UNROLL]; + uint32_t tmp_var_32[KERNEL_UNROLL]; + unsigned char tmp_var_8[KERNEL_UNROLL]; + uint32_t u, k; + uint32_t copy_count_16, copy_count_8, copy_count_left; + + if (threadIdx.x == 0) { + nb_tasks_per_block = nb_blocks_used / gridDim.x; + if (blockIdx.x < (nb_blocks_used % gridDim.x)) { + nb_tasks_per_block ++; + } + WARP_SIZE = 32; + nb_warp_per_block = blockDim.x / WARP_SIZE; + // nb_warp_per_block = 1; + // if (nb_tasks_per_block == ) + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d\n", cuda_iov_count, ddt_extent, current_count); + // printf("nb_tasks %d, griddim %d, nb_blocks_used %d, bloid %d \n", nb_tasks, gridDim.x, nb_blocks_used, blockIdx.x); + } + __syncthreads(); + + const uint32_t warp_id_per_block = threadIdx.x / WARP_SIZE; + const uint32_t tid_per_warp = threadIdx.x & (WARP_SIZE - 1); + // uint32_t warp_id_per_block = 0; + // uint32_t tid_per_warp = threadIdx.x; + + for (i = warp_id_per_block; i < nb_tasks_per_block; i+= nb_warp_per_block) { + /* these 3 variables are used multiple times, so put in in register */ + 
_my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + dst_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - destination_disp; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; + + _source_tmp = source_base + src_offset; + _destination_tmp = destination_base + dst_offset; + /* block size is either multiple of ALIGNMENT_DOUBLE or residule */ + if ((uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + + //alignment = ALIGNMENT_DOUBLE; + copy_count = _nb_bytes / alignment; + /* + if (threadIdx.x == 0 && nb_tasks != 0) { + printf("pack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ + /* if (threadIdx.x == 0){ + printf("bytes %d, copy count %d, alignment %d, task %d, nb_block_used %d\n", _nb_bytes, copy_count, alignment, i, nb_blocks_used); + } */ + if (alignment == ALIGNMENT_DOUBLE) { + uint64_t *_source_base_64, *_destination_base_64; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_64 = (uint64_t *)(source_base + src_offset); + _destination_base_64 = (uint64_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + 
#pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_16; + _destination_base_64 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_8; + _destination_base_64 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_64[u] = *(_source_base_64 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_64 + j) = tmp_var_64[u]; + } else { + break; + } + } + } + } else if (alignment == ALIGNMENT_FLOAT) { + uint32_t *_source_base_32, *_destination_base_32; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_32 = (uint32_t *)(source_base + src_offset); + _destination_base_32 = (uint32_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = 
tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_16; + _destination_base_32 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_8; + _destination_base_32 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_32[u] = *(_source_base_32 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_32 + j) = tmp_var_32[u]; + } else { + break; + } + } + } + } else { + unsigned char *_source_base_8, *_destination_base_8; + + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_8 = (unsigned char *)(source_base + src_offset); + _destination_base_8 = (unsigned char *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_16; + _destination_base_8 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / 
(WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_8; + _destination_base_8 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_8[u] = *(_source_base_8 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_8 + j) = tmp_var_8[u]; + } else { + break; + } + } + } + } + } +} +#endif diff --git a/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu new file mode 100644 index 00000000000..20e3b381994 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_pack_cuda_wrapper.cu @@ -0,0 +1,1277 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + +#include "opal_datatype_cuda_internal.cuh" +#include "opal_datatype_cuda.cuh" + +#include +#include + + +int32_t opal_ddt_generic_simple_pack_function_cuda_vector(opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_packed = 0; /* total amount packed this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + const 
opal_datatype_t *pData = pConvertor->pDesc; + unsigned char *conv_ptr, *iov_ptr; + size_t iov_len_local; + uint32_t iov_count; + uint8_t transfer_required; + uint8_t free_required; + uint32_t count_desc_tmp; + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + int contiguous_loop_flag = 0; + int i; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 2, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", + (void*)pConvertor, (void*)pConvertor->pBaseBuf, + iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ); + + description = pConvertor->use_desc->desc; + + /* For the first step we have to add both displacement to the source. After in the + * main while loop we will set back the conv_ptr to the correct value. This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + pStack--; + pConvertor->stack_pos--; + pElem = &(description[pos_desc]); + + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack start pos_desc %d count_desc %d disp %ld\n" + "stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + + + for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { + if ((iov[iov_count].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + if (iov[iov_count].iov_len == 0) { + iov_len_local = DT_CUDA_BUFFER_SIZE; + } else { + iov_len_local = iov[iov_count].iov_len; + } + + if (iov[iov_count].iov_base == NULL) { + iov[iov_count].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); + iov_ptr = (unsigned 
char *)iov[iov_count].iov_base; + pConvertor->gpu_buffer_ptr = iov_ptr; + free_required = 1; + } else { + iov_ptr = (unsigned char *)iov[iov_count].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + iov_len_local = iov[iov_count].iov_len; + } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE){ + iov_len_local = iov[iov_count].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); + } + transfer_required = 0; + free_required = 1; + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + } else { + iov_len_local = iov[iov_count].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); + } + transfer_required = 1; + free_required = 1; + iov_ptr = pConvertor->gpu_buffer_ptr; + } + } + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + /* should not go into here */ + pack_predefined_data_cuda( pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local ); + if( 0 == count_desc ) { /* completed */ + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + if (contiguous_loop_flag) { + pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; + } + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack end_loop count %d stack_pos %d" + " pos_desc %d disp %ld space %lu\n", + (int)pStack->count, 
pConvertor->stack_pos, + pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + if( --(pStack->count) == 0 ) { /* end of loop */ + if( 0 == pConvertor->stack_pos ) { + /* we lie about the size of the next element in order to + * make sure we exit the main loop. + */ + *out_size = iov_count; + goto complete_loop; /* completed */ + } + pConvertor->stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + pStack->disp += (pData->ub - pData->lb); + } else { + assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H) { + pack_contiguous_loop_cuda_memcpy2d_d2h(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE) { + pack_contiguous_loop_cuda_pipeline(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local, pConvertor->gpu_buffer_ptr); + } else { + pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } else { + contiguous_loop_flag = 1; + } + /* Save the stack with the correct last_count 
value. */ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + if (contiguous_loop_flag) { + count_desc_tmp = count_desc; + } else { + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + } + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + } + complete_loop: + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_packed += iov[iov_count].iov_len; + // printf("iov_len %d, local %d\n", iov[iov_count].iov_len, iov_len_local); + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[iov_count].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); ); +#endif + } + *max_data = total_packed; + pConvertor->bConverted += total_packed; /* update the already converted bytes */ + *out_size = iov_count; + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + DT_CUDA_DEBUG( opal_cuda_output( 0, "Pack total packed %lu\n", pConvertor->bConverted); ); + if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { + printf("free\n"); + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + /* Save the global position for the next round */ + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, + conv_ptr - pConvertor->pBaseBuf ); + DT_CUDA_DEBUG( opal_cuda_output( 4, 
"pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + return 0; +} + +int32_t opal_ddt_generic_simple_pack_function_cuda_vector2(opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_packed = 0; /* total amount packed this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + const opal_datatype_t *pData = pConvertor->pDesc; + unsigned char *conv_ptr, *iov_ptr; + size_t iov_len_local; + uint32_t iov_count; + uint8_t transfer_required; + uint8_t free_required; + uint32_t count_desc_tmp; + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 2, "opal_convertor_generic_simple_pack_cuda_vector( %p:%p, {%p, %lu}, %u, %u )\n", + (void*)pConvertor, (void*)pConvertor->pBaseBuf, + iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ); + + description = pConvertor->use_desc->desc; + + /* For the first step we have to add both displacement to the source. After in the + * main while loop we will set back the conv_ptr to the correct value. 
This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + pStack--; + pConvertor->stack_pos--; + pElem = &(description[pos_desc]); + + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack start pos_desc %d count_desc %d disp %ld\n" + "stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + + + for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { + if ((iov[iov_count].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + if (iov[iov_count].iov_len == 0) { + iov_len_local = DT_CUDA_BUFFER_SIZE; + } else { + iov_len_local = iov[iov_count].iov_len; + } + + if (iov[iov_count].iov_base == NULL) { + iov[iov_count].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); + iov_ptr = (unsigned char *)iov[iov_count].iov_base; + pConvertor->gpu_buffer_ptr = iov_ptr; + free_required = 1; + } else { + iov_ptr = (unsigned char *)iov[iov_count].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + iov_len_local = iov[iov_count].iov_len; + } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE){ + iov_len_local = iov[iov_count].iov_len; + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); + } + transfer_required = 0; + free_required = 1; + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + } else { + iov_len_local = iov[iov_count].iov_len; + if 
(pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov_len_local, 0); + } + transfer_required = 1; + free_required = 1; + iov_ptr = pConvertor->gpu_buffer_ptr; + } + } + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + /* should not go into here */ + pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack end_loop count %d stack_pos %d" + " pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, + pos_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + if( --(pStack->count) == 0 ) { /* end of loop */ + if( 0 == pConvertor->stack_pos ) { + /* we lie about the size of the next element in order to + * make sure we exit the main loop. 
+ */ + *out_size = iov_count; + goto complete_loop; /* completed */ + } + pConvertor->stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + pStack->disp += (pData->ub - pData->lb); + } else { + assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack new_loop count %d stack_pos %d pos_desc %d count_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + count_desc, (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H) { + pack_contiguous_loop_cuda_memcpy2d_d2h(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_PIPELINE) { + pack_contiguous_loop_cuda_pipeline(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local, pConvertor->gpu_buffer_ptr); + } else { + pack_contiguous_loop_cuda(pElem, &count_desc, &conv_ptr, &iov_ptr, &iov_len_local); + } + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. 
*/ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + // conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc_tmp = count_desc; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + } + complete_loop: + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_packed += iov[iov_count].iov_len; +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[iov_count].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", total_time, transfer_required ); ); +#endif + } + *max_data = total_packed; + pConvertor->bConverted += total_packed; /* update the already converted bytes */ + *out_size = iov_count; + DT_CUDA_DEBUG( opal_cuda_output( 2, "Pack total packed %lu\n", total_packed); ); + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { + printf("free\n"); + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + /* Save the global position for the next round */ + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, + conv_ptr - pConvertor->pBaseBuf ); + DT_CUDA_DEBUG( opal_cuda_output( 4, "pack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + return 0; +} + +void pack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, 
+ uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _source = (*SOURCE) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _destination = *(DESTINATION); + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 2, "Pack using contiguous_loop_cuda\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; +// printf("extent %ld, size %ld, count %ld\n", _loop->extent, _end_loop->size, _copy_loops); +#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL + cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); +#else + pack_contiguous_loop_cuda_kernel_global<<<16, 8*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination); +#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + +#if 
defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector packing in %ld microsec\n", total_time ); ); +#endif +} + +/* this function will not be used */ +void pack_contiguous_loop_cuda_pipeline( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE, unsigned char* gpu_buffer ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _source = (*SOURCE) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _destination_host = *(DESTINATION); + unsigned char* _destination_dev = gpu_buffer; + int i, pipeline_blocks; + uint32_t _copy_loops_per_pipeline; + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 2, "Pack using contiguous_loop_cuda_pipeline\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + // _source = pBaseBuf_GPU; + // _destination = (unsigned char*)cuda_desc_h->iov[0].iov_base; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + // tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; + // num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; +// cudaMemcpy2D(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice); + pipeline_blocks = 4; + cuda_streams->current_stream_id = 0; + _copy_loops_per_pipeline = (_copy_loops + pipeline_blocks -1 )/ pipeline_blocks; + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, 
cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops_per_pipeline, _end_loop->size, _loop->extent, _source, _destination_dev); + for (i = 1; i <= pipeline_blocks; i++) { + cudaMemcpyAsync(_destination_host, _destination_dev, _end_loop->size * _copy_loops_per_pipeline, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + _source += _loop->extent * _copy_loops_per_pipeline; + _destination_dev += _end_loop->size * _copy_loops_per_pipeline; + _destination_host += _end_loop->size * _copy_loops_per_pipeline; + if (i == pipeline_blocks) { + _copy_loops_per_pipeline = _copy_loops - _copy_loops_per_pipeline * (pipeline_blocks - 1); + } + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops_per_pipeline, _end_loop->size, _loop->extent, _source, _destination_dev); + } + cudaMemcpyAsync(_destination_host, _destination_dev, _end_loop->size * _copy_loops_per_pipeline, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaDeviceSynchronize(); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector packing in %ld microsec\n", total_time ); ); +#endif +} + +void pack_contiguous_loop_cuda_memcpy2d_d2h( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* 
_end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _source = (*SOURCE) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _destination = *(DESTINATION); + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 2, "Pack using contiguous_loop_cuda_memcpy2d\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + cudaMemcpy2DAsync(_destination, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToHost, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector packing with memcpy2d in %ld microsec\n", total_time ); ); +#endif +} + +void pack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _source = (*SOURCE) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _destination = *(DESTINATION); + unsigned 
char* _destination_dev; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 2, "Pack using contiguous_loop_cuda_zerocopy\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + cudaError_t reg_rv = cudaHostGetDevicePointer((void **)&_destination_dev, (void *) _destination, 0); + if (reg_rv != cudaSuccess) { + const char *cuda_err = cudaGetErrorString(reg_rv); + printf("can not get dev mem, %s\n", cuda_err); + } +#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL + cudaMemcpy2DAsync(_destination_dev, _end_loop->size, _source, _loop->extent, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); +#else + pack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source, _destination_dev); +#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(SOURCE) = _source + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(DESTINATION) = *(DESTINATION) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector packing in %ld microsec\n", total_time ); ); +#endif +} + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + size_t 
buffer_size; + unsigned char *destination; + size_t total_packed; + uint8_t transfer_required, free_required; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + + // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (iov[0].iov_len == 0) { + buffer_size = DT_CUDA_BUFFER_SIZE; + } else { + buffer_size = iov[0].iov_len; + } + + if (iov[0].iov_base == NULL) { + iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char *)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; + free_required = 1; + } else { + destination = (unsigned char *)iov[0].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + } + transfer_required = 1; + free_required = 1; + destination = pConvertor->gpu_buffer_ptr; + } + } + + total_packed = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + + /* start pack */ + if (cuda_iov_cache_enabled) { + opal_ddt_generic_simple_pack_function_cuda_iov_cached(pConvertor, destination, buffer_size, &total_packed); + } else { + opal_ddt_generic_simple_pack_function_cuda_iov_non_cached(pConvertor, destination, buffer_size, &total_packed); + } + + pConvertor->bConverted += total_packed; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + 
cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); +#endif + + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; +} + +#if 0 + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, residue_desc; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + size_t length, buffer_size, length_per_iovec, dst_offset; + unsigned char *destination, *destination_base; + size_t total_packed, total_converted; + int32_t complete_flag = 0; + uint8_t buffer_isfull = 0, transfer_required, free_required; + uint32_t convertor_flags; +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// dt_stack_t* pStack; + uint8_t alignment, orig_alignment; +// int32_t orig_stack_index; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t 
*cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + + /*description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pElem = &(description[pStack->index]); + printf("size elem %lu, size %d\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); + */ + +// assert(opal_datatype_basicDatatypes[pElem->elem.common.type]->size != 0); + + // printf("buffer size %d, max_data %d\n", iov[0].iov_len, *max_data); + if ((iov[0].iov_base == NULL) || opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + if (iov[0].iov_len == 0) { + buffer_size = DT_CUDA_BUFFER_SIZE; + } else { + buffer_size = iov[0].iov_len; + } + + if (iov[0].iov_base == NULL) { + iov[0].iov_base = (unsigned char *)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + destination = (unsigned char *)iov[0].iov_base; + pConvertor->gpu_buffer_ptr = destination; + free_required = 1; + } else { + destination = (unsigned char *)iov[0].iov_base; + free_required = 0; + } + transfer_required = 0; + } else { + buffer_size = iov[0].iov_len; + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + pConvertor->gpu_buffer_ptr = NULL; + transfer_required = 0; + free_required = 0; + cudaHostGetDevicePointer((void **)&destination, (void *)iov[0].iov_base, 0); + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(buffer_size, 0); + } + transfer_required = 1; + free_required = 1; + destination = pConvertor->gpu_buffer_ptr; + } + } + + destination_base = destination; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV non cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + + cuda_iov_count = 1000;//CUDA_NB_IOV; + total_packed = 0; + total_converted = pConvertor->bConverted; + 
cuda_streams->current_stream_id = 0; + convertor_flags = pConvertor->flags; + // orig_stack_index = pStack->index; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); +#endif + + dst_offset = 0; + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + + while (cuda_iov_count > 0) { + + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + for (i = 0; i < cuda_iov_count; i++) { + /* pElem = &(description[orig_stack_index+i]);*/ + if (buffer_size >= cuda_iov[i].iov_len) { + length_per_iovec = cuda_iov[i].iov_len; + } else { + /*orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + } + buffer_size -= length_per_iovec; + total_packed += length_per_iovec; + + /* check alignment */ + if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)destination % 
ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_FLOAT == 0 && (uintptr_t)destination % ALIGNMENT_FLOAT == 0 && length_per_iovec >= ALIGNMENT_FLOAT) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Pack description %d, size %d, residue %d, alignment %d, nb_block_aquired %d\n", i, count_desc, residue_desc, alignment, nb_blocks_per_description); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].src = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; + cuda_iov_dist_h_current[nb_blocks_used].dst = destination; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; + if ( (j+1) * thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block; + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = count_desc - j*thread_per_block; + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + /* handle residue */ + if (residue_desc != 0) { + /*orig_alignment = 
opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].src = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].dst = destination; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert(cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + destination += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Pack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; + assert (nb_blocks_used < CUDA_MAX_NB_BLOCKS*CUDA_IOV_MAX_TASK_PER_BLOCK); + } + + if (buffer_isfull) { + break; + } + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_non_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_pack_cuda_iov_non_cached_kernel<<<nb_blocks, thread_per_block, 0, *cuda_stream_iov>>>(cuda_iov_dist_d_current, nb_blocks_used); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + 
iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + + /* buffer is full */ + if (buffer_isfull) { + size_t total_converted_tmp = total_converted; + pConvertor->flags = convertor_flags; + total_converted += total_packed; + opal_convertor_set_position_nocheck(pConvertor, &total_converted); + total_packed = total_converted - total_converted_tmp; + break; + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + convertor_flags = pConvertor->flags; +// orig_stack_index = pStack->index; + complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Pack complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); +#endif + } + + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (transfer_required) { + cudaMemcpy(iov[0].iov_base, pConvertor->gpu_buffer_ptr, total_packed, cudaMemcpyDeviceToHost); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: DtoH memcpy in %ld microsec, transfer required %d\n", move_time, transfer_required ); ); +#endif + + iov[0].iov_len = total_packed; + *max_data = total_packed; + *out_size = 1; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack total packed %d\n", total_packed); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total packing in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( 
pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; +} + +#endif + + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed) +{ + uint32_t i; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + unsigned char *destination_base, *source_base; + uint8_t buffer_isfull = 0; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t contig_disp = 0; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos, current_ddt_iov_pos; + OPAL_PTRDIFF_TYPE ddt_extent; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end; + long total_time; +#endif + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV non cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + // cuda_streams->current_stream_id = 0; + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + source_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + destination_base = destination; + + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + + while( 
pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + nb_blocks_used = 0; + ddt_iov_start_pos = pConvertor->current_iov_pos; + ddt_iov_end_pos = ddt_iov_start_pos + IOV_PIPELINE_SIZE; + if (ddt_iov_end_pos > ddt_iov_count) { + ddt_iov_end_pos = ddt_iov_count; + } + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaEventSynchronize(cuda_iov_pipeline_block->cuda_event); + opal_cuda_check_error(cuda_err); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, total_packed, &contig_disp, &current_ddt_iov_pos); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_pack_cuda_iov_cached_kernel<<<nb_blocks, thread_per_block, 0, *cuda_stream_iov>>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, source_base, destination_base); + //cudaStreamSynchronize(*cuda_stream_iov); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = 
iov_pipeline_block_id % NB_STREAMS; + destination_base += contig_disp; + + if (!buffer_isfull) { + pConvertor->current_iov_pos = current_ddt_iov_pos; + if (current_ddt_iov_pos == ddt_iov_count) { + pConvertor->current_count ++; + pConvertor->current_iov_pos = 0; + source_base += ddt_extent; + } + } + + } + + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + + return OPAL_SUCCESS; +} + +int32_t opal_ddt_generic_simple_pack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *destination, size_t buffer_size, size_t *total_packed) +{ + uint32_t i; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + unsigned char *destination_base, *source_base; + uint8_t buffer_isfull = 0; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; + cudaStream_t *cuda_stream_iov = NULL; + uint32_t cuda_iov_start_pos, cuda_iov_end_pos; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + uint32_t cached_cuda_iov_count = 0; + opal_datatype_count_t convertor_current_count; + OPAL_PTRDIFF_TYPE ddt_extent; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end; + long total_time; +#endif + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack using IOV cached, GPU base %p, pack to buffer %p\n", pConvertor->pBaseBuf, destination);); + + // cuda_streams->current_stream_id = 0; + destination_base = destination; + thread_per_block = CUDA_WARP_SIZE * 8; + nb_blocks = 16; + source_base = (unsigned char*)pConvertor->pBaseBuf; + + /* cuda iov is not cached, start to cache iov */ + if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_ddt_cache_cuda_iov(pConvertor, &nb_blocks_used) == OPAL_SUCCESS) { + opal_ddt_set_cuda_iov_cached(pConvertor, 
nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack cuda iov is cached, count %d\n", nb_blocks_used);); + } else { + DT_CUDA_DEBUG ( opal_cuda_output(0, "Pack cache cuda iov is failed\n");); + return OPAL_ERROR; + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack cuda iov is cached in %ld microsec, nb_blocks %d\n", total_time, nb_blocks_used); ); +#endif + } + + /* now we use cached cuda iov */ + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; + cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + convertor_current_count = pConvertor->current_count; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { + if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { + *total_packed += cached_cuda_iov_nb_bytes_list_h[i]; + buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; + nb_blocks_used++; + } else { + buffer_isfull = 1; + break; + } + } + if (!buffer_isfull) { + pConvertor->current_count ++; + cuda_iov_start_pos = 0; + cuda_iov_end_pos = cached_cuda_iov->cuda_iov_count; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG 
( opal_cuda_output(2, "[Timing]: Pack to dest %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", destination_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); +#endif + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Pack kernel launched src_base %p, dst_base %p, nb_blocks %ld, extent %ld\n", source_base, destination_base, nb_blocks_used, ddt_extent ); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + opal_generic_simple_pack_cuda_iov_cached_kernel<<<nb_blocks, thread_per_block, 0, *cuda_stream_iov>>>(cached_cuda_iov_dist_d, pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, source_base, destination_base); + pConvertor->current_cuda_iov_pos += nb_blocks_used; + pConvertor->current_cuda_iov_pos = pConvertor->current_cuda_iov_pos % cached_cuda_iov->cuda_iov_count; + + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Pack kernel %ld microsec\n", total_time); ); +#endif + return OPAL_SUCCESS; +} + +void pack_predefined_data_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + uint32_t _copy_count = *(COUNT); + size_t _copy_blength; + ddt_elem_desc_t* _elem = &((ELEM)->elem); + unsigned char* _source = (*SOURCE) + _elem->disp; + uint32_t nb_blocks, tasks_per_block, thread_per_block; + unsigned char* _destination = *(DESTINATION); + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + + _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; + if( (_copy_count * _copy_blength) > *(SPACE) ) { + _copy_count = (uint32_t)(*(SPACE) / _copy_blength); + if( 0 == _copy_count ) return; /* nothing to do */ + } + + + if (*COUNT / 
TASK_PER_THREAD < CUDA_WARP_SIZE) { + thread_per_block = CUDA_WARP_SIZE; + } else if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE * 2) { + thread_per_block = CUDA_WARP_SIZE * 2; + } else if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE * 3) { + thread_per_block = CUDA_WARP_SIZE * 3; + } else { + thread_per_block = CUDA_WARP_SIZE * 5; + } + tasks_per_block = thread_per_block * TASK_PER_THREAD; + nb_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + + // DBGPRINT("num_blocks %d, thread %d\n", nb_blocks, tasks_per_block); + // DBGPRINT( "GPU pack 1. memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); + + pack_contiguous_loop_cuda_kernel_global<<<nb_blocks, thread_per_block, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + _copy_blength *= _copy_count; + *(SOURCE) = _source + _elem->extent*_copy_count - _elem->disp; + *(DESTINATION) += _copy_blength; + *(SPACE) -= _copy_blength; + *(COUNT) -= _copy_count; +#endif + +} + diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu new file mode 100644 index 00000000000..38365013994 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_kernel.cu @@ -0,0 +1,458 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + +#include "opal_datatype_cuda_internal.cuh" +#include <stdio.h> +#include <assert.h> + + +__global__ void opal_generic_simple_unpack_cuda_iov_non_cached_kernel( ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist, int nb_blocks_used) +{ + uint32_t i, _copy_count; + unsigned char *src, *dst; + uint8_t alignment; + unsigned char *_source_tmp, *_destination_tmp; + + __shared__ uint32_t nb_tasks; + + if (threadIdx.x == 0) { + nb_tasks = 
nb_blocks_used / gridDim.x; + if (blockIdx.x < nb_blocks_used % gridDim.x) { + nb_tasks ++; + } + } + __syncthreads(); + + for (i = 0; i < nb_tasks; i++) { + src = cuda_iov_dist[blockIdx.x + i * gridDim.x].src; + dst = cuda_iov_dist[blockIdx.x + i * gridDim.x].dst; + _copy_count = cuda_iov_dist[blockIdx.x + i * gridDim.x].nb_elements; + alignment = cuda_iov_dist[blockIdx.x + i * gridDim.x].element_alignment; + + if (threadIdx.x < _copy_count) { + _source_tmp = src + threadIdx.x * alignment; + _destination_tmp = dst + threadIdx.x * alignment; +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + if (alignment == ALIGNMENT_DOUBLE) { + *((long *)_destination_tmp) = *((long *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((int *)_destination_tmp) = *((int *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } + // printf("src %p, %1.f | dst %p, %1.f\n", _source_tmp, *_source_tmp, _destination_tmp, *_destination_tmp); +#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + } + } +} + +#if 0 +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +{ + uint32_t i, j; + size_t dst_offset, src_offset; + unsigned char *_source_tmp, *_destination_tmp; + uint32_t _nb_bytes; + uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t source_partial_disp = 0; + size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; + + __shared__ uint32_t nb_tasks; + uint32_t copy_count; + uint8_t alignment; + + if (threadIdx.x == 0) { + nb_tasks = nb_blocks_used / gridDim.x; + if (blockIdx.x < nb_blocks_used % gridDim.x) { + 
nb_tasks ++; + } + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d, ddt_size %d\n", cuda_iov_count, ddt_extent, current_count, ddt_size); + } + __syncthreads(); + + if (cuda_iov_partial_length_start != 0) { + source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + } + + for (i = 0; i < nb_tasks; i++) { + /* these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp - source_partial_disp; + dst_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; + + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp; + dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; + _nb_bytes = cuda_iov_partial_length_start; + } else if (i == nb_tasks-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { + _nb_bytes = cuda_iov_partial_length_end; + } + + _destination_tmp = destination_base + dst_offset; + _source_tmp = source_base + src_offset; + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + copy_count = 
_nb_bytes / alignment; + /* + if (threadIdx.x == 0 && nb_tasks != 0) { + printf("unpack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ + for (j = threadIdx.x; j < copy_count; j += blockDim.x) { +/* if (threadIdx.x == 0) { + if (copy_count > blockDim.x) printf("copy_count %d, dim %d\n", copy_count, blockDim.x); + }*/ + if (j < copy_count) { + _source_tmp = source_base + src_offset + j * alignment; + _destination_tmp = destination_base + dst_offset + j * alignment; + /* if (threadIdx.x == 0) { + printf("_src %p, dst %p, alignment %d, blk %d, j %d, count %d\n", _source_tmp, _destination_tmp, alignment, blockIdx.x, j, copy_count); + }*/ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + if (alignment == ALIGNMENT_DOUBLE) { + *((long *)_destination_tmp) = *((long *)_source_tmp); + } else if (alignment == ALIGNMENT_FLOAT) { + *((int *)_destination_tmp) = *((int *)_source_tmp); + } else { + * _destination_tmp = *_source_tmp; + } + // printf("src %p, %1.f | dst %p, %1.f\n", _source_tmp, *_source_tmp, _destination_tmp, *_destination_tmp); +#endif /* ! 
OPAL_DATATYPE_CUDA_DRY_RUN */ + } + } + } +} + +#else +__global__ void opal_generic_simple_unpack_cuda_iov_cached_kernel( ddt_cuda_iov_dist_cached_t* cuda_iov_dist, uint32_t cuda_iov_pos, uint32_t cuda_iov_count, uint32_t ddt_extent, uint32_t current_count, int nb_blocks_used, unsigned char* destination_base, unsigned char* source_base, size_t cuda_iov_partial_length_start, size_t cuda_iov_partial_length_end) +{ + uint32_t i, j; + size_t dst_offset, src_offset; + unsigned char *_source_tmp, *_destination_tmp; + uint32_t _nb_bytes; + uint32_t current_cuda_iov_pos = cuda_iov_pos; + size_t source_disp = cuda_iov_dist[current_cuda_iov_pos].contig_disp; + size_t source_partial_disp = 0; + size_t contig_disp; + uint32_t _my_cuda_iov_pos; + uint32_t _my_cuda_iov_iteration; + size_t ddt_size = cuda_iov_dist[cuda_iov_count].contig_disp; + + __shared__ uint32_t nb_tasks_per_block; + __shared__ uint32_t WARP_SIZE; + __shared__ uint32_t nb_warp_per_block; + uint32_t copy_count; + uint8_t alignment; + uint64_t tmp_var_64[KERNEL_UNROLL]; + uint32_t tmp_var_32[KERNEL_UNROLL]; + unsigned char tmp_var_8[KERNEL_UNROLL]; + uint32_t u, k; + uint32_t copy_count_16, copy_count_8, copy_count_left; + + if (threadIdx.x == 0) { + nb_tasks_per_block = nb_blocks_used / gridDim.x; + if (blockIdx.x < nb_blocks_used % gridDim.x) { + nb_tasks_per_block ++; + } + WARP_SIZE = 32; + nb_warp_per_block = blockDim.x / WARP_SIZE; + // printf("cuda_iov_count %d, ddt_extent %d, current_count %d, ddt_size %d\n", cuda_iov_count, ddt_extent, current_count, ddt_size); + } + __syncthreads(); + + const uint32_t warp_id_per_block = threadIdx.x / WARP_SIZE; + const uint32_t tid_per_warp = threadIdx.x & (WARP_SIZE - 1); + + if (cuda_iov_partial_length_start != 0) { + source_partial_disp = (cuda_iov_dist[current_cuda_iov_pos+1].contig_disp - cuda_iov_dist[current_cuda_iov_pos].contig_disp) - cuda_iov_partial_length_start; + } + + for (i = warp_id_per_block; i < nb_tasks_per_block; i+= nb_warp_per_block) { + /* 
these 3 variables are used multiple times, so put in in register */ + _my_cuda_iov_pos = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) % cuda_iov_count; + _my_cuda_iov_iteration = (blockIdx.x + i * gridDim.x + current_cuda_iov_pos) / cuda_iov_count; + contig_disp = cuda_iov_dist[_my_cuda_iov_pos].contig_disp; + + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp - source_partial_disp; + dst_offset = cuda_iov_dist[_my_cuda_iov_pos].ncontig_disp + (_my_cuda_iov_iteration + current_count) * ddt_extent; + _nb_bytes = cuda_iov_dist[_my_cuda_iov_pos + 1].contig_disp - contig_disp; + + if (i == 0 && blockIdx.x == 0 && cuda_iov_partial_length_start != 0) { + src_offset = contig_disp + ddt_size * _my_cuda_iov_iteration - source_disp; + dst_offset = dst_offset + _nb_bytes - cuda_iov_partial_length_start; + _nb_bytes = cuda_iov_partial_length_start; + } else if (i == nb_tasks_per_block-1 && (blockIdx.x == (nb_blocks_used-1) % gridDim.x) && cuda_iov_partial_length_end != 0) { + _nb_bytes = cuda_iov_partial_length_end; + } + + _destination_tmp = destination_base + dst_offset; + _source_tmp = source_base + src_offset; + if ((uintptr_t)(_destination_tmp) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_DOUBLE == 0 && _nb_bytes % ALIGNMENT_DOUBLE == 0) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(_destination_tmp) % ALIGNMENT_FLOAT == 0 && (uintptr_t)(_source_tmp) % ALIGNMENT_FLOAT == 0 && _nb_bytes % ALIGNMENT_FLOAT == 0) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + //alignment = ALIGNMENT_DOUBLE; + copy_count = _nb_bytes / alignment; + /* + if (threadIdx.x == 0 && nb_tasks != 0) { + printf("unpack block %d, src_offset %ld, dst_offset %ld, count %d, nb_bytes %d, nb_tasks %d, i %d\n", blockIdx.x, src_offset, dst_offset, copy_count, _nb_bytes, nb_tasks, i); + } + __syncthreads(); + */ + if (alignment == ALIGNMENT_DOUBLE) { + uint64_t *_source_base_64, *_destination_base_64; + 
copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_64 = (uint64_t *)(source_base + src_offset); + _destination_base_64 = (uint64_t *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_16; + _destination_base_64 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_64[u] = *(_source_base_64 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_64 + j) = tmp_var_64[u]; + + } + } + } + _source_base_64 += copy_count_8; + _destination_base_64 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_64[u] = *(_source_base_64 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_64 + j) = tmp_var_64[u]; + } else { + break; + } + } + } + } else if (alignment == ALIGNMENT_FLOAT) { + uint32_t *_source_base_32, *_destination_base_32; + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_32 = (uint32_t *)(source_base + src_offset); + _destination_base_32 = (uint32_t *)(destination_base + 
dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_16; + _destination_base_32 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_32[u] = *(_source_base_32 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_32 + j) = tmp_var_32[u]; + + } + } + } + _source_base_32 += copy_count_8; + _destination_base_32 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_32[u] = *(_source_base_32 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_32 + j) = tmp_var_32[u]; + } else { + break; + } + } + } + } else { + unsigned char *_source_base_8, *_destination_base_8; + + copy_count_16 = copy_count / (WARP_SIZE * UNROLL_16) * (WARP_SIZE * UNROLL_16); + _source_base_8 = (unsigned char *)(source_base + src_offset); + _destination_base_8 = (unsigned char *)(destination_base + dst_offset); + if (copy_count_16 > 0) { + for (k = 0; k < copy_count_16; k += UNROLL_16 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = 
*(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_16; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_16; + _destination_base_8 += copy_count_16; + + copy_count_8 = (copy_count - copy_count_16) / (WARP_SIZE * UNROLL_8) * (WARP_SIZE * UNROLL_8); + if (copy_count_8 > 0) { + for (k = 0; k < copy_count_8; k += UNROLL_8 * WARP_SIZE) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + tmp_var_8[u] = *(_source_base_8 + j); + + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE + k; + *(_destination_base_8 + j) = tmp_var_8[u]; + + } + } + } + _source_base_8 += copy_count_8; + _destination_base_8 += copy_count_8; + + copy_count_left = copy_count - copy_count_16 - copy_count_8; + if (copy_count_left > 0) { + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + tmp_var_8[u] = *(_source_base_8 + j); + } else { + break; + } + } + #pragma unroll + for (u = 0; u < UNROLL_8; u++) { + j = tid_per_warp + u * WARP_SIZE; + if (j < copy_count_left) { + *(_destination_base_8 + j) = tmp_var_8[u]; + } else { + break; + } + } + } + } + } +} + +#endif + +__global__ void unpack_contiguous_loop_cuda_kernel_global( uint32_t copy_loops, + size_t size, + OPAL_PTRDIFF_TYPE extent, + unsigned char* source, + unsigned char* destination ) +{ + uint32_t _i, tid, num_threads; + uint32_t gap, nb_elements; + double *_source_tmp, *_destination_tmp, *_dst_disp_tmp;; + + tid = threadIdx.x + blockIdx.x * blockDim.x; + num_threads = gridDim.x * blockDim.x; + + gap = (extent - size) / 8; + nb_elements = size / 8; + _dst_disp_tmp = (double*)destination; + _source_tmp = (double*)source; + _destination_tmp = _dst_disp_tmp + tid; + _source_tmp += tid; + + for (_i = tid; _i < copy_loops*nb_elements; _i+=num_threads) { + _destination_tmp = _dst_disp_tmp + tid + 
_i/num_threads*num_threads + _i/nb_elements * gap; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + // if (_i % nb_elements == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)size, _i/nb_elements, _i ); + // } + // if (_i / nb_elements ==1 && tid == 0 ) { + // DBGPRINT("tid %d, pack 3. memcpy( %p, %p, %lu ) => space %lu, _i %d, actual _i %d\n", + // tid, _destination_tmp, _source_tmp, (unsigned long)_end_loop->size, (unsigned long)(*(SPACE) - _i/nb_elements * _end_loop->size), _i/nb_elements, _i ); + // } +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ +#if !defined (OPAL_DATATYPE_CUDA_DRY_RUN) + *_destination_tmp = *_source_tmp; +#endif /* ! OPAL_DATATYPE_CUDA_DRY_RUN */ + _source_tmp += num_threads; + } +} diff --git a/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu new file mode 100644 index 00000000000..9be53d2d5a7 --- /dev/null +++ b/opal/datatype/cuda/opal_datatype_unpack_cuda_wrapper.cu @@ -0,0 +1,1128 @@ +#include "opal/datatype/opal_convertor_internal.h" +#include "opal/datatype/opal_datatype_internal.h" + +#include "opal_datatype_cuda_internal.cuh" +#include "opal_datatype_cuda.cuh" + +#include +#include + + +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of items already done in the actual pos_desc */ + size_t total_unpacked = 0; /* total size unpacked this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + const opal_datatype_t *pData = pConvertor->pDesc; + unsigned char *conv_ptr, *iov_ptr; + size_t iov_len_local; + uint32_t iov_count; + uint8_t free_required; + uint32_t count_desc_tmp; + + 
ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + int contiguous_loop_flag = 0; + int i; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 2, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u , %u)\n", + (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ) + + description = pConvertor->use_desc->desc; + + /* For the first step we have to add both displacement to the source. After in the + * main while loop we will set back the source_base to the correct value. This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + pStack--; + pConvertor->stack_pos--; + pElem = &(description[pos_desc]); + + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack start pos_desc %d count_desc %d disp %ld\n" + "stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); + + for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_ddt_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + free_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); + } + iov_ptr = pConvertor->gpu_buffer_ptr; + cudaMemcpy(iov_ptr, iov[iov_count].iov_base, 
iov[iov_count].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); ); +#endif + iov_len_local = iov[iov_count].iov_len; + cudaDeviceSynchronize(); + if( 0 != pConvertor->partial_length ) { + /* not support yet */ + } + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + /* should not go to here */ + unpack_predefined_data_cuda( pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local ); + if( 0 == count_desc ) { /* completed */ + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + pos_desc++; /* advance to the next data */ + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + if (contiguous_loop_flag) { + pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; + } + assert( pElem->elem.common.type < OPAL_DATATYPE_MAX_PREDEFINED ); + if( 0 != iov_len_local ) { + assert(0); + } + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + (long)pStack->disp, (unsigned long)iov_len_local ); ); + if( --(pStack->count) == 0 ) { /* end of loop */ + if( 0 == pConvertor->stack_pos ) { + /* Do the same thing as when the loop is completed */ + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_unpacked += iov[iov_count].iov_len; + iov_count++; /* go to the next */ + goto complete_conversion; + } + pConvertor->stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + 
pStack->disp += (pData->ub - pData->lb); + } else { + assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += description[pStack->index].loop.extent; + } + } + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H) { + unpack_contiguous_loop_cuda_memcpy2d_d2h(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + unpack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } else { + unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } else { + contiguous_loop_flag = 1; + } + /* Save the stack with the correct last_count value. 
*/ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + if (contiguous_loop_flag) { + count_desc_tmp = count_desc; + } else { + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + } + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + } + complete_loop: + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_unpacked += iov[iov_count].iov_len; + } + complete_conversion: + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + *max_data = total_unpacked; + pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ + *out_size = iov_count; + if( pConvertor->bConverted == pConvertor->remote_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack total unpacked %lu\n", pConvertor->bConverted); ); + if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + /* Save the global position for the next round */ + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, + conv_ptr - pConvertor->pBaseBuf ); + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + return 0; +} + +int32_t opal_ddt_generic_simple_unpack_function_cuda_vector2( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ) +{ + dt_stack_t* pStack; /* pointer to the position on the stack */ + uint32_t pos_desc; /* actual position in the description of the derived datatype */ + uint32_t count_desc; /* the number of 
items already done in the actual pos_desc */ + size_t total_unpacked = 0; /* total size unpacked this time */ + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + const opal_datatype_t *pData = pConvertor->pDesc; + unsigned char *conv_ptr, *iov_ptr; + size_t iov_len_local; + uint32_t iov_count; + uint8_t free_required; + uint32_t count_desc_tmp; + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 2, "opal_convertor_generic_simple_unpack_vector( %p, {%p, %lu}, %u , %u)\n", + (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size, *max_data ); ) + + description = pConvertor->use_desc->desc; + + /* For the first step we have to add both displacement to the source. After in the + * main while loop we will set back the source_base to the correct value. This is + * due to the fact that the convertor can stop in the middle of a data with a count + */ + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc = (uint32_t)pStack->count; + pStack--; + pConvertor->stack_pos--; + pElem = &(description[pos_desc]); + + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack start pos_desc %d count_desc %d disp %ld\n" + "stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pos_desc, count_desc, (long)(conv_ptr - pConvertor->pBaseBuf), + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)(pStack->disp) ); ); + + for( iov_count = 0; iov_count < (*out_size); iov_count++ ) { +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_ddt_cuda_is_gpu_buffer(iov[iov_count].iov_base)) { + iov_ptr = (unsigned char*)iov[iov_count].iov_base; + free_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H || OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + iov_ptr = (unsigned 
char*)iov[iov_count].iov_base; + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[iov_count].iov_len, 0); + } + iov_ptr = pConvertor->gpu_buffer_ptr; + cudaMemcpy(iov_ptr, iov[iov_count].iov_base, iov[iov_count].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", total_time, free_required ); ); +#endif + iov_len_local = iov[iov_count].iov_len; + if( 0 != pConvertor->partial_length ) { + /* not support yet */ + } + while( 1 ) { + while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { + /* now here we have a basic datatype */ + /* should not go to here */ + pStack--; + pConvertor->stack_pos--; + pos_desc --; + pElem = &(description[pos_desc]); + count_desc = count_desc_tmp; + goto complete_loop; + } + if( OPAL_DATATYPE_END_LOOP == pElem->elem.common.type ) { /* end of the current loop */ + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack end_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + (long)pStack->disp, (unsigned long)iov_len_local ); ); + if( --(pStack->count) == 0 ) { /* end of loop */ + if( 0 == pConvertor->stack_pos ) { + /* Do the same thing as when the loop is completed */ + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_unpacked += iov[iov_count].iov_len; + iov_count++; /* go to the next */ + goto complete_conversion; + } + pConvertor->stack_pos--; + pStack--; + pos_desc++; + } else { + pos_desc = pStack->index + 1; + if( pStack->index == -1 ) { + pStack->disp += (pData->ub - pData->lb); + } else { + assert( OPAL_DATATYPE_LOOP == description[pStack->index].loop.common.type ); + pStack->disp += 
description[pStack->index].loop.extent; + } + } + conv_ptr = pConvertor->pBaseBuf + pStack->disp; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack new_loop count %d stack_pos %d pos_desc %d disp %ld space %lu\n", + (int)pStack->count, pConvertor->stack_pos, pos_desc, + (long)pStack->disp, (unsigned long)iov_len_local ); ); + } + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; + if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { + if (OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_D2H) { + unpack_contiguous_loop_cuda_memcpy2d_d2h(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } else if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + unpack_contiguous_loop_cuda_zerocopy(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } else { + unpack_contiguous_loop_cuda(pElem, &count_desc, &iov_ptr, &conv_ptr, &iov_len_local); + } + if( 0 == count_desc ) { /* completed */ + pos_desc += pElem->loop.items + 1; + goto update_loop_description; + } + /* Save the stack with the correct last_count value. 
*/ + } + local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr - local_disp; + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, OPAL_DATATYPE_LOOP, count_desc, + pStack->disp + local_disp); + pos_desc++; + update_loop_description: /* update the current state */ + // conv_ptr = pConvertor->pBaseBuf + pStack->disp; + count_desc_tmp = count_desc; + UPDATE_INTERNAL_COUNTERS( description, pos_desc, pElem, count_desc ); + continue; + } + } + complete_loop: + iov[iov_count].iov_len -= iov_len_local; /* update the amount of valid data */ + total_unpacked += iov[iov_count].iov_len; + } + complete_conversion: + *max_data = total_unpacked; + pConvertor->bConverted += total_unpacked; /* update the already converted bytes */ + *out_size = iov_count; + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack total unpacked %lu\n", total_unpacked); ); + if( pConvertor->bConverted == pConvertor->remote_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required == 1) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + /* Save the global position for the next round */ + PUSH_STACK( pStack, pConvertor->stack_pos, pos_desc, pElem->elem.common.type, count_desc, + conv_ptr - pConvertor->pBaseBuf ); + DT_CUDA_DEBUG( opal_cuda_output( 4, "unpack save stack stack_pos %d pos_desc %d count_desc %d disp %ld\n", + pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); + return 0; +} + + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + size_t buffer_size; + unsigned char *source; + size_t total_unpacked; + uint8_t free_required = 0; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +#if 
defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + source = (unsigned char*)iov[0].iov_base; + free_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { + if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + } + source = pConvertor->gpu_buffer_ptr; + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); +#endif + + + buffer_size = iov[0].iov_len; + total_unpacked = 0; + + /* start unpack */ + if (cuda_iov_cache_enabled) { + opal_ddt_generic_simple_unpack_function_cuda_iov_cached(pConvertor, source, buffer_size, &total_unpacked); + } else { + opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached(pConvertor, source, buffer_size, &total_unpacked); + } + + pConvertor->bConverted += total_unpacked; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); + + iov[0].iov_len = total_unpacked; + *max_data = total_unpacked; + *out_size = 1; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + 
pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; +} + +#if 0 +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, + struct iovec* iov, + uint32_t* out_size, + size_t* max_data ) +{ + uint32_t i, j; + uint32_t count_desc, nb_blocks_per_description, dst_offset, residue_desc; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + size_t length, buffer_size, length_per_iovec; + unsigned char *source, *source_base; + size_t total_unpacked, total_converted; + int32_t complete_flag = 0; + uint8_t buffer_isfull = 0; + uint8_t free_required = 0; + uint32_t convertor_flags; +// dt_elem_desc_t* description; +// dt_elem_desc_t* pElem; +// dt_stack_t* pStack; + uint8_t alignment, orig_alignment; +// int32_t orig_stack_index; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_non_cached_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time, move_time; +#endif + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start_total); +#endif + +/* description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pElem = &(description[pStack->index]); + printf("size elem %d, size %lu\n", pElem->elem.common.type, opal_datatype_basicDatatypes[pElem->elem.common.type]->size); +*/ + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_ddt_cuda_is_gpu_buffer(iov[0].iov_base)) { + source = (unsigned char*)iov[0].iov_base; + free_required = 0; + } else { + if (OPAL_DATATYPE_VECTOR_USE_ZEROCPY) { + cudaHostGetDevicePointer((void **)&source, (void *)iov[0].iov_base, 0); + pConvertor->gpu_buffer_ptr = NULL; + free_required = 0; + } else { 
+ if (pConvertor->gpu_buffer_ptr == NULL) { + pConvertor->gpu_buffer_ptr = (unsigned char*)opal_ddt_cuda_malloc_gpu_buffer(iov[0].iov_len, 0); + } + source = pConvertor->gpu_buffer_ptr; + cudaMemcpy(source, iov[0].iov_base, iov[0].iov_len, cudaMemcpyHostToDevice); + free_required = 1; + } + } + + source_base = source; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV non cached, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, iov[0].iov_len); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + move_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: HtoD memcpy in %ld microsec, free required %d\n", move_time, free_required ); ); +#endif + +// cuda_err = cudaEventRecord(current_cuda_device->memcpy_event, current_cuda_device->cuda_streams->opal_cuda_stream[0]); +// opal_cuda_check_error(cuda_err); + + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + buffer_size = iov[0].iov_len; + cuda_iov_count = 1000; + total_unpacked = 0; + total_converted = pConvertor->bConverted; + cuda_streams->current_stream_id = 0; + convertor_flags = pConvertor->flags; +// orig_stack_index = pStack->index; + complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack complete flag %d, iov count %d, length %d, submit to CUDA stream %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id); ); + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); +#endif + + dst_offset = 0; + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + + while (cuda_iov_count > 0) { + + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_dist_h_current = 
cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaStreamWaitEvent(*cuda_stream_iov, cuda_iov_pipeline_block->cuda_event, 0); + opal_cuda_check_error(cuda_err); + + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + for (i = 0; i < cuda_iov_count; i++) { +// pElem = &(description[orig_stack_index+i]); + if (buffer_size >= cuda_iov[i].iov_len) { + length_per_iovec = cuda_iov[i].iov_len; + } else { + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + length_per_iovec = buffer_size / orig_alignment * orig_alignment; + buffer_isfull = 1; + } + buffer_size -= length_per_iovec; + total_unpacked += length_per_iovec; + + /* check alignment */ + if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_DOUBLE == 0 && (uintptr_t)source % ALIGNMENT_DOUBLE == 0 && length_per_iovec >= ALIGNMENT_DOUBLE) { + alignment = ALIGNMENT_DOUBLE; + } else if ((uintptr_t)(cuda_iov[i].iov_base) % ALIGNMENT_FLOAT == 0 && (uintptr_t)source % ALIGNMENT_FLOAT == 0 && length_per_iovec >= ALIGNMENT_FLOAT) { + alignment = ALIGNMENT_FLOAT; + } else { + alignment = ALIGNMENT_CHAR; + } + + //alignment = ALIGNMENT_DOUBLE; + + count_desc = length_per_iovec / alignment; + residue_desc = length_per_iovec % alignment; + nb_blocks_per_description = (count_desc + thread_per_block - 1) / thread_per_block; + DT_CUDA_DEBUG ( opal_cuda_output(10, "Unpack description %d, size %d, residue %d, alignment %d\n", i, count_desc, residue_desc, alignment); ); + for (j = 0; j < nb_blocks_per_description; j++) { + cuda_iov_dist_h_current[nb_blocks_used].dst = (unsigned char *)(cuda_iov[i].iov_base) + j * thread_per_block * alignment; + cuda_iov_dist_h_current[nb_blocks_used].src = source; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = alignment; + if ( (j+1) * 
thread_per_block <= count_desc) { + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = thread_per_block;// * sizeof(double); + } else { + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (thread_per_block - ((j+1)*thread_per_block - count_desc));// * sizeof(double); + } +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + nb_blocks_used ++; + } + + /* handle residue */ + if (residue_desc != 0) { + /* orig_alignment = opal_datatype_basicDatatypes[pElem->elem.common.type]->size;*/ + orig_alignment = ALIGNMENT_CHAR; + cuda_iov_dist_h_current[nb_blocks_used].dst = (unsigned char *)(cuda_iov[i].iov_base) + length_per_iovec / alignment * alignment; + cuda_iov_dist_h_current[nb_blocks_used].src = source; + cuda_iov_dist_h_current[nb_blocks_used].element_alignment = orig_alignment; + cuda_iov_dist_h_current[nb_blocks_used].nb_elements = (length_per_iovec - length_per_iovec / alignment * alignment) / orig_alignment; +#if defined (OPAL_DATATYPE_CUDA_DEBUG) + assert (cuda_iov_dist_h_current[nb_blocks_used].nb_elements > 0); +#endif /* OPAL_DATATYPE_CUDA_DEBUG */ + source += cuda_iov_dist_h_current[nb_blocks_used].nb_elements * orig_alignment; + DT_CUDA_DEBUG( opal_cuda_output(12, "Unpack \tblock %d, src %p, dst %p, nb_elements %d, alignment %d\n", nb_blocks_used, cuda_iov_dist_h_current[nb_blocks_used].src, cuda_iov_dist_h_current[nb_blocks_used].dst, cuda_iov_dist_h_current[nb_blocks_used].nb_elements, cuda_iov_dist_h_current[nb_blocks_used].element_alignment); ); + 
nb_blocks_used ++; + } + + if (buffer_isfull) { + break; + } + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, total_time, cuda_iov_pipeline_block->cuda_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_non_cached_t)*(nb_blocks_used), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_non_cached_kernel<<<nb_blocks, thread_per_block, 0, *cuda_stream_iov>>>(cuda_iov_dist_d_current, nb_blocks_used); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + + /* buffer is full */ + if (buffer_isfull) { + size_t total_converted_tmp = total_converted; + pConvertor->flags = convertor_flags; + total_converted += total_unpacked; + opal_convertor_set_position_nocheck(pConvertor, &total_converted); + total_unpacked = total_converted - total_converted_tmp; + break; + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + convertor_flags = pConvertor->flags; +// orig_stack_index = pStack->index; + complete_flag = opal_convertor_raw( pConvertor, cuda_iov, &cuda_iov_count, &length ); + DT_CUDA_DEBUG ( opal_cuda_output(4, "Unpack complete flag %d, iov count %d, length %d, submit to CUDA stream %d, nb_blocks %d\n", complete_flag, cuda_iov_count, length, cuda_streams->current_stream_id, nb_blocks_used); ); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: ddt to iov in %ld microsec\n", total_time ); ); +#endif + + } + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->opal_cuda_stream[i]); + } + + 
iov[0].iov_len = total_unpacked; + *max_data = total_unpacked; + *out_size = 1; + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack total unpacked %d\n", total_unpacked); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end_total ); + total_time = ELAPSED_TIME( start_total, end_total ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: total unpacking in %ld microsec, kernel %ld microsec\n", total_time, total_time-move_time ); ); +#endif + + if( pConvertor->bConverted == pConvertor->local_size ) { + pConvertor->flags |= CONVERTOR_COMPLETED; + if (pConvertor->gpu_buffer_ptr != NULL && free_required) { + opal_ddt_cuda_free_gpu_buffer(pConvertor->gpu_buffer_ptr, 0); + pConvertor->gpu_buffer_ptr = NULL; + } + return 1; + } + return 0; +} + +#endif + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_non_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked) +{ + uint32_t i; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + unsigned char *source_base, *destination_base; + uint8_t buffer_isfull = 0; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_h_current; + ddt_cuda_iov_dist_cached_t* cuda_iov_dist_d_current; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block; + int iov_pipeline_block_id = 0; + cudaStream_t *cuda_stream_iov = NULL; + const struct iovec *ddt_iov = NULL; + uint32_t ddt_iov_count = 0; + size_t contig_disp = 0; + uint32_t ddt_iov_start_pos, ddt_iov_end_pos, current_ddt_iov_pos; + OPAL_PTRDIFF_TYPE ddt_extent; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end; + long total_time; +#endif + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV non cached, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, buffer_size); ); + + opal_convertor_raw_cached( pConvertor, &ddt_iov, &ddt_iov_count); + if (ddt_iov == NULL) { + DT_CUDA_DEBUG ( 
opal_cuda_output(0, "Can not get ddt iov\n");); + return OPAL_ERROR; + } + + // cuda_streams->current_stream_id = 0; + thread_per_block = CUDA_WARP_SIZE * 5; + nb_blocks = 256; + source_base = source; + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + opal_ddt_set_ddt_iov_position(pConvertor, pConvertor->bConverted, ddt_iov, ddt_iov_count); + destination_base = (unsigned char*)pConvertor->pBaseBuf + pConvertor->current_count * ddt_extent; + + for (i = 0; i < NB_STREAMS; i++) { + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[i]); + } + + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + + nb_blocks_used = 0; + ddt_iov_start_pos = pConvertor->current_iov_pos; + ddt_iov_end_pos = ddt_iov_start_pos + IOV_PIPELINE_SIZE; + if (ddt_iov_end_pos > ddt_iov_count) { + ddt_iov_end_pos = ddt_iov_count; + } + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[iov_pipeline_block_id]; + cuda_iov_pipeline_block->cuda_stream = &(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + cuda_iov_dist_h_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_h; + cuda_iov_dist_d_current = cuda_iov_pipeline_block->cuda_iov_dist_non_cached_d; + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + cuda_err = cudaEventSynchronize(cuda_iov_pipeline_block->cuda_event); + opal_cuda_check_error(cuda_err); + + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + buffer_isfull = opal_ddt_iov_to_cuda_iov(pConvertor, ddt_iov, cuda_iov_dist_h_current, ddt_iov_start_pos, ddt_iov_end_pos, &buffer_size, &nb_blocks_used, total_unpacked, &contig_disp, &current_ddt_iov_pos); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p to dest %p, iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks_used %d\n", source_base, destination_base, total_time, 
cuda_streams->current_stream_id, nb_blocks_used); ); +#endif + + cudaMemcpyAsync(cuda_iov_dist_d_current, cuda_iov_dist_h_current, sizeof(ddt_cuda_iov_dist_cached_t)*(nb_blocks_used+1), cudaMemcpyHostToDevice, *cuda_stream_iov); + opal_generic_simple_unpack_cuda_iov_cached_kernel<<<nb_blocks, thread_per_block, 0, *cuda_stream_iov>>>(cuda_iov_dist_d_current, 0, nb_blocks_used, 0, 0, nb_blocks_used, destination_base, source_base, 0, 0); + //cudaStreamSynchronize(*cuda_stream_iov); + cuda_err = cudaEventRecord(cuda_iov_pipeline_block->cuda_event, *cuda_stream_iov); + opal_cuda_check_error(cuda_err); + iov_pipeline_block_id ++; + iov_pipeline_block_id = iov_pipeline_block_id % NB_STREAMS; + source_base += contig_disp; + if (!buffer_isfull) { + pConvertor->current_iov_pos = current_ddt_iov_pos; + if (current_ddt_iov_pos == ddt_iov_count) { + pConvertor->current_count ++; + pConvertor->current_iov_pos = 0; + destination_base += ddt_extent; + } + } + } + + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + + return OPAL_SUCCESS; +} + +int32_t opal_ddt_generic_simple_unpack_function_cuda_iov_cached( opal_convertor_t* pConvertor, unsigned char *source, size_t buffer_size, size_t *total_unpacked) +{ + uint32_t i; + uint32_t nb_blocks, thread_per_block, nb_blocks_used; + unsigned char *source_base, *destination_base; + uint8_t buffer_isfull = 0; + cudaError_t cuda_err; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + ddt_cuda_iov_pipeline_block_t *cuda_iov_pipeline_block = NULL; + cudaStream_t *cuda_stream_iov = NULL; + uint32_t cuda_iov_start_pos, cuda_iov_end_pos; + ddt_cuda_iov_total_cached_t* cached_cuda_iov = NULL; + ddt_cuda_iov_dist_cached_t* cached_cuda_iov_dist_d = NULL; + uint32_t *cached_cuda_iov_nb_bytes_list_h = NULL; + uint32_t cached_cuda_iov_count = 0; + size_t cuda_iov_partial_length_start = 0; + size_t cuda_iov_partial_length_end = 0; + opal_datatype_count_t convertor_current_count; + OPAL_PTRDIFF_TYPE ddt_extent; + +#if 
defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end; + long total_time; +#endif + + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack using IOV cached, GPU base %p, unpack from buffer %p, total size %ld\n", + pConvertor->pBaseBuf, source, buffer_size); ); + +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + + // cuda_streams->current_stream_id = 0; + source_base = source; + thread_per_block = CUDA_WARP_SIZE * 8; + nb_blocks = 64; + destination_base = (unsigned char*)pConvertor->pBaseBuf; + + /* cuda iov is not cached, start to cache iov */ + if(opal_ddt_cuda_iov_is_cached(pConvertor) == 0) { +#if defined (OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + if (opal_ddt_cache_cuda_iov(pConvertor, &nb_blocks_used) == OPAL_SUCCESS) { + opal_ddt_set_cuda_iov_cached(pConvertor, nb_blocks_used); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack cuda iov is cached, count %d\n", nb_blocks_used);); + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack cuda iov is cached in %ld microsec, nb_blocks_used %d\n", total_time, nb_blocks_used); ); +#endif + } + + /* now we use cached cuda iov */ + opal_ddt_get_cached_cuda_iov(pConvertor, &cached_cuda_iov); + cached_cuda_iov_dist_d = cached_cuda_iov->cuda_iov_dist_d; + assert(cached_cuda_iov_dist_d != NULL); + cached_cuda_iov_nb_bytes_list_h = cached_cuda_iov->nb_bytes_h; + assert(cached_cuda_iov_nb_bytes_list_h != NULL); + + cached_cuda_iov_count = cached_cuda_iov->cuda_iov_count; + opal_ddt_set_cuda_iov_position(pConvertor, pConvertor->bConverted, cached_cuda_iov_nb_bytes_list_h, cached_cuda_iov_count); + cuda_iov_start_pos = pConvertor->current_cuda_iov_pos; + cuda_iov_end_pos = cached_cuda_iov_count; + nb_blocks_used = 0; + cuda_iov_pipeline_block = current_cuda_device->cuda_iov_pipeline_block[0]; + cuda_iov_pipeline_block->cuda_stream = 
&(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + cuda_stream_iov = cuda_iov_pipeline_block->cuda_stream; + convertor_current_count = pConvertor->current_count; + + if (pConvertor->current_iov_partial_length > 0) { + cuda_iov_partial_length_start = pConvertor->current_iov_partial_length; + *total_unpacked += cuda_iov_partial_length_start; + buffer_size -= cuda_iov_partial_length_start; + pConvertor->current_iov_partial_length = 0; + cuda_iov_start_pos ++; + nb_blocks_used ++; + } + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + while( pConvertor->current_count < pConvertor->count && !buffer_isfull) { + for (i = cuda_iov_start_pos; i < cuda_iov_end_pos && !buffer_isfull; i++) { + if (buffer_size >= cached_cuda_iov_nb_bytes_list_h[i]) { + *total_unpacked += cached_cuda_iov_nb_bytes_list_h[i]; + buffer_size -= cached_cuda_iov_nb_bytes_list_h[i]; + nb_blocks_used ++; + } else { + if (buffer_size > 0) { + cuda_iov_partial_length_end = buffer_size; + *total_unpacked += cuda_iov_partial_length_end; + nb_blocks_used ++; + } + buffer_size = 0; + buffer_isfull = 1; + break; + } + } + if (!buffer_isfull) { + pConvertor->current_count ++; + cuda_iov_start_pos = 0; + cuda_iov_end_pos = cached_cuda_iov_count; + } + } +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack src %p, cached cuda iov is prepared in %ld microsec, kernel submitted to CUDA stream %d, nb_blocks %d\n", source_base, total_time, cuda_streams->current_stream_id, nb_blocks_used); ); +#endif + opal_datatype_type_extent(pConvertor->pDesc, &ddt_extent); + DT_CUDA_DEBUG ( opal_cuda_output(2, "Unpack kernel launched src_base %p, dst_base %p, nb_blocks %ld\n", source_base, destination_base, nb_blocks_used ); ); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + opal_generic_simple_unpack_cuda_iov_cached_kernel<<<nb_blocks, thread_per_block, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(cached_cuda_iov_dist_d, 
pConvertor->current_cuda_iov_pos, cached_cuda_iov_count, ddt_extent, convertor_current_count, nb_blocks_used, destination_base, source_base, cuda_iov_partial_length_start, cuda_iov_partial_length_end); + + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG ( opal_cuda_output(2, "[Timing]: Unpack kernel %ld microsec\n", total_time); ); +#endif + + return OPAL_SUCCESS; +} + +void unpack_contiguous_loop_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _destination = (*DESTINATION) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _source = *(SOURCE); + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack using contiguous_loop_cuda\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif +// tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; +// num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; +#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL + cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); +#else + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, 
_end_loop->size, _loop->extent, _source, _destination); +#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(SOURCE) = *(SOURCE) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector unpacking in %ld microsec\n", total_time ); ); +#endif +} + +void unpack_contiguous_loop_cuda_memcpy2d( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _destination = (*DESTINATION) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _source = *(SOURCE); + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack using contiguous_loop_cuda_memcpy2d\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif + cudaMemcpy2DAsync(_destination, _loop->extent, _source, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyHostToDevice, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(SOURCE) = *(SOURCE) + 
_copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector unpacking with memcpy2d in %ld microsec\n", total_time ); ); +#endif +} + +void unpack_contiguous_loop_cuda_zerocopy( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE) +{ + ddt_loop_desc_t *_loop = (ddt_loop_desc_t*)(ELEM); + ddt_endloop_desc_t* _end_loop = (ddt_endloop_desc_t*)((ELEM) + _loop->items); + unsigned char* _destination = (*DESTINATION) + _end_loop->first_elem_disp; + uint32_t _copy_loops = *(COUNT); + uint32_t num_blocks, tasks_per_block; + unsigned char* _source = *(SOURCE); + unsigned char* _source_dev; + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + TIMER_DATA_TYPE start, end, start_total, end_total; + long total_time; +#endif + + DT_CUDA_DEBUG( opal_cuda_output( 2, "Unpack using contiguous_loop_cuda_zerocopy\n"); ); + + if( (_copy_loops * _end_loop->size) > *(SPACE) ) + _copy_loops = (uint32_t)(*(SPACE) / _end_loop->size); + +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME(start); +#endif +// tasks_per_block = THREAD_PER_BLOCK * TASK_PER_THREAD; +// num_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + + cudaError_t reg_rv = cudaHostGetDevicePointer((void **)&_source_dev, (void *) _source, 0); + if (reg_rv != cudaSuccess) { + const char *cuda_err = cudaGetErrorString(reg_rv); + printf("can not get dev mem, %s\n", cuda_err); + } +#if OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL + cudaMemcpy2DAsync(_destination, _loop->extent, _source_dev, _end_loop->size, _end_loop->size, _copy_loops, cudaMemcpyDeviceToDevice, 
cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); +#else + unpack_contiguous_loop_cuda_kernel_global<<<192, 4*THREAD_PER_BLOCK, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_loops, _end_loop->size, _loop->extent, _source_dev, _destination); +#endif /* OPAL_DATATYPE_VECTOR_USE_MEMCPY2D_AS_KERNEL */ + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + *(DESTINATION) = _destination + _loop->extent*_copy_loops - _end_loop->first_elem_disp; + *(SOURCE) = *(SOURCE) + _copy_loops * _end_loop->size; + *(SPACE) -= _copy_loops * _end_loop->size; + *(COUNT) -= _copy_loops; +#endif + + cudaStreamSynchronize(cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]); + // cudaHostUnregister(_source); +#if defined(OPAL_DATATYPE_CUDA_TIMING) + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + DT_CUDA_DEBUG( opal_cuda_output( 2, "[Timing]: vector unpacking in %ld microsec\n", total_time ); ); +#endif +} + +void unpack_predefined_data_cuda( dt_elem_desc_t* ELEM, + uint32_t* COUNT, + unsigned char** SOURCE, + unsigned char** DESTINATION, + size_t* SPACE ) +{ + uint32_t _copy_count = *(COUNT); + size_t _copy_blength; + ddt_elem_desc_t* _elem = &((ELEM)->elem); + unsigned char* _source = (*SOURCE); + uint32_t nb_blocks, tasks_per_block, thread_per_block; + unsigned char* _destination = *(DESTINATION) + _elem->disp; + + ddt_cuda_stream_t *cuda_streams = current_cuda_device->cuda_streams; + + _copy_blength = 8;//opal_datatype_basicDatatypes[_elem->common.type]->size; + if( (_copy_count * _copy_blength) > *(SPACE) ) { + _copy_count = (uint32_t)(*(SPACE) / _copy_blength); + if( 0 == _copy_count ) return; /* nothing to do */ + } + + + if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE) { + thread_per_block = CUDA_WARP_SIZE; + } else if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE * 2) { + thread_per_block = CUDA_WARP_SIZE * 2; + } else if (*COUNT / TASK_PER_THREAD < CUDA_WARP_SIZE * 3) { + thread_per_block = CUDA_WARP_SIZE * 3; + } else { + 
thread_per_block = CUDA_WARP_SIZE * 5; + } + tasks_per_block = thread_per_block * TASK_PER_THREAD; + nb_blocks = (*COUNT + tasks_per_block - 1) / tasks_per_block; + + // DBGPRINT("num_blocks %d, thread %d\n", nb_blocks, tasks_per_block); + // DBGPRINT( "GPU pack 1. memcpy( %p, %p, %lu ) => space %lu\n", _destination, _source, (unsigned long)_copy_count, (unsigned long)(*(SPACE)) ); + + unpack_contiguous_loop_cuda_kernel_global<<<nb_blocks, thread_per_block, 0, cuda_streams->ddt_cuda_stream[cuda_streams->current_stream_id]>>>(_copy_count, _copy_blength, _elem->extent, _source, _destination); + cuda_streams->current_stream_id ++; + cuda_streams->current_stream_id = cuda_streams->current_stream_id % NB_STREAMS; + +#if !defined(OPAL_DATATYPE_CUDA_DRY_RUN) + _copy_blength *= _copy_count; + *(DESTINATION) = _destination + _elem->extent*_copy_count - _elem->disp; + *(SOURCE) += _copy_blength; + *(SPACE) -= _copy_blength; + *(COUNT) -= _copy_count; +#endif + +} diff --git a/opal/datatype/opal_convertor.c b/opal/datatype/opal_convertor.c index d5481283183..329ff4e62c1 100644 --- a/opal/datatype/opal_convertor.c +++ b/opal/datatype/opal_convertor.c @@ -560,8 +560,8 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, convertor->flags |= CONVERTOR_RECV; #if OPAL_CUDA_SUPPORT - mca_cuda_convertor_init(convertor, pUserBuf); -#endif + mca_cuda_convertor_init(convertor, pUserBuf, datatype); +#endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -574,7 +574,12 @@ if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { convertor->fAdvance = opal_unpack_homogeneous_contig_checksum; } else { - convertor->fAdvance = opal_generic_simple_unpack_checksum; + if ((convertor->flags & CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { + convertor->fAdvance = opal_generic_simple_unpack_cuda_checksum; + convertor->gpu_buffer_ptr = NULL; + } else { + convertor->fAdvance = 
opal_generic_simple_unpack_checksum; + } } } else { #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT @@ -585,7 +590,12 @@ int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor, if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { convertor->fAdvance = opal_unpack_homogeneous_contig; } else { - convertor->fAdvance = opal_generic_simple_unpack; + if ((convertor->flags & CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { + convertor->fAdvance = opal_generic_simple_unpack_cuda; + convertor->gpu_buffer_ptr = NULL; + } else { + convertor->fAdvance = opal_generic_simple_unpack; + } } } return OPAL_SUCCESS; @@ -599,8 +609,8 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, { convertor->flags |= CONVERTOR_SEND; #if OPAL_CUDA_SUPPORT - mca_cuda_convertor_init(convertor, pUserBuf); -#endif + mca_cuda_convertor_init(convertor, pUserBuf, datatype); +#endif /* OPAL_CUDA_SUPPORT */ OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ); @@ -612,7 +622,12 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, else convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps_checksum; } else { - convertor->fAdvance = opal_generic_simple_pack_checksum; + if ((convertor->flags & CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { + convertor->fAdvance = opal_generic_simple_pack_cuda_checksum; + convertor->gpu_buffer_ptr = NULL; + } else { + convertor->fAdvance = opal_generic_simple_pack_checksum; + } } } else { if( datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { @@ -622,7 +637,12 @@ int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor, else convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps; } else { - convertor->fAdvance = opal_generic_simple_pack; + if ((convertor->flags & CONVERTOR_CUDA) && (opal_datatype_cuda_kernel_support == 1)) { + convertor->fAdvance = opal_generic_simple_pack_cuda; + convertor->gpu_buffer_ptr = NULL; + } else { + convertor->fAdvance = 
opal_generic_simple_pack; + } } } return OPAL_SUCCESS; diff --git a/opal/datatype/opal_convertor.h b/opal/datatype/opal_convertor.h index 5b26b7e7d63..b7c0a43a6ed 100644 --- a/opal/datatype/opal_convertor.h +++ b/opal/datatype/opal_convertor.h @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2014 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -79,6 +79,8 @@ typedef struct dt_stack_t dt_stack_t; */ #define DT_STATIC_STACK_SIZE 5 /**< This should be sufficient for most applications */ +#define MAX_IPC_EVENT_HANDLE 10 + struct opal_convertor_t { opal_object_t super; /**< basic superclass */ uint32_t remoteArch; /**< the remote architecture */ @@ -109,6 +111,13 @@ struct opal_convertor_t { #if OPAL_CUDA_SUPPORT memcpy_fct_t cbmemcpy; /**< memcpy or cuMemcpy */ void * stream; /**< CUstream for async copy */ + + unsigned char * gpu_buffer_ptr; /**< GPU buffer used for pack/unpack */ + size_t gpu_buffer_size; + uint32_t current_cuda_iov_pos; + uint32_t current_iov_pos; + size_t current_iov_partial_length; + opal_datatype_count_t current_count; #endif /* size: 248, cachelines: 4, members: 20 */ /* last cacheline: 56 bytes */ @@ -278,7 +287,22 @@ opal_convertor_raw( opal_convertor_t* convertor, /* [IN/OUT] */ struct iovec* iov, /* [IN/OUT] */ uint32_t* iov_count, /* [IN/OUT] */ size_t* length ); /* [OUT] */ +OPAL_DECLSPEC void +opal_convertor_to_iov(struct opal_convertor_t *convertor, + struct iovec **iov, + uint32_t *iov_count, + size_t *max_data); +/** + * A straighforward description of the datatype in terms of a NULL + * based iovec (so basically displacements from the begining of a pointer, + * will be generated and stored in the 
datatype itself. This description + * can be used to pack/unpack the data manually. + */ +OPAL_DECLSPEC int +opal_convertor_raw_cached(struct opal_convertor_t *convertor, + const struct iovec **iov, + uint32_t* iov_count); /* * Upper level does not need to call the _nocheck function directly. */ diff --git a/opal/datatype/opal_convertor_raw.c b/opal/datatype/opal_convertor_raw.c index b57d5aa1ded..bf46a7a9d5a 100644 --- a/opal/datatype/opal_convertor_raw.c +++ b/opal/datatype/opal_convertor_raw.c @@ -1,6 +1,6 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ /* - * Copyright (c) 2004-2009 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. * Copyright (c) 2009 Oak Ridge National Labs. All rights reserved. @@ -211,3 +211,57 @@ opal_convertor_raw( opal_convertor_t* pConvertor, pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } + +#define IOVEC_INITIAL_SIZE 64 + +void +opal_convertor_to_iov(struct opal_convertor_t *convertor, + struct iovec **iov, + uint32_t *iov_count, + size_t *max_data) +{ + uint32_t temp_count = IOVEC_INITIAL_SIZE; + struct iovec *iovec; + size_t temp_data; + + *iov_count = 0; + *max_data = 0; + + *iov = iovec = (struct iovec*) malloc(temp_count * sizeof(struct iovec)); + while(1) { + int ret = opal_convertor_raw(convertor, iovec, &temp_count, &temp_data); + *iov_count += temp_count; + *max_data += temp_data; + if(ret) + break; + + *iov = (struct iovec*)realloc(*iov, (*iov_count + IOVEC_INITIAL_SIZE) * sizeof(struct iovec)); + temp_count = IOVEC_INITIAL_SIZE; + iovec = &((*iov)[*iov_count]); + } +} + +int opal_convertor_raw_cached(struct opal_convertor_t *convertor, + const struct iovec **iov, + uint32_t* iov_count) +{ + if( NULL == convertor->pDesc->cached_iovec ) { + struct opal_convertor_t conv; + size_t max_data; + + OBJ_CONSTRUCT(&conv, opal_convertor_t); + conv.remoteArch = 
convertor->remoteArch; + conv.stack_pos = 0; + conv.flags = convertor->flags; + conv.master = convertor->master; + opal_convertor_prepare_for_send(&conv, convertor->pDesc, 1, NULL); + opal_convertor_get_packed_size(&conv, &max_data); + opal_convertor_to_iov(&conv, (struct iovec **)&convertor->pDesc->cached_iovec, + (uint32_t *)&convertor->pDesc->cached_iovec_count, &max_data); + OBJ_DESTRUCT(&conv); + } + *iov = convertor->pDesc->cached_iovec; + *iov_count = convertor->pDesc->cached_iovec_count; + + return OPAL_SUCCESS; +} diff --git a/opal/datatype/opal_datatype.h b/opal/datatype/opal_datatype.h index 25f014ead0d..a3a6898dd89 100644 --- a/opal/datatype/opal_datatype.h +++ b/opal/datatype/opal_datatype.h @@ -107,29 +107,38 @@ struct opal_datatype_t { size_t size; /**< total size in bytes of the memory used by the data if the data is put on a contiguous buffer */ OPAL_PTRDIFF_TYPE true_lb; /**< the true lb of the data without user defined lb and ub */ + /* --- cacheline 1 boundary (64 bytes) --- */ OPAL_PTRDIFF_TYPE true_ub; /**< the true ub of the data without user defined lb and ub */ OPAL_PTRDIFF_TYPE lb; /**< lower bound in memory */ OPAL_PTRDIFF_TYPE ub; /**< upper bound in memory */ - /* --- cacheline 1 boundary (64 bytes) --- */ size_t nbElems; /**< total number of elements inside the datatype */ - uint32_t align; /**< data should be aligned to */ /* Attribute fields */ char name[OPAL_MAX_OBJECT_NAME]; /**< name of the datatype */ - /* --- cacheline 2 boundary (128 bytes) was 8-12 bytes ago --- */ + /* --- cacheline 2 boundary (128 bytes) was 40 bytes ago --- */ dt_type_desc_t desc; /**< the data description */ dt_type_desc_t opt_desc; /**< short description of the data used when conversion is useless or in the send case (without conversion) */ + uint32_t align; /**< data should be aligned to */ uint32_t btypes[OPAL_DATATYPE_MAX_SUPPORTED]; /**< basic elements count used to compute the size of the datatype for remote nodes. 
The length of the array is dependent on the maximum number of datatypes of all top layers. Reason being is that Fortran is not at the OPAL layer. */ - /* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */ - - /* size: 352, cachelines: 6, members: 15 */ - /* last cacheline: 28-32 bytes */ + /* --- cacheline 6 boundary (384 bytes) was 8 bytes ago --- */ + struct iovec* iov; + int iov_count; + size_t max_data; + /* size: 416, cachelines: 7, members: 18 */ + /* last cacheline: 32 bytes */ + + struct iovec* cached_iovec; + uint32_t cached_iovec_count; + +#if OPAL_CUDA_SUPPORT + unsigned char * cached_cuda_iov; +#endif /* OPAL_CUDA_SUPPORT */ }; typedef struct opal_datatype_t opal_datatype_t; diff --git a/opal/datatype/opal_datatype_create.c b/opal/datatype/opal_datatype_create.c index e64e1f04190..e57a7d6c668 100644 --- a/opal/datatype/opal_datatype_create.c +++ b/opal/datatype/opal_datatype_create.c @@ -3,7 +3,7 @@ * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana * University Research and Technology * Corporation. All rights reserved. - * Copyright (c) 2004-2013 The University of Tennessee and The University + * Copyright (c) 2004-2015 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. 
* Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, @@ -27,6 +27,10 @@ #include "opal/datatype/opal_datatype_internal.h" #include "limits.h" #include "opal/prefetch.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_convertor.h" +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ static void opal_datatype_construct( opal_datatype_t* pData ) { @@ -53,6 +57,13 @@ static void opal_datatype_construct( opal_datatype_t* pData ) pData->opt_desc.length = 0; pData->opt_desc.used = 0; + pData->cached_iovec = NULL; + pData->cached_iovec_count = 0; + +#if OPAL_CUDA_SUPPORT + pData->cached_cuda_iov = NULL; +#endif /* OPAL_CUDA_SUPPORT */ + for( i = 0; i < OPAL_DATATYPE_MAX_SUPPORTED; i++ ) pData->btypes[i] = 0; } @@ -82,6 +93,19 @@ static void opal_datatype_destruct( opal_datatype_t* datatype ) /* make sure the name is set to empty */ datatype->name[0] = '\0'; + + if( NULL != datatype->cached_iovec ) { + free(datatype->cached_iovec); + datatype->cached_iovec = NULL; + } + +#if OPAL_CUDA_SUPPORT + /* free cuda iov */ + if (opal_datatype_cuda_kernel_support == 1 && datatype->cached_cuda_iov != NULL) { + opal_cached_cuda_iov_fini((void*)datatype->cached_cuda_iov); + datatype->cached_cuda_iov = NULL; + } +#endif /* OPAL_CUDA_SUPPORT */ } OBJ_CLASS_INSTANCE(opal_datatype_t, opal_object_t, opal_datatype_construct, opal_datatype_destruct); diff --git a/opal/datatype/opal_datatype_cuda.c b/opal/datatype/opal_datatype_cuda.c index 71b60e60801..2aa73454724 100644 --- a/opal/datatype/opal_datatype_cuda.c +++ b/opal/datatype/opal_datatype_cuda.c @@ -12,11 +12,13 @@ #include #include #include +#include #include "opal/align.h" #include "opal/util/output.h" #include "opal/datatype/opal_convertor.h" #include "opal/datatype/opal_datatype_cuda.h" +#include "opal/mca/installdirs/installdirs.h" static bool initialized = false; int opal_cuda_verbose = 0; @@ -26,6 +28,24 @@ static void opal_cuda_support_init(void); static int 
(*common_cuda_initialization_function)(opal_common_cuda_function_table_t *) = NULL; static opal_common_cuda_function_table_t ftable; +/* folowing variables are used for cuda ddt kernel support */ +static opal_datatype_cuda_kernel_function_table_t cuda_kernel_table; +static void *opal_datatype_cuda_kernel_handle = NULL; +static char *opal_datatype_cuda_kernel_lib = NULL; +int32_t opal_datatype_cuda_kernel_support = 0; + +#define OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN(handle, fname) \ + do { \ + char* _error; \ + *(void **)(&(cuda_kernel_table.fname ## _p)) = dlsym((handle), # fname); \ + if(NULL != (_error = dlerror()) ) { \ + opal_output(0, "Finding %s error: %s\n", # fname, _error); \ + cuda_kernel_table.fname ## _p = NULL; \ + return OPAL_ERROR; \ + } \ + } while (0) + + /* This function allows the common cuda code to register an * initialization function that gets called the first time an attempt * is made to send or receive a GPU pointer. This allows us to delay @@ -41,7 +61,7 @@ void opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function * is enabled or not. If CUDA is not enabled, then short circuit out * for all future calls. 
*/ -void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) +void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, const struct opal_datatype_t* datatype) { /* Only do the initialization on the first GPU access */ if (!initialized) { @@ -60,6 +80,15 @@ void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf) if (ftable.gpu_is_gpu_buffer(pUserBuf, convertor)) { convertor->flags |= CONVERTOR_CUDA; } + + if (OPAL_SUCCESS != opal_cuda_kernel_support_init()) { + opal_cuda_kernel_support_fini(); + } + + convertor->current_cuda_iov_pos = 0; + convertor->current_iov_pos = 0; + convertor->current_iov_partial_length = 0; + convertor->current_count = 0; } /* Checks the type of pointer @@ -80,9 +109,8 @@ bool opal_cuda_check_bufs(char *dest, char *src) if (ftable.gpu_is_gpu_buffer(dest, NULL) || ftable.gpu_is_gpu_buffer(src, NULL)) { return true; - } else { - return false; } + return false; } /* @@ -109,9 +137,8 @@ void *opal_cuda_memcpy(void *dest, const void *src, size_t size, opal_convertor_ opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, (int)size); abort(); - } else { - return dest; } + return dest; } /* @@ -127,9 +154,8 @@ void *opal_cuda_memcpy_sync(void *dest, const void *src, size_t size) opal_output(0, "CUDA: Error in cuMemcpy: res=%d, dest=%p, src=%p, size=%d", res, dest, src, (int)size); abort(); - } else { - return dest; } + return dest; } /* @@ -180,6 +206,7 @@ static void opal_cuda_support_init(void) } initialized = true; + } /** @@ -191,3 +218,180 @@ void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream convertor->flags |= CONVERTOR_CUDA_ASYNC; convertor->stream = stream; } + +/* following functions are used for cuda ddt kernel support */ +int32_t opal_cuda_kernel_support_init(void) +{ + if (opal_datatype_cuda_kernel_handle == NULL) { + + /* If the library name was initialized but the load failed, we have another chance 
to change it */ + if( NULL != opal_datatype_cuda_kernel_lib ) + free(opal_datatype_cuda_kernel_lib); + asprintf(&opal_datatype_cuda_kernel_lib, "%s/%s", opal_install_dirs.libdir, "opal_datatype_cuda_kernel.so"); + + opal_datatype_cuda_kernel_handle = dlopen(opal_datatype_cuda_kernel_lib , RTLD_LAZY); + if (!opal_datatype_cuda_kernel_handle) { + opal_output( 0, "Failed to load %s library: error %s\n", opal_datatype_cuda_kernel_lib, dlerror()); + opal_datatype_cuda_kernel_handle = NULL; + return OPAL_ERROR; + } + + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_kernel_init ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_kernel_fini ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_generic_simple_pack_function_cuda_iov ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_generic_simple_unpack_function_cuda_iov ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_generic_simple_pack_function_cuda_vector ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_generic_simple_unpack_function_cuda_vector ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_free_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_malloc_gpu_buffer ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy_async ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_d2dcpy ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cached_cuda_iov_fini ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, 
opal_ddt_cuda_set_cuda_stream ); + OPAL_DATATYPE_FIND_CUDA_KERNEL_FUNCTION_OR_RETURN( opal_datatype_cuda_kernel_handle, opal_ddt_cuda_get_cuda_stream ); + + if (OPAL_SUCCESS != cuda_kernel_table.opal_ddt_cuda_kernel_init_p()) { + return OPAL_ERROR; + } + opal_datatype_cuda_kernel_support = 1; + opal_output( 0, "opal_cuda_kernel_support_init done\n"); + } + return OPAL_SUCCESS; +} + +int32_t opal_cuda_kernel_support_fini(void) +{ + if (opal_datatype_cuda_kernel_handle != NULL) { + cuda_kernel_table.opal_ddt_cuda_kernel_fini_p(); + /* Reset all functions to NULL */ + cuda_kernel_table.opal_ddt_cuda_kernel_init_p = NULL; + cuda_kernel_table.opal_ddt_cuda_kernel_fini_p = NULL; + cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_iov_p = NULL; + cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_iov_p = NULL; + cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_vector_p = NULL; + cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_vector_p = NULL; + cuda_kernel_table.opal_ddt_cuda_free_gpu_buffer_p = NULL; + cuda_kernel_table.opal_ddt_cuda_malloc_gpu_buffer_p = NULL; + cuda_kernel_table.opal_ddt_cuda_d2dcpy_async_p = NULL; + cuda_kernel_table.opal_ddt_cuda_d2dcpy_p = NULL; + cuda_kernel_table.opal_ddt_cached_cuda_iov_fini_p = NULL; + cuda_kernel_table.opal_ddt_cuda_set_cuda_stream_p = NULL; + cuda_kernel_table.opal_ddt_cuda_get_cuda_stream_p = NULL; + + dlclose(opal_datatype_cuda_kernel_handle); + opal_datatype_cuda_kernel_handle = NULL; + + if( NULL != opal_datatype_cuda_kernel_lib ) + free(opal_datatype_cuda_kernel_lib); + opal_datatype_cuda_kernel_lib = NULL; + opal_datatype_cuda_kernel_support = 0; + opal_output( 0, "opal_cuda_kernel_support_fini done\n"); + } + return OPAL_SUCCESS; +} + +int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + if (cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_iov_p != NULL) { + return 
cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_iov_p(pConvertor, iov, out_size, max_data); + } else { + opal_output(0, "opal_ddt_generic_simple_pack_function_cuda_iov function pointer is NULL\n"); + return -1; + } +} + +int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + if (cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_iov_p != NULL) { + return cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_iov_p(pConvertor, iov, out_size, max_data); + } else { + opal_output(0, "opal_ddt_generic_simple_unpack_function_cuda_iov function pointer is NULL\n"); + return -1; + } +} + +int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + if (cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_vector_p != NULL) { + return cuda_kernel_table.opal_ddt_generic_simple_pack_function_cuda_vector_p(pConvertor, iov, out_size, max_data); + } else { + opal_output(0, "opal_ddt_generic_simple_pack_function_cuda_vector function pointer is NULL\n"); + return -1; + } +} + +int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ) +{ + if (cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_vector_p != NULL) { + return cuda_kernel_table.opal_ddt_generic_simple_unpack_function_cuda_vector_p(pConvertor, iov, out_size, max_data); + } else { + opal_output(0, "opal_ddt_generic_simple_unpack_function_cuda_vector function pointer is NULL\n"); + return -1; + } +} + +void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id) +{ + if (cuda_kernel_table.opal_ddt_cuda_malloc_gpu_buffer_p != NULL) { + return cuda_kernel_table.opal_ddt_cuda_malloc_gpu_buffer_p(size, gpu_id); + } else { + opal_output(0, "opal_ddt_cuda_malloc_gpu_buffer function pointer is NULL\n"); + return NULL; + 
} +} + +void opal_cuda_free_gpu_buffer(void *addr, int gpu_id) +{ + if (cuda_kernel_table.opal_ddt_cuda_free_gpu_buffer_p != NULL) { + cuda_kernel_table.opal_ddt_cuda_free_gpu_buffer_p(addr, gpu_id); + } else { + opal_output(0, "opal_ddt_cuda_free_gpu_buffer function pointer is NULL\n"); + } +} + +void opal_cuda_d2dcpy(void* dst, const void* src, size_t count) +{ + if (cuda_kernel_table.opal_ddt_cuda_d2dcpy_p != NULL) { + cuda_kernel_table.opal_ddt_cuda_d2dcpy_p(dst, src, count); + } else { + opal_output(0, "opal_ddt_cuda_d2dcpy function pointer is NULL\n"); + } +} + +void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count) +{ + if (cuda_kernel_table.opal_ddt_cuda_d2dcpy_async_p != NULL) { + cuda_kernel_table.opal_ddt_cuda_d2dcpy_async_p(dst, src, count); + } else { + opal_output(0, "opal_ddt_cuda_d2dcpy_async function pointer is NULL\n"); + } +} + +void opal_cached_cuda_iov_fini(void *cached_cuda_iov) +{ + if (cuda_kernel_table.opal_ddt_cached_cuda_iov_fini_p != NULL) { + cuda_kernel_table.opal_ddt_cached_cuda_iov_fini_p(cached_cuda_iov); + } else { + opal_output(0, "opal_ddt_cached_cuda_iov_fini function pointer is NULL\n"); + } +} + +void opal_cuda_set_cuda_stream(void) +{ + if (cuda_kernel_table.opal_ddt_cuda_set_cuda_stream_p != NULL) { + cuda_kernel_table.opal_ddt_cuda_set_cuda_stream_p(); + } else { + opal_output(0, "opal_ddt_cuda_set_cuda_stream function pointer is NULL\n"); + } +} + +int32_t opal_cuda_get_cuda_stream(void) +{ + if (cuda_kernel_table.opal_ddt_cuda_get_cuda_stream_p != NULL) { + return cuda_kernel_table.opal_ddt_cuda_get_cuda_stream_p(); + } else { + opal_output(0, "opal_ddt_cuda_get_cuda_stream function pointer is NULL\n"); + return -2; + } +} + diff --git a/opal/datatype/opal_datatype_cuda.h b/opal/datatype/opal_datatype_cuda.h index 676af80273b..cb82e93add3 100644 --- a/opal/datatype/opal_datatype_cuda.h +++ b/opal/datatype/opal_datatype_cuda.h @@ -21,7 +21,25 @@ struct opal_common_cuda_function_table { }; typedef struct 
opal_common_cuda_function_table opal_common_cuda_function_table_t; -void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf); +struct opal_datatype_cuda_kernel_function_table { + int32_t (*opal_ddt_cuda_kernel_init_p)(void); + int32_t (*opal_ddt_cuda_kernel_fini_p)(void); + void (*opal_ddt_cuda_free_gpu_buffer_p)(void *addr, int gpu_id); + void* (*opal_ddt_cuda_malloc_gpu_buffer_p)(size_t size, int gpu_id); + void (*opal_ddt_cuda_d2dcpy_async_p)(void* dst, const void* src, size_t count); + void (*opal_ddt_cuda_d2dcpy_p)(void* dst, const void* src, size_t count); + void (*opal_ddt_cached_cuda_iov_fini_p)(void *cached_cuda_iov); + void (*opal_ddt_cuda_set_cuda_stream_p)(void); + int32_t (*opal_ddt_cuda_get_cuda_stream_p)(void); + int32_t (*opal_ddt_generic_simple_pack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_ddt_generic_simple_unpack_function_cuda_iov_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_ddt_generic_simple_pack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); + int32_t (*opal_ddt_generic_simple_unpack_function_cuda_vector_p)( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +}; +typedef struct opal_datatype_cuda_kernel_function_table opal_datatype_cuda_kernel_function_table_t; +extern int32_t opal_datatype_cuda_kernel_support; + +void mca_cuda_convertor_init(opal_convertor_t* convertor, const void *pUserBuf, const struct opal_datatype_t* datatype); bool opal_cuda_check_bufs(char *dest, char *src); void* opal_cuda_memcpy(void * dest, const void * src, size_t size, opal_convertor_t* convertor); void* opal_cuda_memcpy_sync(void * dest, const void * src, size_t size); @@ -29,4 +47,19 @@ void* opal_cuda_memmove(void * dest, void * src, size_t size); void 
opal_cuda_add_initialization_function(int (*fptr)(opal_common_cuda_function_table_t *)); void opal_cuda_set_copy_function_async(opal_convertor_t* convertor, void *stream); +int32_t opal_cuda_kernel_support_init(void); +int32_t opal_cuda_kernel_support_fini(void); +int32_t opal_generic_simple_pack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +int32_t opal_generic_simple_unpack_function_cuda_iov( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +int32_t opal_generic_simple_pack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +int32_t opal_generic_simple_unpack_function_cuda_vector( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +void* opal_cuda_malloc_gpu_buffer(size_t size, int gpu_id); +void opal_cuda_free_gpu_buffer(void *addr, int gpu_id); +void opal_cuda_d2dcpy(void* dst, const void* src, size_t count); +void opal_cuda_d2dcpy_async(void* dst, const void* src, size_t count); +void* opal_cached_cuda_iov_init(void); +void opal_cached_cuda_iov_fini(void *cached_cuda_iov); +void opal_cuda_set_cuda_stream(void); +int32_t opal_cuda_get_cuda_stream(void); + #endif diff --git a/opal/datatype/opal_datatype_destroy.c b/opal/datatype/opal_datatype_destroy.c index d468cd07e8c..593d5bfd67a 100644 --- a/opal/datatype/opal_datatype_destroy.c +++ b/opal/datatype/opal_datatype_destroy.c @@ -21,7 +21,7 @@ #include "opal_config.h" #include "opal/constants.h" #include "opal/datatype/opal_datatype.h" -#include "opal/datatype/opal_datatype_internal.h" +#include "opal/datatype/opal_datatype_internal.h" int32_t opal_datatype_destroy( opal_datatype_t** dt ) { diff --git a/opal/datatype/opal_datatype_module.c b/opal/datatype/opal_datatype_module.c index 7de8fae5b08..77d6bfa62ac 100644 --- a/opal/datatype/opal_datatype_module.c +++ b/opal/datatype/opal_datatype_module.c @@ -33,6 +33,9 
@@ #include "opal/datatype/opal_datatype.h" #include "opal/datatype/opal_convertor_internal.h" #include "opal/mca/base/mca_base_var.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ /* by default the debuging is turned off */ int opal_datatype_dfd = -1; @@ -248,6 +251,10 @@ int32_t opal_datatype_finalize( void ) /* clear all master convertors */ opal_convertor_destroy_masters(); +#if OPAL_CUDA_SUPPORT + opal_cuda_kernel_support_fini(); +#endif /* OPAL_CUDA_SUPPORT */ + return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_optimize.c b/opal/datatype/opal_datatype_optimize.c index 5b66e4df595..e8b8d9794bd 100644 --- a/opal/datatype/opal_datatype_optimize.c +++ b/opal/datatype/opal_datatype_optimize.c @@ -303,5 +303,11 @@ int32_t opal_datatype_commit( opal_datatype_t * pData ) pLast->first_elem_disp = first_elem_disp; pLast->size = pData->size; } + + /* save a compressed datatype description as a iovec list */ +// opal_convertor_t* conv = opal_convertor_create( opal_local_arch, 0 /* unused */); +// opal_convertor_prepare_for_send( conv, pData, 1, (void*)0 ); +// opal_convertor_to_iov(conv, &pData->iov, &pData->iov_count, &pData->max_data); +// OBJ_RELEASE(conv); return OPAL_SUCCESS; } diff --git a/opal/datatype/opal_datatype_pack.c b/opal/datatype/opal_datatype_pack.c index 45f1213b811..c8985db7913 100644 --- a/opal/datatype/opal_datatype_pack.c +++ b/opal/datatype/opal_datatype_pack.c @@ -37,15 +37,20 @@ #include "opal/datatype/opal_datatype_checksum.h" #include "opal/datatype/opal_datatype_pack.h" #include "opal/datatype/opal_datatype_prototypes.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ #if defined(CHECKSUM) #define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig_checksum #define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps_checksum #define opal_generic_simple_pack_function 
opal_generic_simple_pack_checksum +#define opal_generic_simple_pack_cuda_function opal_generic_simple_pack_cuda_checksum #else #define opal_pack_homogeneous_contig_function opal_pack_homogeneous_contig #define opal_pack_homogeneous_contig_with_gaps_function opal_pack_homogeneous_contig_with_gaps #define opal_generic_simple_pack_function opal_generic_simple_pack +#define opal_generic_simple_pack_cuda_function opal_generic_simple_pack_cuda #endif /* defined(CHECKSUM) */ @@ -287,6 +292,7 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, (void*)pConvertor, (void*)pConvertor->pBaseBuf, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); + opal_output(0, "I am in simple pack, max_data %lu, iov_len %lu\n", *max_data, iov[0].iov_len); description = pConvertor->use_desc->desc; /* For the first step we have to add both displacement to the source. After in the @@ -313,7 +319,7 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, while( pElem->elem.common.flags & OPAL_DATATYPE_FLAG_DATA ) { /* now here we have a basic datatype */ PACK_PREDEFINED_DATATYPE( pConvertor, pElem, count_desc, - conv_ptr, iov_ptr, iov_len_local ); + conv_ptr, iov_ptr, iov_len_local ); if( 0 == count_desc ) { /* completed */ conv_ptr = pConvertor->pBaseBuf + pStack->disp; pos_desc++; /* advance to the next data */ @@ -384,6 +390,7 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, *out_size = iov_count; if( pConvertor->bConverted == pConvertor->local_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; + opal_output(0, "total packed %lu\n", pConvertor->bConverted); return 1; } /* Save the global position for the next round */ @@ -393,3 +400,27 @@ opal_generic_simple_pack_function( opal_convertor_t* pConvertor, pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } + +int32_t +opal_generic_simple_pack_cuda_function( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data 
) +{ + dt_stack_t* pStack; + uint32_t pos_desc; + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + + description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + pElem = &(description[pos_desc]); + + return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + return opal_generic_simple_pack_function_cuda_vector( pConvertor, iov, out_size, max_data); + } else { + return opal_generic_simple_pack_function_cuda_iov( pConvertor, iov, out_size, max_data); + } + return 0; +} diff --git a/opal/datatype/opal_datatype_prototypes.h b/opal/datatype/opal_datatype_prototypes.h index bcfb59b9b31..0f9099f552f 100644 --- a/opal/datatype/opal_datatype_prototypes.h +++ b/opal/datatype/opal_datatype_prototypes.h @@ -60,6 +60,14 @@ opal_generic_simple_pack_checksum( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); int32_t +opal_generic_simple_pack_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); +int32_t +opal_generic_simple_pack_cuda_checksum( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); +int32_t opal_unpack_homogeneous_contig( opal_convertor_t* pConv, struct iovec* iov, uint32_t* out_size, size_t* max_data ); @@ -75,6 +83,14 @@ int32_t opal_generic_simple_unpack_checksum( opal_convertor_t* pConvertor, struct iovec* iov, uint32_t* out_size, size_t* max_data ); +int32_t +opal_generic_simple_unpack_cuda( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); +int32_t +opal_generic_simple_unpack_cuda_checksum( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ); END_C_DECLS diff --git a/opal/datatype/opal_datatype_unpack.c b/opal/datatype/opal_datatype_unpack.c index e5c05e14e2d..5f51b3f828b 100644 --- 
a/opal/datatype/opal_datatype_unpack.c +++ b/opal/datatype/opal_datatype_unpack.c @@ -39,15 +39,20 @@ #include "opal/datatype/opal_datatype_checksum.h" #include "opal/datatype/opal_datatype_unpack.h" #include "opal/datatype/opal_datatype_prototypes.h" +#if OPAL_CUDA_SUPPORT +#include "opal/datatype/opal_datatype_cuda.h" +#endif /* OPAL_CUDA_SUPPORT */ #if defined(CHECKSUM) #define opal_unpack_general_function opal_unpack_general_checksum #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig_checksum #define opal_generic_simple_unpack_function opal_generic_simple_unpack_checksum +#define opal_generic_simple_unpack_cuda_function opal_generic_simple_unpack_cuda_checksum #else #define opal_unpack_general_function opal_unpack_general #define opal_unpack_homogeneous_contig_function opal_unpack_homogeneous_contig #define opal_generic_simple_unpack_function opal_generic_simple_unpack +#define opal_generic_simple_unpack_cuda_function opal_generic_simple_unpack_cuda #endif /* defined(CHECKSUM) */ @@ -272,8 +277,9 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, size_t iov_len_local; uint32_t iov_count; + printf("i am in simple unpack, max_data %lu, iov len %lu\n", *max_data, iov[0].iov_len); DO_DEBUG( opal_output( 0, "opal_convertor_generic_simple_unpack( %p, {%p, %lu}, %u )\n", - (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ); + (void*)pConvertor, iov[0].iov_base, (unsigned long)iov[0].iov_len, *out_size ); ) description = pConvertor->use_desc->desc; @@ -379,7 +385,7 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { OPAL_PTRDIFF_TYPE local_disp = (OPAL_PTRDIFF_TYPE)conv_ptr; if( pElem->loop.common.flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) { - UNPACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, + UNPACK_CONTIGUOUS_LOOP( pConvertor, pElem, count_desc, iov_ptr, conv_ptr, iov_len_local ); if( 0 == count_desc ) { /* completed 
*/ pos_desc += pElem->loop.items + 1; @@ -408,6 +414,13 @@ opal_generic_simple_unpack_function( opal_convertor_t* pConvertor, *out_size = iov_count; if( pConvertor->bConverted == pConvertor->remote_size ) { pConvertor->flags |= CONVERTOR_COMPLETED; + printf("total unpacked %lu\n", pConvertor->bConverted); + // double *vtmp = (double *)iov[0].iov_base; + // for (uint32_t i = 0; i < total_unpacked/8; i++) { + // printf(" %1.f ", *vtmp); + // vtmp ++; + // } + // printf("\n"); return 1; } /* Save the global position for the next round */ @@ -581,3 +594,27 @@ opal_unpack_general_function( opal_convertor_t* pConvertor, pConvertor->stack_pos, pStack->index, (int)pStack->count, (long)pStack->disp ); ); return 0; } + +int32_t +opal_generic_simple_unpack_cuda_function( opal_convertor_t* pConvertor, + struct iovec* iov, uint32_t* out_size, + size_t* max_data ) +{ + dt_stack_t* pStack; + uint32_t pos_desc; + dt_elem_desc_t* description; + dt_elem_desc_t* pElem; + + description = pConvertor->use_desc->desc; + pStack = pConvertor->pStack + pConvertor->stack_pos; + pos_desc = pStack->index; + pElem = &(description[pos_desc]); + + return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); + if( OPAL_DATATYPE_LOOP == pElem->elem.common.type ) { + return opal_generic_simple_unpack_function_cuda_vector( pConvertor, iov, out_size, max_data); + } else { + return opal_generic_simple_unpack_function_cuda_iov( pConvertor, iov, out_size, max_data); + } + return 0; +} diff --git a/opal/mca/btl/btl.h b/opal/mca/btl/btl.h index 691af933d14..7e693c62b84 100644 --- a/opal/mca/btl/btl.h +++ b/opal/mca/btl/btl.h @@ -188,6 +188,9 @@ typedef uint8_t mca_btl_base_tag_t; #define MCA_BTL_TAG_IB (MCA_BTL_TAG_BTL + 0) #define MCA_BTL_TAG_UDAPL (MCA_BTL_TAG_BTL + 1) #define MCA_BTL_TAG_SMCUDA (MCA_BTL_TAG_BTL + 2) +#define MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK (MCA_BTL_TAG_BTL + 3) +#define MCA_BTL_TAG_SMCUDA_DATATYPE_PACK (MCA_BTL_TAG_BTL + 4) +#define 
MCA_BTL_TAG_SMCUDA_DATATYPE_PUT (MCA_BTL_TAG_BTL + 5) /* prefered protocol */ #define MCA_BTL_FLAGS_SEND 0x0001 @@ -1179,6 +1182,9 @@ struct mca_btl_base_module_t { #endif /* OPAL_CUDA_GDR_SUPPORT */ #if OPAL_CUDA_SUPPORT size_t btl_cuda_max_send_size; /**< set if CUDA max send_size is different from host max send size */ + int32_t btl_cuda_ddt_allow_rdma; + size_t btl_cuda_ddt_pipeline_size; + int32_t btl_cuda_ddt_pipeline_depth; #endif /* OPAL_CUDA_SUPPORT */ }; typedef struct mca_btl_base_module_t mca_btl_base_module_t; diff --git a/opal/mca/btl/openib/btl_openib_frag.h b/opal/mca/btl/openib/btl_openib_frag.h index 7ca37142429..b73a817e1e6 100644 --- a/opal/mca/btl/openib/btl_openib_frag.h +++ b/opal/mca/btl/openib/btl_openib_frag.h @@ -25,6 +25,8 @@ #ifndef MCA_BTL_IB_FRAG_H #define MCA_BTL_IB_FRAG_H +#define OPAL_OPENIB_PAD_HDR 1 + #include "opal_config.h" #include "opal/align.h" #include "opal/mca/btl/btl.h" diff --git a/opal/mca/btl/openib/btl_openib_mca.c b/opal/mca/btl/openib/btl_openib_mca.c index 07dcdd07c76..6a0d4ef25cf 100644 --- a/opal/mca/btl/openib/btl_openib_mca.c +++ b/opal/mca/btl/openib/btl_openib_mca.c @@ -648,6 +648,7 @@ int btl_openib_register_mca_params(void) mca_btl_openib_module.super.btl_cuda_rdma_limit = 0; /* Unused */ } #endif /* OPAL_CUDA_GDR_SUPPORT */ + mca_btl_openib_module.super.btl_cuda_ddt_allow_rdma = 0; #endif /* OPAL_CUDA_SUPPORT */ CHECK(mca_btl_base_param_register( &mca_btl_openib_component.super.btl_version, diff --git a/opal/mca/btl/smcuda/btl_smcuda.c b/opal/mca/btl/smcuda/btl_smcuda.c index bf470f4fb72..7e1441fd8e1 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.c +++ b/opal/mca/btl/smcuda/btl_smcuda.c @@ -55,6 +55,7 @@ #if OPAL_CUDA_SUPPORT #include "opal/mca/common/cuda/common_cuda.h" +#include "opal/datatype/opal_datatype_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ #include "opal/mca/mpool/base/base.h" #include "opal/mca/mpool/sm/mpool_sm.h" @@ -71,6 +72,10 @@ #include "btl_smcuda_frag.h" #include "btl_smcuda_fifo.h" 
+#include "ompi/mca/bml/bml.h" +#include "ompi/mca/pml/ob1/pml_ob1_rdmafrag.h" +#include "ompi/mca/pml/base/pml_base_request.h" + #if OPAL_CUDA_SUPPORT static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t *endpoint, void *base, @@ -78,6 +83,14 @@ static struct mca_btl_base_registration_handle_t *mca_btl_smcuda_register_mem ( static int mca_btl_smcuda_deregister_mem (struct mca_btl_base_module_t* btl, struct mca_btl_base_registration_handle_t *handle); + +inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, int remote_device, int local_device); #endif mca_btl_smcuda_t mca_btl_smcuda = { @@ -488,6 +501,14 @@ create_sm_endpoint(int local_proc, struct opal_proc_t *proc) ep->mpool = mca_mpool_base_module_create("rgpusm", NULL, &resources); + /* alloc array for pack/unpack use */ + ep->smcuda_ddt_clone = NULL; + ep->smcuda_ddt_clone = (cuda_ddt_clone_t *)malloc(sizeof(cuda_ddt_clone_t) * SMCUDA_DT_CLONE_SIZE); + ep->smcuda_ddt_clone_size = SMCUDA_DT_CLONE_SIZE; + ep->smcuda_ddt_clone_avail = SMCUDA_DT_CLONE_SIZE; + for (int i = 0; i < ep->smcuda_ddt_clone_size; i++) { + ep->smcuda_ddt_clone[i].lindex = -1; + } } #endif /* OPAL_CUDA_SUPPORT */ return ep; @@ -693,6 +714,15 @@ int mca_btl_smcuda_del_procs( struct opal_proc_t **procs, struct mca_btl_base_endpoint_t **peers) { + int32_t proc; + struct mca_btl_base_endpoint_t * ep; + for (proc = 0; proc < (int32_t)nprocs; proc++) { + ep = peers[proc]; + if (ep->smcuda_ddt_clone != NULL) { + free(ep->smcuda_ddt_clone); + ep->smcuda_ddt_clone = NULL; + } + } return OPAL_SUCCESS; } @@ -1107,6 +1137,7 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, offset = (size_t) ((intptr_t) 
remote_address - (intptr_t) reg_ptr->base.base); remote_memory_address = (unsigned char *)reg_ptr->base.alloc_base + offset; if (0 != offset) { + printf("!!!!!!offset %lu, ra %p, base %p, remote %p\n", offset, (void*)remote_address, (void*)reg_ptr->base.base, remote_memory_address); opal_output(-1, "OFFSET=%d", (int)offset); } @@ -1116,18 +1147,102 @@ int mca_btl_smcuda_get_cuda (struct mca_btl_base_module_t *btl, * on the IPC event that we received. Note that we pull it from * rget_reg, not reg_ptr, as we do not cache the event. */ mca_common_wait_stream_synchronize(&rget_reg); - - rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, - "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag, - &done); - if (OPAL_SUCCESS != rc) { - /* Out of resources can be handled by upper layers. */ - if (OPAL_ERR_OUT_OF_RESOURCE != rc) { - opal_output(0, "Failed to cuMemcpy GPU memory, rc=%d", rc); + + /* datatype RDMA */ + mca_pml_ob1_rdma_frag_t *frag_ob1 = cbdata; + mca_bml_base_btl_t *bml_btl = frag_ob1->rdma_bml; + mca_pml_base_request_t *req = (mca_pml_base_request_t*) frag_ob1->rdma_req; + opal_convertor_t* unpack_convertor = &req->req_convertor; + + if ((unpack_convertor->flags & CONVERTOR_CUDA) && + (bml_btl->btl_flags & MCA_BTL_FLAGS_CUDA_GET)) { + unpack_convertor->flags &= ~CONVERTOR_CUDA; + uint8_t pack_required = remote_handle->reg_data.pack_required; + int lindex = -1; + int remote_device = remote_handle->reg_data.gpu_device; + opal_convertor_t* pack_convertor = remote_handle->reg_data.pack_convertor; + int local_device = 0; + rc = mca_common_cuda_get_device(&local_device); + if (rc != 0) { + opal_output(0, "Failed to get the GPU device ID, rc=%d", rc); + return rc; + } + if(opal_convertor_need_buffers(unpack_convertor) == true) { + unpack_convertor->flags |= CONVERTOR_CUDA; + + printf("local addr %p, pbase %p\n", local_address, unpack_convertor->pBaseBuf); + + if (remote_device != local_device && !OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { + 
unpack_convertor->gpu_buffer_ptr = NULL; + } else { + unpack_convertor->gpu_buffer_ptr = remote_memory_address; + } + if (pack_required) { + lindex = mca_btl_smcuda_alloc_cuda_ddt_clone(ep); + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); + done = 0; + } else { + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data; + opal_cuda_set_cuda_stream(); + if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && remote_device != local_device) { + unpack_convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(size, 0); + opal_cuda_d2dcpy_async(unpack_convertor->gpu_buffer_ptr, remote_memory_address, size); + iov.iov_base = unpack_convertor->gpu_buffer_ptr; + opal_output(0, "start D2D copy src %p, dst %p, size %lu, stream id %d\n", remote_memory_address, unpack_convertor->gpu_buffer_ptr, size, opal_cuda_get_cuda_stream()); + } else { + iov.iov_base = unpack_convertor->gpu_buffer_ptr; + } + iov.iov_len = size; + max_data = size; + opal_convertor_unpack(unpack_convertor, &iov, &iov_count, &max_data ); + opal_cuda_free_gpu_buffer(unpack_convertor->gpu_buffer_ptr, 0); + done = 1; + } + } else { + unpack_convertor->flags |= CONVERTOR_CUDA; + if (pack_required) { + lindex = mca_btl_smcuda_alloc_cuda_ddt_clone(ep); + if (remote_device == local_device || OPAL_DATATYPE_DIRECT_COPY_GPUMEM) { + /* now we are able to let sender pack directly to my memory */ + mca_mpool_common_cuda_reg_t loc_reg; + mca_mpool_common_cuda_reg_t *loc_reg_ptr = &loc_reg; + cuda_ddt_put_hdr_t put_msg; + if (OPAL_SUCCESS != cuda_getmemhandle(local_address, size, (mca_mpool_base_registration_t *)&loc_reg, NULL)) { + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); + } + memcpy(put_msg.mem_handle, loc_reg_ptr->data.memHandle, sizeof(loc_reg_ptr->data.memHandle)); + 
put_msg.remote_address = local_address; + put_msg.remote_base = loc_reg.base.base; + put_msg.lindex = lindex; + put_msg.pack_convertor = pack_convertor; + mca_btl_smcuda_cuda_ddt_clone(ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, 0, 0); + mca_btl_smcuda_send_cuda_put_sig(btl, ep, &put_msg); + } else { + mca_btl_smcuda_cuda_ddt_start_pack(btl, ep, pack_convertor, unpack_convertor, remote_memory_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); + } + done = 0; + } else { + rc = mca_common_cuda_memcpy(local_address, remote_memory_address, size, + "mca_btl_smcuda_get", (mca_btl_base_descriptor_t *)frag, + &done); + if (OPAL_SUCCESS != rc) { + /* Out of resources can be handled by upper layers. */ + if (OPAL_ERR_OUT_OF_RESOURCE != rc) { + opal_output(0, "Failed to cuMemcpy GPU memory, rc=%d", rc); + } + return rc; + } + } } - return rc; } + if (OPAL_UNLIKELY(1 == done)) { cbfunc (btl, ep, local_address, local_handle, cbcontext, cbdata, OPAL_SUCCESS); mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag); @@ -1217,6 +1332,138 @@ static void mca_btl_smcuda_send_cuda_ipc_request(struct mca_btl_base_module_t* b } +int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + cuda_ddt_hdr_t *send_msg) +{ + mca_btl_smcuda_frag_t* frag; + int rc; + + /* allocate a fragment, giving up if we can't get one */ + MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); + if( OPAL_UNLIKELY(NULL == frag) ) { + opal_output(0, "no frag for send unpack sig\n"); + return OPAL_ERR_OUT_OF_RESOURCE;; + } + + /* Fill in fragment fields. 
*/ + frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_ddt_hdr_t)); + + rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK); + return rc; +} + +int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + cuda_ddt_hdr_t *send_msg) +{ + mca_btl_smcuda_frag_t* frag; + int rc; + + /* allocate a fragment, giving up if we can't get one */ + MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); + if( OPAL_UNLIKELY(NULL == frag) ) { + opal_output(0, "no frag for send pack sig\n"); + return OPAL_ERR_OUT_OF_RESOURCE;; + } + + /* Fill in fragment fields. */ + frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + memcpy(frag->segment.seg_addr.pval, send_msg, sizeof(cuda_ddt_hdr_t)); + + rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_PACK); + return rc; +} + +int mca_btl_smcuda_send_cuda_put_sig(struct mca_btl_base_module_t* btl, + struct mca_btl_base_endpoint_t* endpoint, + cuda_ddt_put_hdr_t *put_msg) +{ + mca_btl_smcuda_frag_t* frag; + int rc; + + /* allocate a fragment, giving up if we can't get one */ + MCA_BTL_SMCUDA_FRAG_ALLOC_EAGER(frag); + if( OPAL_UNLIKELY(NULL == frag) ) { + opal_output(0, "no frag for send put sig\n"); + return OPAL_ERR_OUT_OF_RESOURCE;; + } + + /* Fill in fragment fields. 
*/ + frag->base.des_flags = MCA_BTL_DES_FLAGS_BTL_OWNERSHIP; + memcpy(frag->segment.seg_addr.pval, put_msg, sizeof(cuda_ddt_put_hdr_t)); + + rc = mca_btl_smcuda_send(btl, endpoint, (struct mca_btl_base_descriptor_t*)frag, MCA_BTL_TAG_SMCUDA_DATATYPE_PUT); + return rc; +} + +inline static int mca_btl_smcuda_cuda_ddt_start_pack(struct mca_btl_base_module_t *btl, + struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, int remote_device, int local_device) +{ + cuda_ddt_hdr_t send_msg; + mca_btl_smcuda_cuda_ddt_clone(endpoint, pack_convertor, unpack_convertor, remote_gpu_address, (mca_btl_base_descriptor_t *)frag, + lindex, remote_device, local_device); + send_msg.lindex = lindex; + send_msg.packed_size = 0; + send_msg.seq = 0; + send_msg.msg_type = CUDA_DDT_PACK_START; + send_msg.pack_convertor = pack_convertor; + opal_output(0, "smcuda btl start pack, remote_gpu_address %p, frag %p, lindex %d, remote_device %d, local_device %d\n", + (void*)remote_gpu_address, (void*)frag, lindex, remote_device, local_device); + mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); + return OPAL_SUCCESS; +} + +int mca_btl_smcuda_alloc_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint) +{ + int i; + if (endpoint->smcuda_ddt_clone_avail > 0) { + for (i = 0; i < endpoint->smcuda_ddt_clone_size; i++) { + if (endpoint->smcuda_ddt_clone[i].lindex == -1) { + endpoint->smcuda_ddt_clone_avail --; + opal_output(0, "Alloc cuda ddt clone array success, lindex %d\n",i); + return i; + } + } + } else { + endpoint->smcuda_ddt_clone = realloc(endpoint->smcuda_ddt_clone, (endpoint->smcuda_ddt_clone_size + SMCUDA_DT_CLONE_SIZE) * sizeof(cuda_ddt_clone_t)); + endpoint->smcuda_ddt_clone_avail = SMCUDA_DT_CLONE_SIZE - 1; + endpoint->smcuda_ddt_clone_size += SMCUDA_DT_CLONE_SIZE; + return endpoint->smcuda_ddt_clone_size - SMCUDA_DT_CLONE_SIZE; + } + return -1; +} + +void 
mca_btl_smcuda_free_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex) +{ + assert(endpoint->smcuda_ddt_clone[lindex].lindex == lindex); + endpoint->smcuda_ddt_clone[lindex].lindex = -1; + endpoint->smcuda_ddt_clone_avail ++; +} + +void mca_btl_smcuda_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, int remote_device, int local_device) +{ + endpoint->smcuda_ddt_clone[lindex].pack_convertor = pack_convertor; + endpoint->smcuda_ddt_clone[lindex].unpack_convertor = unpack_convertor; + endpoint->smcuda_ddt_clone[lindex].current_unpack_convertor_pBaseBuf = unpack_convertor->pBaseBuf; + endpoint->smcuda_ddt_clone[lindex].remote_gpu_address = remote_gpu_address; + endpoint->smcuda_ddt_clone[lindex].lindex = lindex; + endpoint->smcuda_ddt_clone[lindex].remote_device = remote_device; + endpoint->smcuda_ddt_clone[lindex].local_device = local_device; + endpoint->smcuda_ddt_clone[lindex].frag = frag; +} + #endif /* OPAL_CUDA_SUPPORT */ /** diff --git a/opal/mca/btl/smcuda/btl_smcuda.h b/opal/mca/btl/smcuda/btl_smcuda.h index 7c9d30faded..8305029d79e 100644 --- a/opal/mca/btl/smcuda/btl_smcuda.h +++ b/opal/mca/btl/smcuda/btl_smcuda.h @@ -41,6 +41,8 @@ #include "opal/mca/btl/btl.h" #include "opal/mca/common/sm/common_sm.h" +#define OPAL_DATATYPE_DIRECT_COPY_GPUMEM 0 + BEGIN_C_DECLS /* @@ -205,6 +207,7 @@ struct mca_btl_smcuda_component_t { int cuda_ipc_output; int use_cuda_ipc; int use_cuda_ipc_same_gpu; + int cuda_ddt_pipeline_size; #endif /* OPAL_CUDA_SUPPORT */ }; typedef struct mca_btl_smcuda_component_t mca_btl_smcuda_component_t; @@ -508,6 +511,58 @@ enum ipcState { IPC_BAD }; +/* cuda datatype pack/unpack message */ +typedef struct { + int lindex; + int seq; + int msg_type; + int packed_size; + struct opal_convertor_t *pack_convertor; +} cuda_ddt_hdr_t; + +/* cuda datatype put message 
*/ +typedef struct { + int lindex; + void *remote_address; + void *remote_base; + uint64_t mem_handle[8]; + struct opal_convertor_t *pack_convertor; +} cuda_ddt_put_hdr_t; + +#define CUDA_DDT_UNPACK_FROM_BLOCK 0 +#define CUDA_DDT_COMPLETE 1 +#define CUDA_DDT_COMPLETE_ACK 2 +#define CUDA_DDT_CLEANUP 3 +#define CUDA_DDT_PACK_START 4 +#define CUDA_DDT_PACK_TO_BLOCK 5 +#define CUDA_UNPACK_NO 6 + +/* package save pack/unpack convertor and cbfunc */ +typedef struct { + struct opal_convertor_t *pack_convertor; + struct opal_convertor_t *unpack_convertor; + unsigned char *current_unpack_convertor_pBaseBuf; + void *remote_gpu_address; + int lindex; + int remote_device; + int local_device; + mca_btl_base_descriptor_t *frag; +} cuda_ddt_clone_t; + +#define SMCUDA_DT_CLONE_SIZE 20 + +int mca_btl_smcuda_send_cuda_unpack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_hdr_t *send_msg); +int mca_btl_smcuda_send_cuda_pack_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_hdr_t *send_msg); +int mca_btl_smcuda_send_cuda_put_sig(struct mca_btl_base_module_t* btl, struct mca_btl_base_endpoint_t* endpoint, cuda_ddt_put_hdr_t *put_msg); +int mca_btl_smcuda_alloc_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint); +void mca_btl_smcuda_free_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, int lindex); +void mca_btl_smcuda_cuda_ddt_clone(struct mca_btl_base_endpoint_t *endpoint, + struct opal_convertor_t *pack_convertor, + struct opal_convertor_t *unpack_convertor, + void *remote_gpu_address, + mca_btl_base_descriptor_t *frag, + int lindex, int remote_device, int local_device); + #endif /* OPAL_CUDA_SUPPORT */ diff --git a/opal/mca/btl/smcuda/btl_smcuda_component.c b/opal/mca/btl/smcuda/btl_smcuda_component.c index dcbf0ec5180..f8bcb5eb865 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_component.c +++ b/opal/mca/btl/smcuda/btl_smcuda_component.c @@ -54,6 +54,7 @@ #if OPAL_CUDA_SUPPORT #include 
"opal/mca/common/cuda/common_cuda.h" +#include "opal/datatype/opal_datatype_cuda.h" #endif /* OPAL_CUDA_SUPPORT */ #if OPAL_ENABLE_FT_CR == 1 #include "opal/runtime/opal_cr.h" @@ -141,7 +142,7 @@ static int mca_btl_smcuda_component_verify(void) { static int smcuda_register(void) { /* register SM component parameters */ - mca_btl_smcuda_param_register_int("free_list_num", 8, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_num); + mca_btl_smcuda_param_register_int("free_list_num", 16, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_num); mca_btl_smcuda_param_register_int("free_list_max", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_max); mca_btl_smcuda_param_register_int("free_list_inc", 64, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_free_list_inc); mca_btl_smcuda_param_register_int("max_procs", -1, OPAL_INFO_LVL_5, &mca_btl_smcuda_component.sm_max_procs); @@ -166,11 +167,16 @@ static int smcuda_register(void) mca_btl_smcuda_param_register_int("use_cuda_ipc", 1, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.use_cuda_ipc); mca_btl_smcuda_param_register_int("use_cuda_ipc_same_gpu", 1, OPAL_INFO_LVL_4,&mca_btl_smcuda_component.use_cuda_ipc_same_gpu); mca_btl_smcuda_param_register_int("cuda_ipc_verbose", 0, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ipc_verbose); + mca_btl_smcuda_param_register_int("cuda_ddt_pipeline_size", 1024*1024*400, OPAL_INFO_LVL_4, &mca_btl_smcuda_component.cuda_ddt_pipeline_size); mca_btl_smcuda_component.cuda_ipc_output = opal_output_open(NULL); opal_output_set_verbosity(mca_btl_smcuda_component.cuda_ipc_output, mca_btl_smcuda_component.cuda_ipc_verbose); #else /* OPAL_CUDA_SUPPORT */ mca_btl_smcuda.super.btl_exclusivity = MCA_BTL_EXCLUSIVITY_LOW; #endif /* OPAL_CUDA_SUPPORT */ + mca_btl_smcuda.super.btl_cuda_ddt_pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; + printf("pipeline size %lu\n", mca_btl_smcuda.super.btl_cuda_ddt_pipeline_size); + mca_btl_smcuda.super.btl_cuda_ddt_pipeline_depth = 
4; + mca_btl_smcuda.super.btl_cuda_ddt_allow_rdma = 1; mca_btl_smcuda.super.btl_eager_limit = 4*1024; mca_btl_smcuda.super.btl_rndv_eager_limit = 4*1024; mca_btl_smcuda.super.btl_max_send_size = 32*1024; @@ -846,6 +852,209 @@ static void btl_smcuda_control(mca_btl_base_module_t* btl, } } +/* for receiver */ +static void btl_smcuda_datatype_unpack(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, void* cbdata) +{ + struct mca_btl_base_endpoint_t *endpoint = NULL; + cuda_ddt_hdr_t recv_msg; + mca_btl_base_segment_t* segments = des->des_segments; + memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_hdr_t)); + int seq = recv_msg.seq; + int lindex = recv_msg.lindex; + size_t packed_size = recv_msg.packed_size; + int msg_type = recv_msg.msg_type; + mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; + cuda_ddt_clone_t *my_cuda_dt_clone; + + /* We can find the endoint back from the rank embedded in the header */ + endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; + my_cuda_dt_clone = &endpoint->smcuda_ddt_clone[lindex]; + assert(my_cuda_dt_clone->lindex == lindex); + + cuda_ddt_hdr_t send_msg; + send_msg.lindex = lindex; + send_msg.pack_convertor = my_cuda_dt_clone->pack_convertor; + + if (msg_type == CUDA_DDT_CLEANUP) { + mca_btl_smcuda_frag_t *frag_recv = (mca_btl_smcuda_frag_t *) my_cuda_dt_clone->frag; + mca_btl_base_rdma_completion_fn_t cbfunc = (mca_btl_base_rdma_completion_fn_t) frag_recv->base.des_cbfunc; + cbfunc (btl, endpoint, frag_recv->segment.seg_addr.pval, frag_recv->local_handle, frag_recv->base.des_context, frag_recv->base.des_cbdata, OPAL_SUCCESS); + mca_btl_smcuda_free(btl, (mca_btl_base_descriptor_t *)frag_recv); + mca_btl_smcuda_free_cuda_ddt_clone(endpoint, lindex); + } else if (msg_type == CUDA_DDT_UNPACK_FROM_BLOCK || msg_type == CUDA_DDT_COMPLETE){ + struct iovec iov; + uint32_t iov_count = 1; + size_t max_data; + struct opal_convertor_t *convertor = 
my_cuda_dt_clone->unpack_convertor; + size_t pipeline_size = mca_btl_smcuda_component.cuda_ddt_pipeline_size; + convertor->flags &= ~CONVERTOR_CUDA; + unsigned char *remote_address = NULL; + if (opal_convertor_need_buffers(convertor) == false) { /* do not unpack */ + convertor->flags |= CONVERTOR_CUDA; + unsigned char *local_address = my_cuda_dt_clone->current_unpack_convertor_pBaseBuf; + remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; + opal_output(0, "no unpack, start D2D copy local %p, remote %p, size %ld, stream id %d\n", local_address, remote_address, packed_size, opal_cuda_get_cuda_stream()); + opal_cuda_set_cuda_stream(); + mca_common_cuda_memp2pcpy(local_address, (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq*pipeline_size, packed_size); + my_cuda_dt_clone->current_unpack_convertor_pBaseBuf += packed_size; + } else { /* unpack */ + convertor->flags |= CONVERTOR_CUDA; + opal_cuda_set_cuda_stream(); + if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { + convertor->gpu_buffer_ptr = opal_cuda_malloc_gpu_buffer(packed_size, 0); + remote_address = (unsigned char*)my_cuda_dt_clone->remote_gpu_address + seq * pipeline_size; + opal_cuda_d2dcpy_async(convertor->gpu_buffer_ptr, remote_address, packed_size); + iov.iov_base = convertor->gpu_buffer_ptr; + opal_output(0, "unpack, start D2D copy src %p, dst %p, size %lu, stream id %d\n", remote_address, convertor->gpu_buffer_ptr, packed_size, opal_cuda_get_cuda_stream()); + } else { + iov.iov_base = convertor->gpu_buffer_ptr + seq * pipeline_size; + } + max_data = packed_size; + iov.iov_len = packed_size; + opal_convertor_unpack(convertor, &iov, &iov_count, &max_data ); + if (!OPAL_DATATYPE_DIRECT_COPY_GPUMEM && my_cuda_dt_clone->remote_device != my_cuda_dt_clone->local_device) { + if (convertor->gpu_buffer_ptr != NULL) { + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); + convertor->gpu_buffer_ptr = 
NULL; + } + } + } + send_msg.seq = seq; + if (msg_type == CUDA_DDT_COMPLETE) { + send_msg.msg_type = CUDA_DDT_COMPLETE_ACK; + } else { + send_msg.msg_type = CUDA_DDT_PACK_TO_BLOCK; + } + mca_btl_smcuda_send_cuda_pack_sig(btl, endpoint, &send_msg); + } +} + +/* for sender */ +static void btl_smcuda_datatype_pack(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, void* cbdata) +{ + struct mca_btl_base_endpoint_t *endpoint = NULL; + cuda_ddt_hdr_t recv_msg; + mca_btl_base_segment_t* segments = des->des_segments; + memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_hdr_t)); + int seq = recv_msg.seq; + int lindex = recv_msg.lindex; + int msg_type = recv_msg.msg_type; + struct opal_convertor_t *convertor = recv_msg.pack_convertor; + mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; + cuda_ddt_hdr_t send_msg; + + /* We can find the endoint back from the rank embedded in the header */ + endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; + + uint32_t iov_count = 1; + int rv_dt = 0; + size_t max_data = 0; + size_t packed_size = 0; + + send_msg.lindex = lindex; + if (msg_type == CUDA_DDT_COMPLETE_ACK) { + send_msg.packed_size = 0; + send_msg.seq = -2; + send_msg.msg_type = CUDA_DDT_CLEANUP; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + if (convertor->gpu_buffer_ptr != NULL) { + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); + convertor->gpu_buffer_ptr = NULL; + } + } else if (msg_type == CUDA_DDT_PACK_TO_BLOCK) { + if (convertor->bConverted < convertor->local_size) { + struct iovec iov; + iov.iov_base = convertor->gpu_buffer_ptr + seq * mca_btl_smcuda_component.cuda_ddt_pipeline_size; + iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; + opal_cuda_set_cuda_stream(); + rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + packed_size = max_data; + send_msg.packed_size = packed_size; + send_msg.seq = seq; + if (rv_dt == 1) { + send_msg.msg_type 
= CUDA_DDT_COMPLETE; + } else { + send_msg.msg_type = CUDA_DDT_UNPACK_FROM_BLOCK; + } + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + } + } else if (msg_type == CUDA_DDT_PACK_START) { + struct iovec iov; + iov.iov_base = convertor->gpu_buffer_ptr; + iov.iov_len = mca_btl_smcuda_component.cuda_ddt_pipeline_size; + seq = 0; + while (rv_dt != 1 && convertor->gpu_buffer_size > 0) { + opal_cuda_set_cuda_stream(); + rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + iov.iov_base = (void*)((unsigned char*)iov.iov_base + mca_btl_smcuda_component.cuda_ddt_pipeline_size); + convertor->gpu_buffer_size -= mca_btl_smcuda_component.cuda_ddt_pipeline_size; + send_msg.packed_size = max_data; + send_msg.seq = seq; + if (rv_dt == 1) { + send_msg.msg_type = CUDA_DDT_COMPLETE; + } else { + send_msg.msg_type = CUDA_DDT_UNPACK_FROM_BLOCK; + } + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); + seq ++; + } + } else { + opal_output(0, "unknown message\n"); + } +} + +/* for sender */ +static void btl_smcuda_datatype_put(mca_btl_base_module_t* btl, + mca_btl_base_tag_t tag, + mca_btl_base_descriptor_t* des, void* cbdata) +{ + struct mca_btl_base_endpoint_t *endpoint = NULL; + cuda_ddt_put_hdr_t recv_msg; + mca_btl_base_segment_t* segments = des->des_segments; + memcpy(&recv_msg, segments->seg_addr.pval, sizeof(cuda_ddt_put_hdr_t)); + int lindex = recv_msg.lindex; + void *remote_address = recv_msg.remote_address; + void *remote_base = recv_msg.remote_base; + struct opal_convertor_t *convertor = recv_msg.pack_convertor; + mca_btl_smcuda_frag_t *frag = (mca_btl_smcuda_frag_t *)des; + cuda_ddt_hdr_t send_msg; + + /* We can find the endoint back from the rank embedded in the header */ + endpoint = mca_btl_smcuda_component.sm_peers[frag->hdr->my_smp_rank]; + + opal_cuda_free_gpu_buffer(convertor->gpu_buffer_ptr, 0); + mca_mpool_common_cuda_reg_t *rget_reg_ptr = NULL; + mca_mpool_common_cuda_reg_t rget_reg; + rget_reg_ptr= &rget_reg; + 
memset(&rget_reg, 0, sizeof(rget_reg)); + memcpy(rget_reg.data.memHandle, recv_msg.mem_handle, sizeof(recv_msg.mem_handle)); + cuda_openmemhandle(NULL, 0, (mca_mpool_base_registration_t *)&rget_reg, NULL); + size_t offset = (size_t) ((intptr_t)remote_address - (intptr_t)remote_base); + unsigned char *remote_memory_address = (unsigned char *)rget_reg_ptr->base.alloc_base + offset; + convertor->gpu_buffer_ptr = remote_memory_address; + opal_output(0, "smcuda start put, remote_memory_address $$$$$$ %p, r_addr %p, r_base %p\n", remote_memory_address, remote_address, remote_base); + convertor->gpu_buffer_size = convertor->local_size; + + struct iovec iov; + uint32_t iov_count = 1; + int rv_dt = 0; + size_t max_data = 0; + iov.iov_len = convertor->local_size; + iov.iov_base = convertor->gpu_buffer_ptr; + opal_cuda_set_cuda_stream(); + rv_dt = opal_convertor_pack(convertor, &iov, &iov_count, &max_data ); + assert(rv_dt == 1); + send_msg.lindex = lindex; + send_msg.packed_size = 0; + send_msg.seq = -2; + send_msg.msg_type = CUDA_DDT_CLEANUP; + mca_btl_smcuda_send_cuda_unpack_sig(btl, endpoint, &send_msg); +} + #endif /* OPAL_CUDA_SUPPORT */ /* @@ -960,6 +1169,13 @@ mca_btl_smcuda_component_init(int *num_btls, /* Register a smcuda control function to help setup IPC support */ mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbfunc = btl_smcuda_control; mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA].cbdata = NULL; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK].cbfunc = btl_smcuda_datatype_unpack; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK].cbdata = NULL; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbfunc = btl_smcuda_datatype_pack; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PACK].cbdata = NULL; + mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PUT].cbfunc = btl_smcuda_datatype_put; + 
mca_btl_base_active_message_trigger[MCA_BTL_TAG_SMCUDA_DATATYPE_PUT].cbdata = NULL; + #endif /* OPAL_CUDA_SUPPORT */ return btls; @@ -1116,6 +1332,10 @@ int mca_btl_smcuda_component_progress(void) &frag->base, status?OPAL_ERROR:OPAL_SUCCESS); } if( btl_ownership ) { + if (frag->hdr->tag == MCA_BTL_TAG_SMCUDA_DATATYPE_PACK) { + } + if (frag->hdr->tag == MCA_BTL_TAG_SMCUDA_DATATYPE_UNPACK) { + } MCA_BTL_SMCUDA_FRAG_RETURN(frag); } OPAL_THREAD_ADD32(&mca_btl_smcuda_component.num_outstanding_frags, -1); diff --git a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h index cead5ec7a5c..20936dbeac1 100644 --- a/opal/mca/btl/smcuda/btl_smcuda_endpoint.h +++ b/opal/mca/btl/smcuda/btl_smcuda_endpoint.h @@ -49,6 +49,9 @@ struct mca_btl_base_endpoint_t { opal_proc_t *proc_opal; /**< Needed for adding CUDA IPC support dynamically */ enum ipcState ipcstate; /**< CUDA IPC connection status */ int ipctries; /**< Number of times CUDA IPC connect was sent */ + cuda_ddt_clone_t *smcuda_ddt_clone; + int smcuda_ddt_clone_size; + int smcuda_ddt_clone_avail; #endif /* OPAL_CUDA_SUPPORT */ }; diff --git a/opal/mca/common/cuda/common_cuda.c b/opal/mca/common/cuda/common_cuda.c index 0afe0dd94a2..c358bcb7a57 100644 --- a/opal/mca/common/cuda/common_cuda.c +++ b/opal/mca/common/cuda/common_cuda.c @@ -1638,6 +1638,60 @@ int progress_one_cuda_htod_event(struct mca_btl_base_descriptor_t **frag) { return 0; } +int mca_common_cuda_create_event(uint64_t **event) +{ + CUresult result; + + result = cuFunc.cuEventCreate((CUevent *)event, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-mpi-common-cuda.txt", "cuEventCreate failed", + true, OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + +int mca_common_cuda_record_event(uint64_t *event) +{ + CUresult result; + result = cuFunc.cuEventRecord((CUevent)event,0); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + 
printf("record event error %d\n", result); + return OPAL_ERROR; + } else { + return OPAL_SUCCESS; + } +} + +int mca_common_cuda_query_event(uint64_t *event) +{ + CUresult result; + result = cuFunc.cuEventQuery((CUevent)event); + if (OPAL_UNLIKELY(CUDA_SUCCESS == result)) { + return OPAL_SUCCESS; + } else if (OPAL_UNLIKELY(CUDA_ERROR_NOT_READY == result)) { + return OPAL_ERROR; + } else { + printf("query event error %d\n", result); + return OPAL_ERROR; + } +} + +int mca_common_cuda_openeventhandle(uint64_t **event, int n, mca_mpool_common_cuda_reg_data_t *handle) +{ + // CUipcEventHandle evtHandle; + // CUresult result; + // mca_mpool_common_cuda_reg_data_t *cuda_handle = (mca_mpool_common_cuda_reg_data_t*)handle; + // memcpy(&evtHandle, &cuda_handle->pipeline_evtHandle[n*EVTHANDLE_SIZE], sizeof(evtHandle)); + // result = cuFunc.cuIpcOpenEventHandle((CUevent *)event, evtHandle); + // if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + // opal_show_help("help-mpi-common-cuda.txt", "cuIpcOpenEventHandle failed", + // true, result); + // return OPAL_ERROR; + // } + return OPAL_SUCCESS; +} + /** * Need to make sure the handle we are retrieving from the cache is still @@ -1846,7 +1900,9 @@ static int mca_common_cuda_is_gpu_buffer(const void *pUserBuf, opal_convertor_t if (!stage_three_init_complete) { if (0 != mca_common_cuda_stage_three_init()) { opal_cuda_support = 0; - } + } else { + opal_cuda_kernel_support_init(); + } } return 1; @@ -2001,6 +2057,19 @@ int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base) return 0; } +int mca_common_cuda_memp2pcpy(void *dest, const void *src, size_t size) +{ + CUresult result; + + result = cuFunc.cuMemcpy((CUdeviceptr)dest, (CUdeviceptr)src, size); + if (OPAL_UNLIKELY(CUDA_SUCCESS != result)) { + opal_show_help("help-mpi-common-cuda.txt", "cuMemcpy failed", + true, OPAL_PROC_MY_HOSTNAME, result); + return OPAL_ERROR; + } + return OPAL_SUCCESS; +} + #if OPAL_CUDA_GDR_SUPPORT /* Check to see if the memory was freed 
between the time it was stored in * the registration cache and now. Return true if the memory was previously diff --git a/opal/mca/common/cuda/common_cuda.h b/opal/mca/common/cuda/common_cuda.h index c0cd59c359b..e0b511fa48b 100644 --- a/opal/mca/common/cuda/common_cuda.h +++ b/opal/mca/common/cuda/common_cuda.h @@ -28,12 +28,19 @@ #define MEMHANDLE_SIZE 8 #define EVTHANDLE_SIZE 8 +typedef struct { + uint64_t evtHandle[EVTHANDLE_SIZE]; +}cuIPCHandle_t; + struct mca_mpool_common_cuda_reg_data_t { uint64_t memHandle[MEMHANDLE_SIZE]; uint64_t evtHandle[EVTHANDLE_SIZE]; uint64_t event; opal_ptr_t memh_seg_addr; size_t memh_seg_len; + uint8_t pack_required; + int32_t gpu_device; + struct opal_convertor_t *pack_convertor; }; typedef struct mca_mpool_common_cuda_reg_data_t mca_mpool_common_cuda_reg_data_t; @@ -86,6 +93,11 @@ OPAL_DECLSPEC int mca_common_cuda_device_can_access_peer(int *access, int dev1, OPAL_DECLSPEC int mca_common_cuda_stage_one_init(void); OPAL_DECLSPEC int mca_common_cuda_get_address_range(void *pbase, size_t *psize, void *base); OPAL_DECLSPEC void mca_common_cuda_fini(void); +OPAL_DECLSPEC int mca_common_cuda_create_event(uint64_t **event); +OPAL_DECLSPEC int mca_common_cuda_record_event(uint64_t *event); +OPAL_DECLSPEC int mca_common_cuda_query_event(uint64_t *event); +OPAL_DECLSPEC int mca_common_cuda_openeventhandle(uint64_t **event, int n, mca_mpool_common_cuda_reg_data_t *handle); +OPAL_DECLSPEC int mca_common_cuda_memp2pcpy(void *dest, const void *src, size_t size); #if OPAL_CUDA_GDR_SUPPORT OPAL_DECLSPEC bool mca_common_cuda_previously_freed_memory(mca_mpool_base_registration_t *reg); OPAL_DECLSPEC void mca_common_cuda_get_buffer_id(mca_mpool_base_registration_t *reg); diff --git a/opal/mca/mpool/gpusm/mpool_gpusm.h b/opal/mca/mpool/gpusm/mpool_gpusm.h index 537c95108a8..5d3d02b5110 100644 --- a/opal/mca/mpool/gpusm/mpool_gpusm.h +++ b/opal/mca/mpool/gpusm/mpool_gpusm.h @@ -41,8 +41,8 @@ struct mca_mpool_gpusm_registration_t { uint64_t 
evtHandle[EVTHANDLE_SIZE]; /* CUipcEventHandle */ uintptr_t event; /* CUevent */ }; -typedef struct mca_mpool_gpusm_registration_t mca_mpool_gpusm_registration_t; -OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_mpool_gpusm_registration_t); +typedef struct mca_mpool_gpusm_registration_t mca_mpool_gpusm_registration_t; +OPAL_DECLSPEC OBJ_CLASS_DECLARATION(mca_mpool_gpusm_registration_t); struct mca_mpool_gpusm_component_t { mca_mpool_base_component_t super; diff --git a/opal/mca/mpool/gpusm/mpool_gpusm_module.c b/opal/mca/mpool/gpusm/mpool_gpusm_module.c index 98740bbdcde..50dcbc859fb 100644 --- a/opal/mca/mpool/gpusm/mpool_gpusm_module.c +++ b/opal/mca/mpool/gpusm/mpool_gpusm_module.c @@ -49,7 +49,7 @@ static void mca_mpool_gpusm_registration_constructor( mca_mpool_gpusm_registration_t *item ) { mca_common_cuda_construct_event_and_handle(&item->event, - (void *)&item->evtHandle); + (void *)item->evtHandle); } /** diff --git a/test/datatype/Makefile.am b/test/datatype/Makefile.am index d870e6902e0..3dd69732ba8 100644 --- a/test/datatype/Makefile.am +++ b/test/datatype/Makefile.am @@ -14,7 +14,7 @@ # if PROJECT_OMPI - MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw unpack_ooo ddt_pack + MPI_TESTS = checksum position position_noncontig ddt_test ddt_raw unpack_ooo ddt_pack ddt_benchmark MPI_CHECKS = to_self endif TESTS = opal_datatype_test $(MPI_TESTS) @@ -28,10 +28,18 @@ unpack_ooo_LDADD = \ $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la ddt_test_SOURCES = ddt_test.c ddt_lib.c ddt_lib.h -ddt_test_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) -ddt_test_LDADD = \ - $(top_builddir)/ompi/libmpi.la \ - $(top_builddir)/opal/lib@OPAL_LIB_PREFIX@open-pal.la +ddt_test_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) +ddt_test_CFLAGS = -I/mnt/sw/cuda/include -g -O0 +ddt_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/mnt/sw/cuda/lib64 -lcudart + +ddt_benchmark_SOURCES = ddt_benchmark.c ddt_lib.c ddt_lib.h +ddt_benchmark_LDFLAGS 
= $(WRAPPER_EXTRA_LDFLAGS) +ddt_benchmark_CFLAGS = -I/mnt/sw/cuda/include -g -O0 +ddt_benchmark_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/mca/common/cuda/libmca_common_cuda.la -L/shared/apps/cuda/CUDA-v7.5.18/lib64 -lcudart + +#ddt_test_old_SOURCES = ddt_test_old.c ddt_lib.c ddt_lib.h +#ddt_test_old_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) +#ddt_test_old_LDADD = $(top_builddir)/ompi/libmpi.la ddt_raw_SOURCES = ddt_raw.c ddt_lib.c ddt_lib.h ddt_raw_LDFLAGS = $(OMPI_PKG_CONFIG_LDFLAGS) diff --git a/test/datatype/ddt_benchmark.c b/test/datatype/ddt_benchmark.c new file mode 100644 index 00000000000..de3f43a8759 --- /dev/null +++ b/test/datatype/ddt_benchmark.c @@ -0,0 +1,1501 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana + * University Research and Technology + * Corporation. All rights reserved. + * Copyright (c) 2004-2009 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart, + * University of Stuttgart. All rights reserved. + * Copyright (c) 2004-2006 The Regents of the University of California. + * All rights reserved. + * Copyright (c) 2006 Sun Microsystems Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "ddt_lib.h" +#include "opal/runtime/opal.h" +#include "opal/datatype/opal_convertor.h" +#include <time.h> +#include <stdlib.h> +#ifdef HAVE_SYS_TIME_H +#include <sys/time.h> +#endif +#include <stdio.h> +#include <string.h> + +#define DDT_TEST_CUDA +#define CUDA_MEMCPY_2D_D2H + + +#include <cuda_runtime.h> +#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/runtime/opal_params.h" +#define CONVERTOR_CUDA 0x00400000 + + +/* Compile with: +mpicc -DHAVE_CONFIG_H -I. -I../../include -I../../../ompi-trunk/include -I../..
-I../../include -I../../../ompi-trunk/opal -I../../../ompi-trunk/orte -I../../../ompi-trunk/ompi -g ddt_test.c -o ddt_test +*/ + +#define TIMER_DATA_TYPE struct timeval +#define GET_TIME(TV) gettimeofday( &(TV), NULL ) +#define ELAPSED_TIME(TSTART, TEND) (((TEND).tv_sec - (TSTART).tv_sec) * 1000000 + ((TEND).tv_usec - (TSTART).tv_usec)) + +#define DUMP_DATA_AFTER_COMMIT 0x00000001 +#define CHECK_PACK_UNPACK 0x00000002 + +uint32_t remote_arch = 0xffffffff; + +static int test_upper( unsigned int length ) +{ + double *mat1, *mat2, *inbuf; + ompi_datatype_t *pdt; + opal_convertor_t * pConv; + char *ptr; + int rc; + unsigned int i, j, iov_count, split_chunk, total_length; + size_t max_data; + struct iovec a; + TIMER_DATA_TYPE start, end; + long total_time; + + printf( "test upper matrix\n" ); + pdt = upper_matrix( length ); + /*dt_dump( pdt );*/ + + mat1 = malloc( length * length * sizeof(double) ); + init_random_upper_matrix( length, mat1 ); + mat2 = calloc( length * length, sizeof(double) ); + + total_length = length * (length + 1) * ( sizeof(double) / 2); + inbuf = (double*)malloc( total_length ); + ptr = (char*)inbuf; + /* copy upper matrix in the array simulating the input buffer */ + for( i = 0; i < length; i++ ) { + uint32_t pos = i * length + i; + for( j = i; j < length; j++, pos++ ) { + *inbuf = mat1[pos]; + inbuf++; + } + } + inbuf = (double*)ptr; + pConv = opal_convertor_create( remote_arch, 0 ); + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( pConv, &(pdt->super), 1, mat2 ) ) { + printf( "Cannot attach the datatype to a convertor\n" ); + return OMPI_ERROR; + } + + cudaDeviceSynchronize(); + + GET_TIME( start ); + split_chunk = (length + 1) * sizeof(double); + /* split_chunk = (total_length + 1) * sizeof(double); */ + for( i = total_length; i > 0; ) { + if( i <= split_chunk ) { /* equal test just to be able to set a breakpoint */ + split_chunk = i; + } + a.iov_base = ptr; + a.iov_len = split_chunk; + iov_count = 1; + max_data = split_chunk; + 
opal_convertor_unpack( pConv, &a, &iov_count, &max_data ); + ptr += max_data; + i -= max_data; + if( mat2[0] != inbuf[0] ) assert(0); + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "complete unpacking in %ld microsec\n", total_time ); + free( inbuf ); + rc = check_diag_matrix( length, mat1, mat2 ); + free( mat1 ); + free( mat2 ); + + /* test the automatic destruction pf the data */ + ompi_datatype_destroy( &pdt ); assert( pdt == NULL ); + + OBJ_RELEASE( pConv ); + return rc; +} + +/** + * Computing the correct buffer length for moving a multiple of a datatype + * is not an easy task. Define a function to centralize the complexity in a + * single location. + */ +static size_t compute_buffer_length(ompi_datatype_t* pdt, int count) +{ + MPI_Aint extent, lb, true_extent, true_lb; + size_t length; + + ompi_datatype_get_extent(pdt, &lb, &extent); + ompi_datatype_get_true_extent(pdt, &true_lb, &true_extent); (void)true_lb; + length = true_lb + true_extent + (count - 1) * extent; + + return length; +} + +/** + * Conversion function. They deal with data-types in 3 ways, always making local copies. + * In order to allow performance testings, there are 3 functions: + * - one copying directly from one memory location to another one using the + * data-type copy function. + * - one which use a 2 convertors created with the same data-type + * - and one using 2 convertors created from different data-types. 
+ * + */ +static int local_copy_ddt_count( ompi_datatype_t* pdt, int count ) +{ + void *pdst, *psrc; + TIMER_DATA_TYPE start, end; + long total_time; + size_t length; + + length = compute_buffer_length(pdt, count); + + pdst = malloc(length); + psrc = malloc(length); + + for( size_t i = 0; i < length; i++ ) + ((char*)psrc)[i] = i % 128 + 32; + memset(pdst, 0, length); + + cache_trash(); /* make sure the cache is useless */ + + GET_TIME( start ); + if( OMPI_SUCCESS != ompi_datatype_copy_content_same_ddt( pdt, count, pdst, psrc ) ) { + printf( "Unable to copy the datatype in the function local_copy_ddt_count." + " Is the datatype committed ?\n" ); + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "direct local copy in %ld microsec\n", total_time ); + free(pdst); + free(psrc); + + return OMPI_SUCCESS; +} + +static void fill_vectors(double* vp, int itera, int contig, int gap) +{ + int i, j; + for (i = 0; i < itera-1; i++ ){ + for (j = i*gap; j < (i+1)*gap; j++) { + if (j >= i*gap && j < i*gap+contig) { + vp[j] = 1.1; + } else { + vp[j] = 0; + } + } + } + for (i = (itera-1)*gap; i < (itera-1)*gap+contig; i++) { + vp[i] = 1.1; + } + /* + printf("vector generated:\n"); + for (i = 0; i < (itera-1)*gap+contig; i++) { + printf("%1.f ", vp[i]); + if ((i+1) % gap == 0) printf("\n"); + } + printf("\n");*/ +} + +static void verify_vectors(double *vp, int itera, int contig, int gap) +{ + int i, j; + int error = 0; + int count = 0; + for (i = 0; i < itera-1; i++) { + for (j = i*gap; j < (i+1)*gap; j++) { + if (j >= i*gap && j < i*gap+contig) { + if (vp[j] != 1.1) { + error ++; + } + count ++; + } + } + } + for (i = (itera-1)*gap; i < (itera-1)*gap+contig; i++) { + if (vp[i] != 1.1) { + error ++; + } + count ++; + } +/* + printf("vector received:\n"); + for (i = 0; i < (itera-1)*gap+contig; i++) { + printf("%1.f ", vp[i]); + if ((i+1) % gap == 0) printf("\n"); + } + */ + if (error != 0) { + printf("%d errors out of %d\n", error, count); + } else { + 
printf("no errors out of %d\n", count); + } +} + +static int +vector_ddt( ompi_datatype_t* send_type, int send_count, + ompi_datatype_t* recv_type, int recv_count, + int chunk, int itera, int contig, int gap ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *psrc_host = NULL, *pdst_host = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0, push_time = 0, pop_time = 0, pack_time = 0; + size_t slength, rlength; + int shift_n = 0; + + rlength = compute_buffer_length(recv_type, recv_count) + sizeof(double)*shift_n; + slength = compute_buffer_length(send_type, send_count) + sizeof(double)*shift_n; + + cudaSetDevice(0); + + cudaError_t error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + psrc += sizeof(double)*shift_n; + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + pdst += sizeof(double)*shift_n; + printf("cudamalloc pdst %p\n", pdst); + + // error = cudaHostAlloc((void **)&ptemp, chunk, cudaHostAllocMapped); + error = cudaMallocHost((void **)&ptemp, chunk); + //ptemp = malloc(chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + ptemp += sizeof(double)*shift_n; + printf("cudamallochost ptemp %p\n", ptemp); + + + error = cudaMallocHost((void **)&psrc_host, slength); + error = cudaMallocHost((void **)&pdst_host, rlength); + // psrc_host = malloc(slength); + // pdst_host = malloc(rlength); + printf("cudamallochost phost \n"); + + memset(psrc_host, 0, slength); + 
memset(pdst_host, 0, rlength); + pdst_host += sizeof(double)*shift_n; + psrc_host += sizeof(double)*shift_n; + slength -= sizeof(double)*shift_n; + rlength -= sizeof(double)*shift_n; + if (itera > 0) { + fill_vectors((double *)psrc_host, itera, contig, gap); + } + cudaMemcpy(psrc, psrc_host, slength, cudaMemcpyHostToDevice); + + + send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } +#else + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc_host ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } +#endif + recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } +#else + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst_host ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } +#endif + + cache_trash(); /* make sure the cache is useless */ + cudaDeviceSynchronize(); + + GET_TIME( start ); +#if !defined (DDT_TEST_CUDA) + GET_TIME( unpack_start ); + cudaMemcpy(psrc_host, psrc, slength, cudaMemcpyDeviceToHost); + GET_TIME( unpack_end ); + push_time = ELAPSED_TIME( unpack_start, unpack_end ); +#endif + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! 
the send is %s but the receive is %s in local_copy_with_convertor_2datatypes\n", + (done1 ? "finish" : "not finish"), + (done2 ? "finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + // done1 = 1; + } + + // int i,j = 0; + // printf("buffer received\n"); + // double *mat_temp = (double*)ptemp; + // for (i = 0; i < itera; i++) { + // for (j = 0; j < contig; j++) { + // printf(" %1.f ", mat_temp[i*itera+j]); + // } + // printf("\n"); + // } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } +#if !defined (DDT_TEST_CUDA) + GET_TIME( unpack_start ); + cudaMemcpy(pdst, pdst_host, rlength, cudaMemcpyHostToDevice); + GET_TIME( unpack_end ); + pop_time = ELAPSED_TIME( unpack_start, unpack_end ); +#endif + + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + pack_time = total_time - unpack_time - push_time - pop_time; + printf( "copying different data-types using convertors in %ld microsec, p&up in %ld \n", total_time, pack_time+unpack_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec], push in %ld microsec, pop in %ld microsec\n", unpack_time, + pack_time, push_time, pop_time); + + memset(pdst_host, 0, slength); + cudaMemcpy(pdst_host, pdst, rlength, cudaMemcpyDeviceToHost); + if (itera > 0) { + verify_vectors((double *)pdst_host, itera, contig, gap); + } + + clean_and_return: + if( send_convertor != NULL ) { + OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); + } + if( recv_convertor != NULL ) { + OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); + } + + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) 
cudaFreeHost( ptemp ); + if( NULL != psrc_host ) cudaFreeHost( psrc_host ); + if( NULL != pdst_host ) cudaFreeHost( pdst_host ); + + return OMPI_SUCCESS; +} + +static int +vector_ddt_2d( ompi_datatype_t* send_type, int send_count, + ompi_datatype_t* recv_type, int recv_count, + int chunk, int itera, int contig, int gap ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *psrc_host = NULL, *pdst_host = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0, push_time = 0, pop_time = 0, pack_time = 0; + size_t slength, rlength; + + rlength = compute_buffer_length(recv_type, recv_count); + slength = compute_buffer_length(send_type, send_count); + + cudaSetDevice(2); + + cudaError_t error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + + error = cudaMallocHost((void **)&psrc_host, slength); + error = cudaMallocHost((void **)&pdst_host, rlength); + printf("cudamallochost phost \n"); + + memset(psrc_host, 0, slength); + memset(pdst_host, 0, rlength); + if (itera > 0) { + fill_vectors((double *)psrc_host, itera, contig, gap); + } + cudaMemcpy(psrc, psrc_host, slength, cudaMemcpyHostToDevice); + + + GET_TIME( start ); + //cudaMemcpy2D(pdst, 
contig*sizeof(double), psrc, gap*sizeof(double), contig*sizeof(double), itera, cudaMemcpyDeviceToDevice); + cudaMemcpy2D(psrc_host, contig*sizeof(double), psrc, gap*sizeof(double), contig*sizeof(double), itera, cudaMemcpyDeviceToHost); + GET_TIME( end ); + pop_time = ELAPSED_TIME( start, end ); + + GET_TIME( start ); + cudaMemcpy2D(pdst, gap*sizeof(double), psrc_host, contig*sizeof(double), contig*sizeof(double), itera, cudaMemcpyHostToDevice); + GET_TIME( end ); + push_time = ELAPSED_TIME( start, end ); + + printf( "MEMCPY2D D2H %ld microseconds, H2D %ld microseconds, size %ld\n", pop_time, push_time, contig*sizeof(double)*itera); + + memset(pdst_host, 0, slength); + cudaMemcpy(pdst_host, pdst, rlength, cudaMemcpyDeviceToHost); + if (itera > 0) { + verify_vectors((double *)pdst_host, itera, contig, gap); + } + /* D2D D2H */ + if (itera > 0) { + fill_vectors((double *)psrc_host, itera, contig, gap); + } + cudaMemcpy(psrc, psrc_host, slength, cudaMemcpyHostToDevice); + + + GET_TIME( start ); + cudaMemcpy2D(pdst, contig*sizeof(double), psrc, gap*sizeof(double), contig*sizeof(double), itera, cudaMemcpyDeviceToDevice); + GET_TIME( end ); + pack_time = ELAPSED_TIME( start, end ); + + GET_TIME( start ); + cudaMemcpy(psrc_host, pdst, contig*sizeof(double)*itera, cudaMemcpyDeviceToHost); + GET_TIME( end ); + pop_time = ELAPSED_TIME( start, end ); + + GET_TIME( start ); + cudaMemcpy(psrc, psrc_host, contig*sizeof(double)*itera, cudaMemcpyHostToDevice); + GET_TIME( end ); + push_time = ELAPSED_TIME( start, end ); + + GET_TIME( start ); + cudaMemcpy2D(pdst, gap*sizeof(double), psrc, contig*sizeof(double), contig*sizeof(double), itera, cudaMemcpyDeviceToDevice); + GET_TIME( end ); + unpack_time = ELAPSED_TIME( start, end ); + + printf( "MEMCPY2D D2H %ld microseconds, H2D %ld microseconds, pack in %ld, unpack in %ld, size %lu \n", pop_time, push_time, pack_time, unpack_time, contig*sizeof(double)*itera); + + memset(pdst_host, 0, slength); + cudaMemcpy(pdst_host, pdst, rlength, 
cudaMemcpyDeviceToHost); + if (itera > 0) { + verify_vectors((double *)pdst_host, itera, contig, gap); + } + + + clean_and_return: + if( send_convertor != NULL ) { + OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); + } + if( recv_convertor != NULL ) { + OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); + } + + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != psrc_host ) cudaFreeHost( psrc_host ); + if( NULL != pdst_host ) cudaFreeHost( pdst_host ); + + return OMPI_SUCCESS; +} + + +static int +local_copy_with_convertor_2datatypes_struct( ompi_datatype_t* send_type, int send_count, + ompi_datatype_t* recv_type, int recv_count, + int chunk, int count) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0; + size_t slength, rlength; + + rlength = compute_buffer_length(recv_type, recv_count); + slength = compute_buffer_length(send_type, send_count); + +#if defined (DDT_TEST_CUDA) + cudaSetDevice(0); +#endif + +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + 
printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, slength); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc( rlength ); + psrc = malloc( slength ); + ptemp = malloc( chunk ); + + /* initialize the buffers to prevent valgrind from complaining */ + for( size_t i = 0; i < slength; i++ ) + ((char*)psrc)[i] = i % 128 + 32; + memset(pdst, 0, rlength); +#endif + +#if defined (DDT_TEST_CUDA) + + cudaMemcpy(psrc, phost, slength, cudaMemcpyHostToDevice); +#else + +#endif + + send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + cache_trash(); /* make sure the cache is useless */ + + GET_TIME( start ); + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor_2datatypes\n", + (done1 ? "finish" : "not finish"), + (done2 ? 
"finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "copying different data-types using convertors in %ld microsec\n", total_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, + total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, slength); + cudaMemcpy(phost, pdst, rlength, cudaMemcpyDeviceToHost); + +#else + +#endif + clean_and_return: + if( send_convertor != NULL ) { + OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); + } + if( recv_convertor != NULL ) { + OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); + } +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else + if( NULL != pdst ) free( pdst ); + if( NULL != psrc ) free( psrc ); + if( NULL != ptemp ) free( ptemp ); +#endif + return OMPI_SUCCESS; +} + + +static void fill_upper_matrix(void *matt, int msize) +{ + int i, j, start, end; + int *blklens, *displs; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + blklens = (int *)malloc(sizeof(int)*msize); + displs = (int *)malloc(sizeof(int)*msize); + for (i = 0; i < msize; i++) { + blklens[i] = msize - i; + displs[i] = i*msize + i; + } + /*int ct = 0; + for (i = 0; i < msize; i++) { + 
blklens[i] = msize - ct*160; + displs[i] = i*msize + ct*160; + if (i % 160 == 0 && i != 0) { + ct++; + } + }*/ + for (i = 0; i < msize; i++) { + start = displs[i]; + end = start + blklens[i]; + for (j = start; j < end; j++) { +#if defined (TEST_CHAR) + mat[j] = 'a'; +#else + mat[j] = 0.0 + i; +#endif + } + } + free(blklens); + free(displs); + + /* + printf("matrix generate\n"); + for (i = 0; i < msize; i++) { + for (j = 0; j < msize; j++) { + printf(" %1.f ", mat[i*msize+j]); + } + printf("\n"); + }*/ +} + +static void verify_mat_result(void *matt, int msize) +{ + int *blklens, *displs; + int i, j, error = 0; + int start, end; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + blklens = (int *)malloc(sizeof(int)*msize); + displs = (int *)malloc(sizeof(int)*msize); + for (i = 0; i < msize; i++) { + blklens[i] = msize - i; + displs[i] = i*msize + i; + } + /*int ct = 0; + for (i = 0; i < msize; i++) { + blklens[i] = msize - ct*160; + displs[i] = i*msize + ct*160; + if (i % 160 == 0 && i != 0) { + ct++; + } + }*/ + for (i = 0; i < msize; i++) { + start = displs[i]; + end = start + blklens[i]; + for (j = start; j < end; j++) { +#if defined (TEST_CHAR) + if (mat[j] != 'a') { +#else + if (mat[j] != (0.0+i)) { +#endif + error ++; + } + } + } + free(blklens); + free(displs); + /* + printf("matrix received\n"); + for (i = 0; i < msize; i++) { + for (j = 0; j < msize; j++) { + printf(" %1.f ", mat[i*msize+j]); + } + printf("\n"); + } + */ + if (error != 0) { + printf("error is found %d\n", error); + } else { + printf("no error is found\n"); + } +} + +static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk, int msize ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + 
size_t max_data, dt_length; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0; + int j, t_error = 0; + unsigned char *mat_char; + int shift_n = 0; + + dt_length = compute_buffer_length(pdt, count) + sizeof(double) * shift_n; + printf("length %lu\n", dt_length); + +#if defined (DDT_TEST_CUDA) + cudaSetDevice(0); +#endif + +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + psrc += sizeof(double) * shift_n; + cudaMemset(psrc, 0, dt_length); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + pdst += sizeof(double) * shift_n; + cudaMemset(pdst, 0, dt_length); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + ptemp += sizeof(double) * shift_n; + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + phost += sizeof(double) * shift_n; + memset(phost, 0, dt_length); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc(dt_length); + psrc = malloc(dt_length); + ptemp = malloc(chunk); + + for( int i = 0; i < length; ((char*)psrc)[i] = i % 128 + 32, i++ ); + memset( pdst, 0, length ); +#endif + +#if defined (DDT_TEST_CUDA) + dt_length -= sizeof(double) * shift_n; + if (msize > 0) { + fill_upper_matrix(phost, msize); + } + cudaMemcpy(psrc, phost, dt_length, cudaMemcpyHostToDevice); +#else + if (msize > 0) { + fill_upper_matrix(psrc, msize); + } +#endif + + send_convertor = 
opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(pdt->super), count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(pdt->super), count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + cache_trash(); /* make sure the cache is useless */ + cudaDeviceSynchronize(); + + GET_TIME( start ); + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor\n", + (done1 ? "finish" : "not finish"), + (done2 ? 
"finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + + } +#if defined (TEST_CHAR) + mat_char = (unsigned char *)ptemp; + for (j = 0; j < max_data; j++) { + if (mat_char[j] != 'a') { + t_error ++; + printf("error %d, %c\n", j, mat_char[j]); + } + } + printf("total error %d\n", t_error); +#endif + /* double *mat_d = (double *)ptemp; + for (j = 0; j < max_data/sizeof(double); j++) { + printf("%1.f ", mat_d[j]); + }*/ + // printf("max data %d, ptemp %p \n", max_data, ptemp); + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "copying same data-type using convertors in %ld microsec\n", total_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, + total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, dt_length); + cudaMemcpy(phost, pdst, dt_length, cudaMemcpyDeviceToHost); + if (msize > 0) { + verify_mat_result(phost, msize); + } +#else + if (msize > 0) { + verify_mat_result(pdst, msize); + } +#endif +clean_and_return: + if( NULL != send_convertor ) OBJ_RELEASE( send_convertor ); + if( NULL != recv_convertor ) OBJ_RELEASE( recv_convertor ); + +#if defined (DDT_TEST_CUDA) + psrc -= sizeof(double) * shift_n; + pdst -= sizeof(double) * shift_n; + ptemp -= sizeof(double) * shift_n; + phost -= sizeof(double) * shift_n; + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else + if( NULL != pdst ) free( pdst ); + if( NULL != psrc ) free( psrc ); + if( NULL != ptemp ) free( ptemp 
); +#endif + return OMPI_SUCCESS; +} + +static void fill_matrix(void *matt, int msize) +{ + int i, j; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + for (i = 0; i < msize*msize; i++) { + mat[i] = i; + } + + // printf("matrix generate\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } +} + +static void verify_mat(void *matt, int msize) +{ + int i, j, error = 0; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + for (i = 0; i < msize*msize; i++) { +#if defined (TEST_CHAR) + if (mat[i] != 'a') { +#else + if (mat[i] != (0.0+i)) { +#endif + error ++; + } + } + + // printf("matrix received\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } + + if (error != 0) { + printf("error is found %d\n", error); + } else { + printf("no error is found\n"); + } +} + +static int local_copy_with_convertor_mat( ompi_datatype_t* pdt, int count, int chunk, int msize ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data, dt_length; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0; + + dt_length = compute_buffer_length(pdt, count); + printf("length %lu\n", dt_length); + +#if defined (DDT_TEST_CUDA) + cudaSetDevice(0); +#endif + +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, dt_length); + if ( error != cudaSuccess) { + printf("CUDA 
error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, dt_length); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, dt_length); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, dt_length); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc(dt_length); + psrc = malloc(dt_length); + ptemp = malloc(chunk); + + for( int i = 0; i < length; ((char*)psrc)[i] = i % 128 + 32, i++ ); + memset( pdst, 0, length ); +#endif + +#if defined (DDT_TEST_CUDA) + if (msize > 0) { + fill_matrix(phost, msize); + } + cudaMemcpy(psrc, phost, dt_length, cudaMemcpyHostToDevice); +#else + if (msize > 0) { + // fill_upper_matrix(psrc, msize); + } +#endif + + send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(pdt->super), count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(pdt->super), count, pdst ) ) { + printf( "Unable to create the recv convertor. 
Is the datatype committed ?\n" ); + goto clean_and_return; + } + + cache_trash(); /* make sure the cache is useless */ + cudaDeviceSynchronize(); + + GET_TIME( start ); + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor\n", + (done1 ? "finish" : "not finish"), + (done2 ? "finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } + + // int i,j = 0; + // printf("buffer received\n"); + // double *mat_temp = (double*)ptemp; + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat_temp[i*msize+j]); + // } + // printf("\n"); + // } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "copying same data-type using convertors in %ld microsec\n", total_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, + total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, dt_length); + cudaMemcpy(phost, pdst, dt_length, cudaMemcpyDeviceToHost); + if (msize > 0) { + verify_mat(phost, msize); + } +#else + if (msize > 0) { +// verify_mat_result(pdst, msize); + } +#endif +clean_and_return: + if( NULL != send_convertor ) OBJ_RELEASE( send_convertor ); + if( NULL != recv_convertor ) OBJ_RELEASE( recv_convertor ); + +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else + if( 
NULL != pdst ) free( pdst ); + if( NULL != psrc ) free( psrc ); + if( NULL != ptemp ) free( ptemp ); +#endif + return OMPI_SUCCESS; +} + +/** + * Main function. Call several tests and print-out the results. It try to stress the convertor + * using difficult data-type constructions as well as strange segment sizes for the conversion. + * Usually, it is able to detect most of the data-type and convertor problems. Any modifications + * on the data-type engine should first pass all the tests from this file, before going into other + * tests. + */ +int main( int argc, char* argv[] ) +{ + ompi_datatype_t *pdt, *pdt1, *pdt2, *pdt3; + int rc, length = 500, i; + +#if defined (DDT_TEST_CUDA) + opal_cuda_support = 1; +#endif + opal_init_util(&argc, &argv); +#if defined (DDT_TEST_CUDA) + mca_common_cuda_stage_one_init(); +#endif + ompi_datatype_init(); + + /** + * By default simulate homogeneous architectures. + */ + remote_arch = opal_local_arch; +/* printf( "\n\n#\n * TEST INVERSED VECTOR\n #\n\n" ); + pdt = create_inversed_vector( &ompi_mpi_int.dt, 10 ); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 100); + local_copy_with_convertor(pdt, 100, 956); + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( "\n\n#\n * TEST STRANGE DATATYPE\n #\n\n" ); + pdt = create_strange_dt(); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 1); + local_copy_with_convertor(pdt, 1, 956); + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); +*/ + printf("\n TEST STRUCT \n"); + pdt = create_struct_type(5); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 1; i <= 1; i++) { + // local_copy_with_convertor_2datatypes_struct(pdt, 1, pdt, 1, 1024*1024*100, 5); + } + } + + printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); + int mat_size = 500; + for (mat_size = 4000; mat_size <= 4000; mat_size +=1000) { + pdt = upper_matrix(mat_size); + printf("----matrix size %d-----\n", mat_size); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i 
= 1; i <= 5; i++) { + // local_copy_with_convertor(pdt, 1, 200000000, mat_size); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + + ompi_datatype_t *column, *matt; + mat_size = 1000; + // ompi_datatype_create_vector( mat_size, 1, mat_size, MPI_DOUBLE, &column ); + // ompi_datatype_create_hvector( mat_size, 1, sizeof(double), column, &matt ); + // ompi_datatype_commit( &matt ); + // local_copy_with_convertor_mat(matt, 1, 200000000, mat_size); + + + int packed_size = 256; + int blk_len = 4; + int blk_count; + + while (packed_size <= 8388608) { + blk_count = packed_size / blk_len / sizeof(double); + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type packed size %d, blk %d, count %d\n", packed_size, blk_len, blk_count ); + pdt = create_vector_type( MPI_DOUBLE, blk_count, blk_len, 128+blk_len ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*30, blk_count, blk_len, 128+blk_len ); + } + } + packed_size *= 2; + } + + packed_size = 256; + blk_len = 16; + while (packed_size <= 8388608) { + blk_count = packed_size / blk_len / sizeof(double); + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type packed size %d, blk %d, count %d\n", packed_size, blk_len, blk_count ); + pdt = create_vector_type( MPI_DOUBLE, blk_count, blk_len, 128+blk_len ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*30, blk_count, blk_len, 128+blk_len ); + } + } + packed_size *= 2; + } + + packed_size = 1024; + blk_len = 64; + while (packed_size <= 8388608) { + blk_count = packed_size / blk_len / sizeof(double); + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type packed size %d, blk %d, count %d\n", packed_size, blk_len, blk_count ); + pdt = create_vector_type( MPI_DOUBLE, blk_count, blk_len, 128+blk_len ); + if( outputFlags & CHECK_PACK_UNPACK ) 
{ + for (i = 0; i < 4; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*30, blk_count, blk_len, 128+blk_len ); + // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*30, blk_count, blk_len, 128+blk_len ); + } + } + packed_size *= 2; + } + + + for (blk_len = 1000; blk_len <= 4000; blk_len += 2000) { + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (1024 times %d double stride 512)\n", blk_len ); + pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , blk_len, blk_len, blk_len*2); + // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + + + for (blk_len = 4; blk_len <= 64; blk_len += 2) { + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (60000 times %d double stride 512)\n", blk_len ); + pdt = create_vector_type( MPI_DOUBLE, 8000, blk_len, blk_len+128); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , 8000, blk_len, blk_len+128); + // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + + for (blk_len = 51; blk_len <= 51; blk_len += 500) { + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (60000 times %d double stride 512)\n", blk_len ); + pdt = create_vector_type( MPI_DOUBLE, blk_len, blk_len, blk_len*2); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 1; i++) { + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*100 , blk_len, blk_len, blk_len*2); + // vector_ddt_2d( pdt, 1, pdt, 1, 1024*1024*100 , 8192, blk_len, blk_len+128); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + + /* + for (blk_len = 4; blk_len <= 32; blk_len += 1) { + printf( 
">>--------------------------------------------<<\n" ); + printf( "Vector data-type (4000 times %d double stride 512)\n", blk_len ); + pdt = create_vector_type( MPI_DOUBLE, 1000, blk_len, blk_len+64); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + vector_ddt( pdt, 1, pdt, 1, 1024*1024*200 , 1000, blk_len, blk_len+64); + } + } + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + } + */ + + printf( "Vector data-type (4000 times 256 double stride 384)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 4000, 256, 384 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 4; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // vector_ddt( pdt, 1, pdt, 1, 1024*1024*200, 4000, 256, 384 ); + } + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( "Vector data-type (4000 times 128 double stride 256)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 4000, 128, 256 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 1; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*5 ); + } + } + printf( 
">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( "Vector data-type (2000 times 3 double stride 4)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 2000, 3, 4 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*4 ); + } + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + /* + printf( ">>--------------------------------------------<<\n" ); + pdt = test_struct_char_double(); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 4500); + local_copy_with_convertor( pdt, 4500, 12 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 12 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( ">>--------------------------------------------<<\n" ); + pdt = test_create_twice_two_doubles(); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_ddt_count(pdt, 4500); + local_copy_with_convertor( pdt, 4500, 12 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 12 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( ">>--------------------------------------------<<\n" ); + pdt = test_create_blacs_type(); + if( outputFlags & CHECK_PACK_UNPACK ) { + ompi_datatype_dump( pdt ); + local_copy_ddt_count(pdt, 2); + local_copy_ddt_count(pdt, 
4500); + local_copy_with_convertor( pdt, 4500, 956 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 956 ); + local_copy_with_convertor( pdt, 4500, 16*1024 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 16*1024 ); + local_copy_with_convertor( pdt, 4500, 64*1024 ); + local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 64*1024 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + printf( ">>--------------------------------------------<<\n" ); + pdt1 = test_create_blacs_type1( &ompi_mpi_int.dt ); + pdt2 = test_create_blacs_type2( &ompi_mpi_int.dt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + local_copy_with_convertor_2datatypes( pdt1, 1, pdt2, 1, 100 ); + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt1 ); assert( pdt1 == NULL ); + OBJ_RELEASE( pdt2 ); assert( pdt2 == NULL ); +*/ + /* clean-ups all data allocations */ + ompi_datatype_finalize(); + + return OMPI_SUCCESS; +} diff --git a/test/datatype/ddt_lib.c b/test/datatype/ddt_lib.c index 9170da0914a..a96ec085ddd 100644 --- a/test/datatype/ddt_lib.c +++ b/test/datatype/ddt_lib.c @@ -358,14 +358,28 @@ ompi_datatype_t* upper_matrix( unsigned int mat_size ) disp = (int*)malloc( sizeof(int) * mat_size ); blocklen = (int*)malloc( sizeof(int) * mat_size ); - + for( i = 0; i < mat_size; i++ ) { disp[i] = i * mat_size + i; blocklen[i] = mat_size - i; } - + /*int ct = 0; + for (i = 0; i < mat_size; i++) { + blocklen[i] = mat_size - ct*160; + disp[i] = i*mat_size + ct*160; + if (i % 160 == 0 && i != 0) { + ct++; + } + }*/ +#if defined (TEST_DOUBLE) ompi_datatype_create_indexed( mat_size, blocklen, disp, &ompi_mpi_double.dt, &upper ); +#elif defined (TEST_FLOAT) + ompi_datatype_create_indexed( mat_size, blocklen, disp, &ompi_mpi_float.dt, &upper ); +#elif defined (TEST_CHAR) + ompi_datatype_create_indexed( mat_size, blocklen, disp, &ompi_mpi_char.dt, &upper ); +#else +#endif 
ompi_datatype_commit( &upper ); if( outputFlags & DUMP_DATA_AFTER_COMMIT ) { ompi_datatype_dump( upper ); @@ -686,3 +700,26 @@ ompi_datatype_t* create_vector_type( const ompi_datatype_t* data, int count, int return vector; } +ompi_datatype_t* create_struct_type(int count) +{ + ompi_datatype_t* dt_struct; + ompi_datatype_t* dt_struct_vector; + ompi_datatype_t* oldtypes[2]; + MPI_Aint offsets[2], extent, lb; + int blockcounts[2]; + + offsets[0] = 0; + oldtypes[0] = MPI_FLOAT; + blockcounts[0] = 4; + + ompi_datatype_get_extent(MPI_FLOAT, &lb, &extent); + offsets[1] = 4 * extent; + oldtypes[1] = MPI_DOUBLE; + blockcounts[1] = 2; + + ompi_datatype_create_struct( 2, blockcounts, offsets, oldtypes, &dt_struct ); + dt_struct_vector = create_vector_type( dt_struct, 10, 2, 4 ); + ompi_datatype_commit( &dt_struct_vector ); + return dt_struct_vector; +} + diff --git a/test/datatype/ddt_lib.h b/test/datatype/ddt_lib.h index d94690047a7..0f6bbc2cb37 100644 --- a/test/datatype/ddt_lib.h +++ b/test/datatype/ddt_lib.h @@ -34,6 +34,11 @@ #define DUMP_DATA_AFTER_COMMIT 0x00000001 #define CHECK_PACK_UNPACK 0x00000002 +#define TEST_DOUBLE +//#define TEST_FLOAT +//#define TEST_CHAR + + extern uint32_t outputFlags; /** @@ -92,4 +97,5 @@ extern ompi_datatype_t* create_contiguous_type( const ompi_datatype_t* data, int extern ompi_datatype_t* create_vector_type( const ompi_datatype_t* data, int count, int length, int stride ); extern ompi_datatype_t* create_struct_constant_gap_resized_ddt( ompi_datatype_t* type ); +extern ompi_datatype_t* create_struct_type(int count); diff --git a/test/datatype/ddt_test.c b/test/datatype/ddt_test.c index 0afac9b49ec..ae72785b86c 100644 --- a/test/datatype/ddt_test.c +++ b/test/datatype/ddt_test.c @@ -30,6 +30,15 @@ #include #include +#define DDT_TEST_CUDA + +#if defined (DDT_TEST_CUDA) +#include +#include "opal/mca/common/cuda/common_cuda.h" +#include "opal/runtime/opal_params.h" +#define CONVERTOR_CUDA 0x00400000 +#endif + /* Compile with: mpicc 
-DHAVE_CONFIG_H -I. -I../../include -I../../../ompi-trunk/include -I../.. -I../../include -I../../../ompi-trunk/opal -I../../../ompi-trunk/orte -I../../../ompi-trunk/ompi -g ddt_test.c -o ddt_test */ @@ -171,12 +180,64 @@ static int local_copy_ddt_count( ompi_datatype_t* pdt, int count ) return OMPI_SUCCESS; } +static void fill_vectors(double* vp, int itera, int contig, int gap) +{ + int i, j; + for (i = 0; i < itera-1; i++ ){ + for (j = i*gap; j < (i+1)*gap; j++) { + if (j >= i*gap && j < i*gap+contig) { + vp[j] = 1.0; + } else { + vp[j] = 0.0; + } + } + } + for (i = (itera-1)*gap; i < (itera-1)*gap+contig; i++) { + vp[i] = 1.0; + } + + // printf("vector generated:\n"); + // for (i = 0; i < (itera-1)*gap+contig; i++) { + // printf("%1.f ", vp[i]); + // } + // printf("\n"); +} + +static void verify_vectors(double *vp, int itera, int contig, int gap) +{ + int i, j; + int error = 0; + for (i = 0; i < itera-1; i++) { + for (j = i*gap; j < (i+1)*gap; j++) { + if (j >= i*gap && j < i*gap+contig) { + if (vp[j] != 1.0) { + error ++; + } + } + } + } + for (i = (itera-1)*gap; i < (itera-1)*gap+contig; i++) { + if (vp[i] != 1.0) { + error ++; + } + } + // printf("vector received:\n"); + // for (i = 0; i < (itera-1)*gap+contig; i++) { + // printf("%1.f ", vp[i]); + // } + if (error != 0) { + printf("%d error is found\n", error); + } else { + printf("no error is found\n"); + } +} + static int local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count, ompi_datatype_t* recv_type, int recv_count, - int chunk ) + int chunk, int itera, int contig, int gap ) { - void *pdst = NULL, *psrc = NULL, *ptemp = NULL; + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; struct iovec iov; uint32_t iov_count; @@ -188,6 +249,40 @@ local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count rlength = compute_buffer_length(recv_type, recv_count); slength = 
compute_buffer_length(send_type, send_count); + +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, slength); + printf("cudamallochost phost %p\n", phost); +#else pdst = malloc( rlength ); psrc = malloc( slength ); ptemp = malloc( chunk ); @@ -196,13 +291,31 @@ local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count for( size_t i = 0; i < slength; i++ ) ((char*)psrc)[i] = i % 128 + 32; memset(pdst, 0, rlength); +#endif + +#if defined (DDT_TEST_CUDA) + if (itera > 0) { + fill_vectors((double *)phost, itera, contig, gap); + } + cudaMemcpy(psrc, phost, slength, cudaMemcpyHostToDevice); +#else + if (itera > 0) { + fill_vectors(psrc, itera, contig, gap); + } +#endif send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { printf( "Unable to create the send convertor. 
Is the datatype committed ?\n" ); goto clean_and_return; } recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); goto clean_and_return; @@ -242,6 +355,18 @@ local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count printf( "copying different data-types using convertors in %ld microsec\n", total_time ); printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, slength); + cudaMemcpy(phost, pdst, rlength, cudaMemcpyDeviceToHost); + if (itera > 0) { + verify_vectors((double *)phost, itera, contig, gap); + } +#else + if (itera > 0) { + verify_vectors((double *)pdst, itera, contig, gap); + } +#endif clean_and_return: if( send_convertor != NULL ) { OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); @@ -249,15 +374,25 @@ local_copy_with_convertor_2datatypes( ompi_datatype_t* send_type, int send_count if( recv_convertor != NULL ) { OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); } +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else if( NULL != pdst ) free( pdst ); if( NULL != psrc ) free( psrc ); if( NULL != ptemp ) free( ptemp ); +#endif return OMPI_SUCCESS; } -static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk ) +static int +local_copy_with_convertor_2datatypes_struct( ompi_datatype_t* send_type, int send_count, + ompi_datatype_t* recv_type, int recv_count, + int chunk, int count) { - void *pdst = NULL, *psrc = NULL, *ptemp = NULL; + void *pdst = NULL, *psrc = NULL, *ptemp = 
NULL, *phost = NULL; opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; struct iovec iov; uint32_t iov_count; @@ -265,23 +400,317 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk int32_t length = 0, done1 = 0, done2 = 0; TIMER_DATA_TYPE start, end, unpack_start, unpack_end; long total_time, unpack_time = 0; + size_t slength, rlength; - max_data = compute_buffer_length(pdt, count); + rlength = compute_buffer_length(recv_type, recv_count); + slength = compute_buffer_length(send_type, send_count); - pdst = malloc(max_data); - psrc = malloc(max_data); - ptemp = malloc(chunk); +#if defined (DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, slength); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, rlength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, rlength); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, slength); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, slength); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc( rlength ); + psrc = malloc( slength ); + ptemp = malloc( chunk ); + + /* initialize the buffers to prevent valgrind from complaining */ + for( size_t i = 0; i < slength; i++ ) + ((char*)psrc)[i] = i % 128 + 32; + memset(pdst, 0, rlength); +#endif + +#if defined (DDT_TEST_CUDA) + cudaMemcpy(psrc, phost, slength, cudaMemcpyHostToDevice); +#else + +#endif + + send_convertor = 
opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(send_type->super), send_count, psrc ) ) { + printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif + if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(recv_type->super), recv_count, pdst ) ) { + printf( "Unable to create the recv convertor. Is the datatype committed ?\n" ); + goto clean_and_return; + } + + cache_trash(); /* make sure the cache is useless */ + + GET_TIME( start ); + while( (done1 & done2) != 1 ) { + /* They are supposed to finish in exactly the same time. */ + if( done1 | done2 ) { + printf( "WRONG !!! the send is %s but the receive is %s in local_copy_with_convertor_2datatypes\n", + (done1 ? "finish" : "not finish"), + (done2 ? 
"finish" : "not finish") ); + } + + max_data = chunk; + iov_count = 1; + iov.iov_base = ptemp; + iov.iov_len = chunk; + + if( done1 == 0 ) { + done1 = opal_convertor_pack( send_convertor, &iov, &iov_count, &max_data ); + } + + if( done2 == 0 ) { + GET_TIME( unpack_start ); + done2 = opal_convertor_unpack( recv_convertor, &iov, &iov_count, &max_data ); + GET_TIME( unpack_end ); + unpack_time += ELAPSED_TIME( unpack_start, unpack_end ); + } + + length += max_data; + } + GET_TIME( end ); + total_time = ELAPSED_TIME( start, end ); + printf( "copying different data-types using convertors in %ld microsec\n", total_time ); + printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, + total_time - unpack_time ); + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, slength); + cudaMemcpy(phost, pdst, rlength, cudaMemcpyDeviceToHost); + +#else + +#endif + clean_and_return: + if( send_convertor != NULL ) { + OBJ_RELEASE( send_convertor ); assert( send_convertor == NULL ); + } + if( recv_convertor != NULL ) { + OBJ_RELEASE( recv_convertor ); assert( recv_convertor == NULL ); + } +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else + if( NULL != pdst ) free( pdst ); + if( NULL != psrc ) free( psrc ); + if( NULL != ptemp ) free( ptemp ); +#endif + return OMPI_SUCCESS; +} + + +static void fill_upper_matrix(void *matt, int msize) +{ + int i, j, start, end; + int *blklens, *displs; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + blklens = (int *)malloc(sizeof(int)*msize); + displs = (int *)malloc(sizeof(int)*msize); + for (i = 0; i < msize; i++) { + blklens[i] = msize - i; + displs[i] = i*msize + i; + } + for (i = 0; i < msize; i++) { + start = displs[i]; + 
end = start + blklens[i]; + for (j = start; j < end; j++) { +#if defined (TEST_CHAR) + mat[j] = 'a'; +#else + mat[j] = 0.0 + i; +#endif + } + } + free(blklens); + free(displs); + + // printf("matrix generate\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } +} + +static void verify_mat_result(void *matt, int msize) +{ + int *blklens, *displs; + int i, j, error = 0; + int start, end; +#if defined (TEST_DOUBLE) + double *mat = (double *)matt; +#elif defined (TEST_FLOAT) + float *mat = (float *)matt; +#elif defined (TEST_CHAR) + char *mat = (char *)matt; +#else + void *mat = matt; +#endif + + blklens = (int *)malloc(sizeof(int)*msize); + displs = (int *)malloc(sizeof(int)*msize); + for (i = 0; i < msize; i++) { + blklens[i] = msize - i; + displs[i] = i*msize + i; + } + for (i = 0; i < msize; i++) { + start = displs[i]; + end = start + blklens[i]; + for (j = start; j < end; j++) { +#if defined (TEST_CHAR) + if (mat[j] != 'a') { +#else + if (mat[j] != (0.0+i)) { +#endif + error ++; + } + } + } + free(blklens); + free(displs); + + // printf("matrix received\n"); + // for (i = 0; i < msize; i++) { + // for (j = 0; j < msize; j++) { + // printf(" %1.f ", mat[i*msize+j]); + // } + // printf("\n"); + // } + + if (error != 0) { + printf("error is found %d\n", error); + } else { + printf("no error is found\n"); + } +} + +static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk, int msize ) +{ + void *pdst = NULL, *psrc = NULL, *ptemp = NULL, *phost = NULL; + opal_convertor_t *send_convertor = NULL, *recv_convertor = NULL; + struct iovec iov; + uint32_t iov_count; + size_t max_data, dt_length; + int32_t length = 0, done1 = 0, done2 = 0; + TIMER_DATA_TYPE start, end, unpack_start, unpack_end; + long total_time, unpack_time = 0; + + dt_length = compute_buffer_length(pdt, count); + printf("length %lu\n", dt_length); + + cudaSetDevice(1); + +#if defined 
(DDT_TEST_CUDA) + cudaError_t error = cudaMalloc((void **)&psrc, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(psrc, 0, dt_length); + printf("cudamalloc psrc %p\n", psrc); + + error = cudaMalloc((void **)&pdst, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + cudaMemset(pdst, 0, dt_length); + printf("cudamalloc pdst %p\n", pdst); + + error = cudaMallocHost((void **)&ptemp, chunk); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(ptemp, 0, chunk); + printf("cudamallochost ptemp %p\n", ptemp); + + error = cudaMallocHost((void **)&phost, dt_length); + if ( error != cudaSuccess) { + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + memset(phost, 0, dt_length); + printf("cudamallochost phost %p\n", phost); +#else + pdst = malloc(dt_length); + psrc = malloc(dt_length); + ptemp = malloc(chunk); + for( size_t i = 0; i < dt_length; ((char*)psrc)[i] = i % 128 + 32, i++ ); memset( pdst, 0, dt_length ); +#endif + +#if defined (DDT_TEST_CUDA) + if (msize > 0) { + fill_upper_matrix(phost, msize); + } + cudaMemcpy(psrc, phost, dt_length, cudaMemcpyHostToDevice); +#else + if (msize > 0) { + fill_upper_matrix(psrc, msize); + } +#endif send_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + send_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_send( send_convertor, &(pdt->super), count, psrc ) ) { printf( "Unable to create the send convertor. Is the datatype committed ?\n" ); goto clean_and_return; } recv_convertor = opal_convertor_create( remote_arch, 0 ); +#if defined (DDT_TEST_CUDA) + recv_convertor->flags |= CONVERTOR_CUDA; +#endif if( OPAL_SUCCESS != opal_convertor_prepare_for_recv( recv_convertor, &(pdt->super), count, pdst ) ) { printf( "Unable to create the recv convertor. 
Is the datatype committed ?\n" ); goto clean_and_return; @@ -321,13 +750,32 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk printf( "copying same data-type using convertors in %ld microsec\n", total_time ); printf( "\t unpack in %ld microsec [pack in %ld microsec]\n", unpack_time, total_time - unpack_time ); - clean_and_return: + +#if defined (DDT_TEST_CUDA) + memset(phost, 0, dt_length); + cudaMemcpy(phost, pdst, dt_length, cudaMemcpyDeviceToHost); + if (msize > 0) { + verify_mat_result(phost, msize); + } +#else + if (msize > 0) { + verify_mat_result(pdst, msize); + } +#endif +clean_and_return: if( NULL != send_convertor ) OBJ_RELEASE( send_convertor ); if( NULL != recv_convertor ) OBJ_RELEASE( recv_convertor ); +#if defined (DDT_TEST_CUDA) + if( NULL != pdst ) cudaFree( pdst ); + if( NULL != psrc ) cudaFree( psrc ); + if( NULL != ptemp ) cudaFreeHost( ptemp ); + if( NULL != phost ) cudaFreeHost( phost ); +#else if( NULL != pdst ) free( pdst ); if( NULL != psrc ) free( psrc ); if( NULL != ptemp ) free( ptemp ); +#endif return OMPI_SUCCESS; } @@ -341,16 +789,22 @@ static int local_copy_with_convertor( ompi_datatype_t* pdt, int count, int chunk int main( int argc, char* argv[] ) { ompi_datatype_t *pdt, *pdt1, *pdt2, *pdt3; - int rc, length = 500; + int rc, length = 500, i; +#if defined (DDT_TEST_CUDA) + opal_cuda_support = 1; +#endif opal_init_util(&argc, &argv); +#if defined (DDT_TEST_CUDA) + // mca_common_cuda_stage_one_init(); +#endif ompi_datatype_init(); /** * By default simulate homogeneous architectures. 
*/ remote_arch = opal_local_arch; - printf( "\n\n#\n * TEST INVERSED VECTOR\n #\n\n" ); +/* printf( "\n\n#\n * TEST INVERSED VECTOR\n #\n\n" ); pdt = create_inversed_vector( &ompi_mpi_int.dt, 10 ); if( outputFlags & CHECK_PACK_UNPACK ) { local_copy_ddt_count(pdt, 100); @@ -364,15 +818,25 @@ int main( int argc, char* argv[] ) local_copy_with_convertor(pdt, 1, 956); } OBJ_RELEASE( pdt ); assert( pdt == NULL ); - +*/ + printf("\n TEST STRUCT \n"); + pdt = create_struct_type(5); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 1; i <= 1; i++) { + // local_copy_with_convertor_2datatypes_struct(pdt, 1, pdt, 1, 1024*1024*100, 5); + } + } + printf( "\n\n#\n * TEST UPPER TRIANGULAR MATRIX (size 100)\n #\n\n" ); - pdt = upper_matrix(100); + pdt = upper_matrix(4000); if( outputFlags & CHECK_PACK_UNPACK ) { - local_copy_ddt_count(pdt, 1); - local_copy_with_convertor(pdt, 1, 48); + for (i = 1; i <= 4; i++) { +// local_copy_ddt_count(pdt, 1); + local_copy_with_convertor(pdt, 1, 1024*1024*200, 4000); + } } OBJ_RELEASE( pdt ); assert( pdt == NULL ); - + /* mpich_typeub(); mpich_typeub2(); mpich_typeub3(); @@ -401,7 +865,6 @@ int main( int argc, char* argv[] ) ompi_datatype_create_contiguous(0, &ompi_mpi_datatype_null.dt, &pdt1); ompi_datatype_create_contiguous(0, &ompi_mpi_datatype_null.dt, &pdt2); ompi_datatype_create_contiguous(0, &ompi_mpi_datatype_null.dt, &pdt3); - ompi_datatype_add( pdt3, &ompi_mpi_int.dt, 10, 0, -1 ); ompi_datatype_add( pdt3, &ompi_mpi_float.dt, 5, 10 * sizeof(int), -1 ); @@ -427,7 +890,6 @@ int main( int argc, char* argv[] ) OBJ_RELEASE( pdt1 ); assert( pdt1 == NULL ); OBJ_RELEASE( pdt2 ); assert( pdt2 == NULL ); OBJ_RELEASE( pdt3 ); assert( pdt3 == NULL ); - printf( ">>--------------------------------------------<<\n" ); printf( " Contiguous data-type (MPI_DOUBLE)\n" ); pdt = MPI_DOUBLE; @@ -476,26 +938,104 @@ int main( int argc, char* argv[] ) local_copy_with_convertor( pdt, 4500, 12 ); local_copy_with_convertor_2datatypes( pdt, 4500, pdt, 4500, 12 
); OBJ_RELEASE( pdt ); assert( pdt == NULL ); + }*/ + printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (4000 times 512 double stride 640)\n" ); +#if 0 + pdt = create_vector_type( MPI_DOUBLE, 4000, 512, 640 ); + opal_datatype_resize(&pdt->super, 0, pdt->super.size+sizeof(MPI_DOUBLE)); + ompi_datatype_create_contiguous( 1, pdt, &pdt1 ); +#else + pdt = create_vector_type( MPI_DOUBLE, 4000, 512, 640 ); + // opal_datatype_resize(&pdt->super, 0, pdt->super.size+sizeof(MPI_DOUBLE)); + // ompi_datatype_create_contiguous( 4000, pdt, &pdt1 ); +#endif +// ompi_datatype_dump( pdt ); + // ompi_datatype_commit(&pdt1); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*30 ); + } } printf( ">>--------------------------------------------<<\n" ); + printf( "Vector data-type (4000 times 384 double stride 512)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 4000, 384, 512 ); + opal_datatype_resize(&pdt->super, 0, pdt->super.size+sizeof(MPI_DOUBLE)); + ompi_datatype_create_contiguous( 1, pdt, &pdt1 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 1; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*20 , 4000, 384, 512); + } + } printf( ">>--------------------------------------------<<\n" ); - printf( "Vector data-type (450 times 10 double stride 11)\n" ); - pdt = create_vector_type( MPI_DOUBLE, 450, 10, 11 ); - ompi_datatype_dump( pdt ); + OBJ_RELEASE( pdt ); assert( pdt == 
NULL ); + + printf( "Vector data-type (4000 times 256 double stride 384)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 4000, 256, 384 ); +// ompi_datatype_dump( pdt ); if( outputFlags & CHECK_PACK_UNPACK ) { - local_copy_ddt_count(pdt, 1); - local_copy_with_convertor( pdt, 1, 12 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); - local_copy_with_convertor( pdt, 1, 82 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); - local_copy_with_convertor( pdt, 1, 6000 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); - local_copy_with_convertor( pdt, 1, 36000 ); - local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 36000 ); + for (i = 0; i < 1; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*2000, 4000, 256, 384 ); + } } printf( ">>--------------------------------------------<<\n" ); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - + + printf( "Vector data-type (4000 times 128 double stride 256)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 4000, 128, 256 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // 
local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*5 ); + } + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + + printf( "Vector data-type (2000 times 3 double stride 4)\n" ); + pdt = create_vector_type( MPI_DOUBLE, 2000, 3, 4 ); +// ompi_datatype_dump( pdt ); + if( outputFlags & CHECK_PACK_UNPACK ) { + for (i = 0; i < 10; i++) { + // local_copy_ddt_count(pdt, 1); + // local_copy_with_convertor( pdt, 1, 12 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 12 ); + // local_copy_with_convertor( pdt, 1, 82 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 82 ); + // local_copy_with_convertor( pdt, 1, 6000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 6000 ); + // local_copy_with_convertor( pdt, 1, 36000 ); + // local_copy_with_convertor_2datatypes( pdt, 1, pdt, 1, 1024*1024*4 ); + } + } + printf( ">>--------------------------------------------<<\n" ); + OBJ_RELEASE( pdt ); assert( pdt == NULL ); + /* printf( ">>--------------------------------------------<<\n" ); pdt = test_struct_char_double(); if( outputFlags & CHECK_PACK_UNPACK ) { @@ -515,7 +1055,6 @@ int main( int argc, char* argv[] ) } printf( ">>--------------------------------------------<<\n" ); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( ">>--------------------------------------------<<\n" ); pdt = test_create_blacs_type(); if( outputFlags & CHECK_PACK_UNPACK ) { @@ -531,7 +1070,6 @@ int main( int argc, char* argv[] ) } printf( ">>--------------------------------------------<<\n" ); OBJ_RELEASE( pdt ); assert( pdt == NULL ); - printf( ">>--------------------------------------------<<\n" ); pdt1 = test_create_blacs_type1( &ompi_mpi_int.dt ); pdt2 = test_create_blacs_type2( &ompi_mpi_int.dt ); @@ -541,7 +1079,7 @@ int main( int argc, char* argv[] ) printf( ">>--------------------------------------------<<\n" ); OBJ_RELEASE( pdt1 ); assert( pdt1 == NULL ); OBJ_RELEASE( pdt2 ); assert( 
pdt2 == NULL ); - +*/ /* clean-ups all data allocations */ ompi_datatype_finalize();