From bd8c4cc0579f4fc728be7eee8816674c9f5ae2fa Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Sun, 4 Aug 2024 21:50:59 -0700 Subject: [PATCH 01/18] Allow JDF with no dependencies, no datatype and no arenas. Signed-off-by: George Bosilca --- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 37f7e5f10..f1fffff4d 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -4548,11 +4548,14 @@ static void jdf_generate_destructor( const jdf_t *jdf ) " free(__parsec_tp->super.super.task_classes_array); __parsec_tp->super.super.task_classes_array = NULL;\n" " __parsec_tp->super.super.nb_task_classes = 0;\n" "\n" + "#if PARSEC_%s_ADT_IDX_MAX > 0\n" " for(i = 0; i < (uint32_t)__parsec_tp->super.arenas_datatypes_size; i++) {\n" " if( NULL != __parsec_tp->super.arenas_datatypes[i].arena ) {\n" " PARSEC_OBJ_RELEASE(__parsec_tp->super.arenas_datatypes[i].arena);\n" " }\n" - " }\n"); + " }\n" + "#endif /* PARSEC_%s_ADT_IDX_MAX > 0 */", + jdf_basename, jdf_basename); coutput(" /* Destroy the data repositories for this object */\n"); for( f = jdf->functions; NULL != f; f = f->next ) { From 3e9e84d7facc7901be5d7248b6e6b3c24223d6cc Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 31 Jul 2024 22:43:26 -0700 Subject: [PATCH 02/18] Add a CUDA-based RTT test. This allows to check if the data can be send and received directly to and from GPU buffers. 
Signed-off-by: George Bosilca --- tests/runtime/cuda/CMakeLists.txt | 7 +- tests/runtime/cuda/rtt.jdf | 26 ++++ tests/runtime/cuda/rtt_main.c | 225 ++++++++++++++++++++++++++++++ 3 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 tests/runtime/cuda/rtt.jdf create mode 100644 tests/runtime/cuda/rtt_main.c diff --git a/tests/runtime/cuda/CMakeLists.txt b/tests/runtime/cuda/CMakeLists.txt index fbb5a5022..fdfbf9f89 100644 --- a/tests/runtime/cuda/CMakeLists.txt +++ b/tests/runtime/cuda/CMakeLists.txt @@ -22,7 +22,12 @@ if(PARSEC_HAVE_CUDA) endif( NOT TARGET CUDA::cublas ) # Testing for geting best device - parsec_addtest_executable(C testing_get_best_device SOURCES "testing_get_best_device.c") + parsec_addtest_executable(C testing_get_best_device SOURCES testing_get_best_device.c) target_include_directories(testing_get_best_device PRIVATE $<$:${CMAKE_CURRENT_SOURCE_DIR}>) target_ptg_sources(testing_get_best_device PRIVATE "get_best_device_check.jdf") + + # Testing for communications to and from GPU + parsec_addtest_executable(C cuda_rtt SOURCES rtt.c rtt_main.c) + target_include_directories(cuda_rtt PRIVATE $<$:${CMAKE_CURRENT_SOURCE_DIR}>) + target_ptg_sources(cuda_rtt PRIVATE "rtt.jdf") endif(PARSEC_HAVE_CUDA) diff --git a/tests/runtime/cuda/rtt.jdf b/tests/runtime/cuda/rtt.jdf new file mode 100644 index 000000000..28fe93176 --- /dev/null +++ b/tests/runtime/cuda/rtt.jdf @@ -0,0 +1,26 @@ +extern "C" %{ + /* This simple example does not need to include anything */ +%} + +%option no_taskpool_instance = true /* can be anything */ + +NT +WS + +PING(k) + +k = 0 .. NT-1 +: A(0, k % WS) + +RW T <- (k == 0) ? A(0, k % WS) : T PING(k-1) + -> (k < NT) ? 
T PING(k+1) : A(0, k % WS) + +; 0 + +BODY [type = CUDA] + printf("ping(%d)\n", k); + /* + int r; MPI_Comm_rank(MPI_COMM_WORLD, &r); + printf("%d: PING(%d)\n", r, k); + */ +END diff --git a/tests/runtime/cuda/rtt_main.c b/tests/runtime/cuda/rtt_main.c new file mode 100644 index 000000000..e3c212907 --- /dev/null +++ b/tests/runtime/cuda/rtt_main.c @@ -0,0 +1,225 @@ +/** + * Copyright (c) 2019-2024 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + */ + +#include "parsec.h" +#include "parsec/data_distribution.h" +#include "parsec/data_dist/matrix/matrix.h" +#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h" +#include "parsec/include/parsec/execution_stream.h" +#include "parsec/utils/mca_param.h" + +#include "rtt.h" + +#if defined(DISTRIBUTED) +#include +#endif + #include + +static int nb_gpus = 1, gpu_mask = 0xff; +static int cuda_device_index_len = 0, *cuda_device_index = NULL; + +/** + * @brief init operator + * + * @param [in] es: execution stream + * @param [in] descA: tiled matrix date descriptor + * @param [inout] A: inout data + * @param [in] uplo: matrix shape + * @param [in] m: tile row index + * @param [in] n: tile column index + * @param [in] args: NULL + */ +static int matrix_init_ops(parsec_execution_stream_t *es, + const parsec_tiled_matrix_t *descA, + void *_A, parsec_matrix_uplo_t uplo, + int m, int n, void *args) +{ + memset(_A, 1, m*n); + + /* Address warning when compile */ +#if 1 + parsec_data_key_t key = descA->super.data_key((parsec_data_collection_t*)descA, m, n); + parsec_data_t* data = descA->super.data_of_key((parsec_data_collection_t*)descA, key); + parsec_advise_data_on_device(data, + cuda_device_index[m % cuda_device_index_len], + PARSEC_DEV_DATA_ADVICE_PREFERRED_DEVICE); +#endif + (void)es; (void)uplo;(void)n;(void)m;(void)args; + return 0; +} + +static void +__parsec_rtt_destructor(parsec_rtt_taskpool_t *rtt_tp) +{ + 
parsec_type_free(&(rtt_tp->arenas_datatypes[PARSEC_rtt_DEFAULT_ADT_IDX].opaque_dtt)); +} + +PARSEC_OBJ_CLASS_INSTANCE(parsec_rtt_taskpool_t, parsec_taskpool_t, + NULL, __parsec_rtt_destructor); + +parsec_taskpool_t *rtt_New(parsec_context_t *ctx, size_t size, int roundtrips) +{ + parsec_rtt_taskpool_t *tp = NULL; + parsec_datatype_t block; + size_t mb = sqrt(size), nb = size / mb; + + if (mb <= 0) { + fprintf(stderr, "To work, RTT must do at least one round time trip of at least one byte\n"); + return (parsec_taskpool_t *)tp; + } + + parsec_matrix_block_cyclic_t* dcA = (parsec_matrix_block_cyclic_t *)calloc(1, sizeof(parsec_matrix_block_cyclic_t)); + parsec_matrix_block_cyclic_init(dcA, PARSEC_MATRIX_BYTE, PARSEC_MATRIX_TILE, + ctx->my_rank, + mb, nb, + mb, ctx->nb_nodes * nb, + 0, 0, + mb, ctx->nb_nodes * nb, + 1, ctx->nb_nodes, 1, 1, + 0, 0); + dcA->mat = parsec_data_allocate((size_t)dcA->super.nb_local_tiles * + (size_t)dcA->super.bsiz * + (size_t)parsec_datadist_getsizeoftype(dcA->super.mtype)); + parsec_data_collection_set_key((parsec_data_collection_t *)dcA, "A"); + + /* Initialize and place the dcA */ + parsec_apply(ctx, PARSEC_MATRIX_FULL, + (parsec_tiled_matrix_t *)dcA, + (parsec_tiled_matrix_unary_op_t)matrix_init_ops, NULL); + + tp = parsec_rtt_new((parsec_data_collection_t*)dcA, roundtrips, ctx->nb_nodes); + + ptrdiff_t lb, extent; + parsec_type_create_contiguous(mb*nb, parsec_datatype_uint8_t, &block); + parsec_type_extent(block, &lb, &extent); + + parsec_arena_datatype_construct(&tp->arenas_datatypes[PARSEC_rtt_DEFAULT_ADT_IDX], + extent, PARSEC_ARENA_ALIGNMENT_SSE, + block); + return (parsec_taskpool_t *)tp; +} + +int main(int argc, char *argv[]) +{ + parsec_context_t *parsec = NULL; + parsec_taskpool_t *tp; + int size = 1, rank = 0, loops = 100, frags = 1, nb_runs = 1, cores = 2, do_sleep = 0, ch, use_opt = 1; + struct timeval tstart, tend; + size_t msg_size = 8*1024; + double t, bw; + + while ((ch = getopt(argc, argv, "c:g:G:l:f:m:n:s:")) != -1) { 
+ switch (ch) { + case 'c': cores = atoi(optarg); use_opt += 2; break; + case 'g': nb_gpus = atoi(optarg); use_opt += 2; break; + case 'G': gpu_mask = atoi(optarg); use_opt += 2; break; + case 'l': loops = atoi(optarg); use_opt += 2; break; + case 'f': frags = atoi(optarg); use_opt += 2; break; + case 'm': msg_size = (size_t)atoi(optarg); use_opt += 2; break; + case 'n': nb_runs = atoi(optarg); use_opt += 2; break; + case 's': do_sleep = atoi(optarg); use_opt += 2; break; + default: + fprintf(stderr, + "-c : number of cores to use (default 2)\n" + "-g : number of GPU to use (default 1)\n" + "-G : GPU mask to use (-1 to modulo rank per node)\n" + "-l : loops of bandwidth(default: 100)\n" + "-f : frags, number of fragments (default: 1)\n" + "-m : size, size of message (default: 1024 * 8)\n" + "-n : number of runs (default: 1)\n" + "-s : number of seconds to sleep before running the tests\n" + "\n"); + exit(1); + } + } + /* Remove all options already acknowledged */ + if( NULL == argv[optind] ) { + argc = 1; + } else { + memcpy(&argv[1], &argv[use_opt+1], (argc - use_opt) * sizeof(char*)); + argc -= use_opt; + } + argv[argc] = NULL; +#if defined(DISTRIBUTED) +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + extern char **environ; + char *value; + asprintf(&value, "%d", nb_gpus); + parsec_setenv_mca_param("device_cuda_enabled", value, &environ); + free(value); + value = NULL; + if (0xFF != gpu_mask) { + asprintf(&value, "%d", gpu_mask); + parsec_setenv_mca_param("device_cuda_mask", value, &environ); + free(value); + value = NULL; + } +#endif + { + int provided; + MPI_Init_thread(NULL, NULL, MPI_THREAD_SERIALIZED, &provided); + } + MPI_Comm_size(MPI_COMM_WORLD, &size); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); +#endif /* DISTRIBUTED */ + if( 0 == rank ) { + printf("Running %d tests of %d steps RTT with a data of size %zu\n", + nb_runs, loops, msg_size); + } + parsec = parsec_init(cores, &argc, &argv); + + /* can the test run? 
*/ + nb_gpus = parsec_context_query(parsec, PARSEC_CONTEXT_QUERY_DEVICES, PARSEC_DEV_CUDA); + assert(nb_gpus >= 0); + if(nb_gpus == 0) { + parsec_warning("This test can only run if at least one GPU device is present"); + exit(-PARSEC_ERR_DEVICE); + } + if( do_sleep ) { + sleep(do_sleep); + } + cuda_device_index = (int *)malloc(parsec_nb_devices * sizeof(int)); + cuda_device_index_len = 0; + for (int dev = 0; dev < (int)parsec_nb_devices; dev++) { + parsec_device_module_t *device = parsec_mca_device_get(dev); + if (PARSEC_DEV_CUDA & device->type) { + cuda_device_index[cuda_device_index_len++] = device->device_index; + } + } + +#if defined(PARSEC_HAVE_MPI) + MPI_Barrier(MPI_COMM_WORLD); +#endif /* defined(PARSEC_HAVE_MPI) */ + gettimeofday(&tstart, NULL); + for( int test_id = 0; test_id < nb_runs; test_id++ ) { + tp = rtt_New(parsec, msg_size, loops); + if( NULL != tp ) { + parsec_context_add_taskpool(parsec, tp); + parsec_context_start(parsec); + parsec_context_wait(parsec); + parsec_taskpool_free(tp); + } + } +#if defined(PARSEC_HAVE_MPI) + MPI_Barrier(MPI_COMM_WORLD); +#endif /* defined(PARSEC_HAVE_MPI) */ + gettimeofday(&tend, NULL); + + if( 0 == rank ) { + t = ((tend.tv_sec - tstart.tv_sec) * 1000000.0 + (tend.tv_usec - tstart.tv_usec)) / 1000000.0; /* in seconds */ + double total_payload = (double)nb_runs * (double)loops * (double)msg_size / 1024.0 / 1024.0 / 1024.0; + bw = total_payload / t; + printf("%d\t%d\t%d\t%zu\t%08.4g s\t%4.8g GB/s\n", nb_runs, frags, loops, msg_size*sizeof(uint8_t), t, bw); + } + + free(cuda_device_index); cuda_device_index = NULL; + cuda_device_index_len = 0; + parsec_fini(&parsec); +#if defined(DISTRIBUTED) + MPI_Finalize(); +#endif /* DISTRIBUTED */ + return 0; +} From 6b9630f2b07ea0674e8c6a6818069daa465ee78d Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 31 Jul 2024 22:45:18 -0700 Subject: [PATCH 03/18] Transfer data to and from GPU. 
This is a multi-part patch that allows the CPU to prepare a data copy mapped onto a device. 1. The first question is how is such a device selected ? The allocation of such a copy happen way before the scheduler is invoked for a task, in fact before the task is even ready. Thus, we need to decide on the location of this copy only based on some static information, such as the task affinity. Therefore, this approach only works for owner-compute type of tasks, where the task will be executed on the device that owns the data used for the task affinity. 2. Pass the correct data copy across the entire system, instead of falling back to data copy of the device 0 (CPU memory) Add a configure option to enable GPU-aware communications. Signed-off-by: George Bosilca --- CMakeLists.txt | 3 + parsec/CMakeLists.txt | 1 + parsec/arena.c | 114 +++++-- parsec/arena.h | 10 +- parsec/data.c | 107 +++++-- parsec/data.h | 10 +- parsec/data_dist/matrix/map_operator.c | 4 +- .../matrix/two_dim_rectangle_cyclic.h | 1 - parsec/data_internal.h | 42 ++- parsec/include/parsec/parsec_options.h.in | 1 + .../interfaces/dtd/insert_function_internal.h | 10 +- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 44 ++- parsec/mca/device/device_gpu.c | 295 +++++++++++++----- parsec/mca/device/device_gpu.h | 2 +- parsec/parsec.c | 14 +- parsec/parsec_internal.h | 9 + parsec/parsec_mpi_funnelled.c | 29 ++ parsec/parsec_reshape.c | 24 +- parsec/remote_dep.c | 2 +- parsec/remote_dep.h | 5 + parsec/remote_dep_mpi.c | 92 ++++-- parsec/utils/zone_malloc.c | 5 +- parsec/utils/zone_malloc.h | 1 - tests/apps/haar_tree/project_dyn.jdf | 2 +- tests/runtime/cuda/rtt.jdf | 2 +- tests/runtime/cuda/rtt_main.c | 101 +++--- 26 files changed, 666 insertions(+), 264 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d05bf3e7f..504f9f211 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -170,6 +170,9 @@ option(PARSEC_DIST_WITH_MPI if(PARSEC_DIST_WITH_MPI AND 0) message(FATAL_ERROR "PARSEC_DIST_WITH_MPI and 
PARSEC_DIST_WITH_OTHER are mutually exclusive, please select only one") endif() +option(PARSEC_MPI_IS_GPU_AWARE + "Build PaRSEC assuming the MPI library is GPU-aware, aka. can move data directly to and from GPU memory.\ + As of today (mid 2024) while most MPI support such an option, they require a single process per GPU" ON) option(PARSEC_DIST_THREAD "Use an extra thread to progress the data movements" ON) option(PARSEC_DIST_PRIORITIES diff --git a/parsec/CMakeLists.txt b/parsec/CMakeLists.txt index eea8bd70a..237985bce 100644 --- a/parsec/CMakeLists.txt +++ b/parsec/CMakeLists.txt @@ -238,6 +238,7 @@ if( BUILD_PARSEC ) $<$:OTF2::OTF2> $<$:MPI::MPI_C> $<$:CUDA::cudart> + $<$:cuda> $<$:hip::host> ${EXTRA_LIBS} INTERFACE diff --git a/parsec/arena.c b/parsec/arena.c index ca7def080..73428930e 100644 --- a/parsec/arena.c +++ b/parsec/arena.c @@ -235,43 +235,109 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy, return PARSEC_SUCCESS; } -parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena, - size_t count, int device, - parsec_datatype_t dtt) +#include "parsec/utils/zone_malloc.h" +#include "mca/device/device_gpu.h" + +static inline parsec_data_copy_t * +parsec_arena_internal_copy_new(parsec_arena_t *arena, + parsec_data_t *data, + size_t count, int device, + parsec_datatype_t dtt) { - parsec_data_t *data; - parsec_data_copy_t *copy; - int rc; - - - data = parsec_data_new(); + parsec_data_copy_t *copy = NULL; + parsec_data_t* ldata = data; if( NULL == data ) { + ldata = parsec_data_new(); + if( NULL == ldata ) { + return NULL; + } + } + if( 0 == device ) { + copy = parsec_data_copy_new(ldata, device, dtt, + PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED | PARSEC_DATA_FLAG_ARENA); + if (NULL == copy) { + goto free_and_return; + } + int rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt); + if (PARSEC_SUCCESS != rc) { + goto free_and_return; + } + return copy; + } + /** + * This part is not really nice, 
it breaks the separation between devices, and how their memory is + * managed. But, it should give nice perfromance improvements if the communication layer is + * capable of sending or receiving data directly to and from the accelerator memory. The only drawback + * is that once the GPU memory is full, this will fail, so the soeftware will fall back to the + * prior behavior, going through the CPU memory. + * + * The zone deallocation is not symmetric, it will happen in the GPU management, when the data copies + * are released from the different LRU lists. + */ + parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t *)parsec_mca_device_get(device); + if (NULL == gpu_device) { return NULL; } + size_t size = count * arena->elem_size; + void* device_private = zone_malloc(gpu_device->memory, size); + if( NULL == device_private ) { + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed (out of memory)\n", + device, size, (void *)copy->arena_chunk); + goto free_and_return; + } + copy = parsec_data_copy_new(ldata, device, dtt, + PARSEC_DATA_FLAG_PARSEC_OWNED | PARSEC_DATA_FLAG_PARSEC_MANAGED); + if (NULL == copy) { + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p failed to allocate copy (out of memory)\n", + device, size, (void *)copy->arena_chunk); + zone_free(gpu_device->memory, device_private); + goto free_and_return; + } + copy->dtt = dtt; + copy->device_private = device_private; + copy->arena_chunk = (parsec_arena_chunk_t*)gpu_device->memory; + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Arena:\tallocate data copy on device %d of size %zu from zone %p, " + "data ptr %p", + device, size, (void*)copy->arena_chunk, (void*)copy->device_private); + copy->version = 0; + copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID; + copy->original->owner_device = device; + copy->original->preferred_device = device; + return copy; + 
free_and_return: + if( NULL != copy ) + PARSEC_OBJ_RELEASE(copy); + if( NULL == data) + PARSEC_OBJ_RELEASE(ldata); /* release the locally allocated data */ + return NULL; +} - copy = parsec_data_copy_new( data, device, dtt, - PARSEC_DATA_FLAG_ARENA | - PARSEC_DATA_FLAG_PARSEC_OWNED | - PARSEC_DATA_FLAG_PARSEC_MANAGED); +parsec_data_copy_t * +parsec_arena_get_new_copy(parsec_arena_t *arena, + size_t count, int device, + parsec_datatype_t dtt) +{ + parsec_data_copy_t *dev0_copy, *copy; - if(NULL == copy) { - PARSEC_OBJ_RELEASE(data); + dev0_copy = parsec_arena_internal_copy_new(arena, NULL, count, 0 /* first allocate the copy on the device 0 */, dtt); + if( NULL == dev0_copy ) { return NULL; } + dev0_copy->coherency_state = PARSEC_DATA_COHERENCY_INVALID; + dev0_copy->version = 0; /* start from somewhere */ + if( 0 == device ) { + return dev0_copy; + } - rc = parsec_arena_allocate_device_private(copy, arena, count, device, dtt); - + copy = parsec_arena_internal_copy_new(arena, dev0_copy->original, count, device, dtt); + if( NULL == copy ) { + copy = dev0_copy; /* return the main memory data copy */ + } /* This data is going to be released once all copies are released * It does not exist without at least a copy, and we don't give the * pointer to the user, so we must remove our retain from it */ - PARSEC_OBJ_RELEASE(data); - - if( PARSEC_SUCCESS != rc ) { - PARSEC_OBJ_RELEASE(copy); - return NULL; - } - + PARSEC_OBJ_RELEASE(dev0_copy->original); return copy; } diff --git a/parsec/arena.h b/parsec/arena.h index acf8697a0..17783b307 100644 --- a/parsec/arena.h +++ b/parsec/arena.h @@ -133,15 +133,15 @@ int parsec_arena_construct_ex(parsec_arena_t* arena, * enough resource to allocate a new data copy of this type. 
*/ -parsec_data_copy_t *parsec_arena_get_copy(parsec_arena_t *arena, - size_t count, int device, - parsec_datatype_t dtt); +parsec_data_copy_t *parsec_arena_get_new_copy(parsec_arena_t *arena, + size_t count, int device, + parsec_datatype_t dtt); /** * @brief Allocates memory for a given data copy. This is a function used by * DSLs to set the memory associated with a data copy they have created. - * It is also used by parsec_arena_get_copy. - * + * It is also used by parsec_arena_get_new_copy. + * * @param copy the (empty) data copy to allocate memory for. NB: the @p original * field of this data copy must be set. The operation overwrites the device * dtt and count of this data copy, as well as the device_private pointer. diff --git a/parsec/data.c b/parsec/data.c index fa4fe9d2a..4361e3b7e 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -15,6 +15,7 @@ #include "parsec/sys/atomic.h" #include "parsec/remote_dep.h" #include "parsec/parsec_internal.h" +#include "parsec/utils/zone_malloc.h" static parsec_lifo_t parsec_data_lifo; static parsec_lifo_t parsec_data_copies_lifo; @@ -70,6 +71,7 @@ static void parsec_data_construct(parsec_data_t* obj ) obj->preferred_device = -1; obj->key = 0; obj->nb_elts = 0; + obj->nb_copies = 0; for( uint32_t i = 0; i < parsec_nb_devices; obj->device_copies[i] = NULL, i++ ); obj->dc = NULL; @@ -104,11 +106,12 @@ static void parsec_data_destruct(parsec_data_t* obj ) * GPU copies are normally stored in LRU lists, and must be * destroyed by the release list to free the memory on the device */ - PARSEC_OBJ_RELEASE( copy ); + PARSEC_DATA_COPY_RELEASE(copy); } } assert(NULL == obj->device_copies[i]); } + assert(0 == obj->nb_copies); } PARSEC_OBJ_CLASS_INSTANCE(parsec_data_t, parsec_object_t, @@ -166,8 +169,8 @@ void parsec_data_delete(parsec_data_t* data) inline int parsec_data_copy_attach(parsec_data_t* data, - parsec_data_copy_t* copy, - uint8_t device) + parsec_data_copy_t* copy, + uint8_t device) { assert(NULL == copy->original); 
assert(NULL == copy->older); @@ -180,6 +183,7 @@ parsec_data_copy_attach(parsec_data_t* data, copy->older = NULL; return PARSEC_ERROR; } + parsec_atomic_fetch_add_int32(&data->nb_copies, 1); PARSEC_OBJ_RETAIN(data); return PARSEC_SUCCESS; } @@ -197,6 +201,7 @@ int parsec_data_copy_detach(parsec_data_t* data, return PARSEC_ERR_NOT_FOUND; } data->device_copies[device] = copy->older; + parsec_atomic_fetch_add_int32(&data->nb_copies, -1); copy->original = NULL; copy->older = NULL; @@ -226,7 +231,7 @@ parsec_data_copy_t* parsec_data_copy_new(parsec_data_t* data, uint8_t device, } copy->flags = flags; if( PARSEC_SUCCESS != parsec_data_copy_attach(data, copy, device) ) { - PARSEC_OBJ_RELEASE(copy); + PARSEC_DATA_COPY_RELEASE(copy); return NULL; } copy->dtt = dtt; @@ -335,6 +340,12 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data, copy = data->device_copies[device]; assert( NULL != copy ); + if( valid_copy == device ) { + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, + "DEV[%d]: already has ownership of data %p to copy %p in mode %d", + device, data, copy, access_mode); + goto bookkeeping; + } PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "DEV[%d]: start transfer ownership of data %p to copy %p in mode %d", device, data, copy, access_mode); @@ -422,6 +433,7 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data, } } + bookkeeping: if( PARSEC_FLOW_ACCESS_READ & access_mode ) { copy->readers++; } @@ -440,40 +452,52 @@ int parsec_data_start_transfer_ownership_to_copy(parsec_data_t* data, return valid_copy; } -static char dump_coherency_codex(parsec_data_coherency_t state) -{ - if( PARSEC_DATA_COHERENCY_INVALID == state ) return 'I'; - if( PARSEC_DATA_COHERENCY_OWNED == state ) return 'O'; - if( PARSEC_DATA_COHERENCY_EXCLUSIVE == state ) return 'E'; - if( PARSEC_DATA_COHERENCY_SHARED == state ) return 'S'; - return 'X'; -} - -void parsec_dump_data_copy(parsec_data_copy_t* copy) +void parsec_data_copy_dump(parsec_data_copy_t* copy) { - 
parsec_debug_verbose(0, 0, "- [%d]: copy %p state %c readers %d version %u\n", - (int)copy->device_index, copy, dump_coherency_codex(copy->coherency_state), copy->readers, copy->version); + char *tranfer = "---", flags[] = "----", *coherency = "undef"; + switch(copy->data_transfer_status) { + case PARSEC_DATA_STATUS_NOT_TRANSFER: tranfer = "no"; break; + case PARSEC_DATA_STATUS_UNDER_TRANSFER: tranfer = "yes"; break; + case PARSEC_DATA_STATUS_COMPLETE_TRANSFER: tranfer = "no"; break; + } + if (copy->flags & PARSEC_DATA_FLAG_ARENA) flags[0] = 'A'; + if (copy->flags & PARSEC_DATA_FLAG_TRANSIT) flags[1] = 'T'; + if (copy->flags & PARSEC_DATA_FLAG_PARSEC_MANAGED) flags[2] = 'M'; + if (copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) flags[3] = 'O'; + + if( PARSEC_DATA_COHERENCY_INVALID == copy->coherency_state ) coherency = "invalid"; + if( PARSEC_DATA_COHERENCY_OWNED == copy->coherency_state ) coherency = "owned"; + if( PARSEC_DATA_COHERENCY_EXCLUSIVE == copy->coherency_state ) coherency = "exclusive"; + if( PARSEC_DATA_COHERENCY_SHARED == copy->coherency_state ) coherency = "shared"; + + parsec_debug_verbose(0, 0, "%s [%d]: copy %p [ref %d] coherency %s readers %d version %u transit %s flags %s\n" + " older %p orig %p arena %p dev_priv %p\n", + ((NULL != copy->original) && (copy->original->owner_device == copy->device_index)) ? 
"*" : " ", + (int)copy->device_index, copy, copy->super.super.obj_reference_count, coherency, copy->readers, copy->version, tranfer, flags, + (void *)copy->older, (void *)copy->original, (void *)copy->arena_chunk, copy->device_private); } -void parsec_dump_data(parsec_data_t* data) +void parsec_data_dump(parsec_data_t* data) { - parsec_debug_verbose(0, 0, "data %p key %lu owner %d\n", data, data->key, data->owner_device); + parsec_debug_verbose(0, 0, "data %p [ref %d] key %lu owner dev %d pref dev %d copies %d dc %p [# elems %zu]\n", + data, data->super.obj_reference_count, data->key, data->owner_device, data->preferred_device, data->nb_copies, + (void*)data->dc, data->nb_elts); for( uint32_t i = 0; i < parsec_nb_devices; i++ ) { if( NULL != data->device_copies[i]) - parsec_dump_data_copy(data->device_copies[i]); + parsec_data_copy_dump(data->device_copies[i]); } } parsec_data_copy_t* parsec_data_get_copy(parsec_data_t* data, uint32_t device) { - return PARSEC_DATA_GET_COPY(data, device); + return PARSEC_DATA_GET_COPY(data, device); } void parsec_data_copy_release(parsec_data_copy_t* copy) { - /* TODO: Move the copy back to the CPU before destroying it */ + /* TODO: Move the copy back to the CPU before destroying it */ PARSEC_DATA_COPY_RELEASE(copy); } @@ -514,7 +538,7 @@ parsec_data_create( parsec_data_t **holder, if( !parsec_atomic_cas_ptr(holder, NULL, data) ) { parsec_data_copy_detach(data, data_copy, 0); - PARSEC_OBJ_RELEASE(data_copy); + PARSEC_DATA_COPY_RELEASE(data_copy); data = *holder; } } else { @@ -565,3 +589,42 @@ parsec_data_destroy( parsec_data_t *data ) #endif PARSEC_OBJ_RELEASE(data); } + +#include "parsec/utils/debug.h" + +int parsec_data_release_self_contained_data(parsec_data_t *data) +{ + if (data->super.obj_reference_count != data->nb_copies) return 0; + parsec_data_copy_t *copy; + PARSEC_DEBUG_VERBOSE(1, parsec_debug_output, "Examine the status of data %p with %d copies and refcounts at %s:%d\n", + data, data->nb_copies, __FILE__, __LINE__); 
+ /* this data is only referenced by it's own copies. If these copies are also only referenced by + * data, then we can release them all. + */ + for( uint32_t i = 0; i < parsec_nb_devices; i++) { + if (NULL == (copy = data->device_copies[i])) continue; + if( copy->super.super.obj_reference_count > 1 ) + return 0; + } + PARSEC_DEBUG_VERBOSE(1, parsec_debug_output, "Force the release of data %p at %s:%d", + copy, __FILE__, __LINE__); + for( uint32_t i = 0; i < parsec_nb_devices; i++) { + if (NULL == (copy = data->device_copies[i])) continue; + assert(1 == copy->super.super.obj_reference_count); + if( 0 == copy->device_index ) { + PARSEC_OBJ_RELEASE(copy); + assert(NULL == copy); + } else { + /* Do not release data copies that do not belong to the CPU or really bad things will happen. + * Only the device manager can release these copies, the best we can do here is to detach them + * from the data and eventually release their memory. + */ + parsec_data_copy_detach(data, copy, copy->device_index); + zone_free((zone_malloc_t *)copy->arena_chunk, copy->device_private); + copy->device_private = NULL; + copy->arena_chunk = NULL; + } + } + return 1; +} + diff --git a/parsec/data.h b/parsec/data.h index b4989fac2..873a66c34 100644 --- a/parsec/data.h +++ b/parsec/data.h @@ -31,9 +31,9 @@ typedef uint8_t parsec_data_coherency_t; #define PARSEC_DATA_COHERENCY_SHARED ((parsec_data_coherency_t)0x4) typedef uint8_t parsec_data_status_t; -#define PARSEC_DATA_STATUS_NOT_TRANSFER ((parsec_data_coherency_t)0x0) -#define PARSEC_DATA_STATUS_UNDER_TRANSFER ((parsec_data_coherency_t)0x1) -#define PARSEC_DATA_STATUS_COMPLETE_TRANSFER ((parsec_data_coherency_t)0x2) +#define PARSEC_DATA_STATUS_NOT_TRANSFER ((parsec_data_status_t)0x0) +#define PARSEC_DATA_STATUS_UNDER_TRANSFER ((parsec_data_status_t)0x1) +#define PARSEC_DATA_STATUS_COMPLETE_TRANSFER ((parsec_data_status_t)0x2) /** * Data copies have three levels of 'ownership': * - a data copy can be owned and managed by PaRSEC. 
@@ -128,8 +128,8 @@ PARSEC_DECLSPEC void parsec_data_end_transfer_ownership_to_copy(parsec_data_t* data, uint8_t device, uint8_t access_mode); -PARSEC_DECLSPEC void parsec_dump_data_copy(parsec_data_copy_t* copy); -PARSEC_DECLSPEC void parsec_dump_data(parsec_data_t* copy); +PARSEC_DECLSPEC void parsec_data_copy_dump(parsec_data_copy_t *copy); +PARSEC_DECLSPEC void parsec_data_dump(parsec_data_t* copy); PARSEC_DECLSPEC parsec_data_t * parsec_data_create( parsec_data_t **holder, diff --git a/parsec/data_dist/matrix/map_operator.c b/parsec/data_dist/matrix/map_operator.c index fe7846f8f..4bbafcab4 100644 --- a/parsec/data_dist/matrix/map_operator.c +++ b/parsec/data_dist/matrix/map_operator.c @@ -297,13 +297,13 @@ static int data_lookup(parsec_execution_stream_t *es, this_task->data[0].data_in = parsec_data_get_copy(src(m,n), 0); this_task->data[0].source_repo_entry = NULL; this_task->data[0].data_out = NULL; - PARSEC_OBJ_RETAIN(this_task->data[0].data_in); + PARSEC_DATA_COPY_RETAIN(this_task->data[0].data_in); } if( NULL != __tp->dest ) { this_task->data[1].data_in = parsec_data_get_copy(dest(m,n), 0); this_task->data[1].source_repo_entry = NULL; this_task->data[1].data_out = this_task->data[1].data_in; - PARSEC_OBJ_RETAIN(this_task->data[1].data_in); + PARSEC_DATA_COPY_RETAIN(this_task->data[1].data_in); } return PARSEC_HOOK_RETURN_DONE; } diff --git a/parsec/data_dist/matrix/two_dim_rectangle_cyclic.h b/parsec/data_dist/matrix/two_dim_rectangle_cyclic.h index 6dddc3ebd..a0c718b1f 100644 --- a/parsec/data_dist/matrix/two_dim_rectangle_cyclic.h +++ b/parsec/data_dist/matrix/two_dim_rectangle_cyclic.h @@ -45,7 +45,6 @@ typedef struct parsec_matrix_block_cyclic { * @param dc matrix description structure, already allocated, that will be initialize * @param mtype type of data used for this matrix * @param storage type of storage of data - * @param nodes number of nodes * @param myrank rank of the local node (as of mpi rank) * @param mb number of row in a tile * @param 
nb number of column in a tile diff --git a/parsec/data_internal.h b/parsec/data_internal.h index 85cc103ca..6f5edcdb9 100644 --- a/parsec/data_internal.h +++ b/parsec/data_internal.h @@ -19,6 +19,7 @@ #include "parsec/arena.h" #include "parsec/data.h" #include "parsec/class/parsec_future.h" +#include "parsec/utils/debug.h" /** * This structure is the keeper of all the information regarding @@ -30,11 +31,12 @@ struct parsec_data_s { parsec_atomic_lock_t lock; - parsec_data_key_t key; int8_t owner_device; int8_t preferred_device; /* Hint set from the MEMADVICE device API to define on * which device this data should be modified RW when there * are multiple choices. -1 means no preference. */ + int32_t nb_copies; /* How many valid copies are attached to this data */ + parsec_data_key_t key; struct parsec_data_collection_s* dc; size_t nb_elts; /* size in bytes of the memory layout */ struct parsec_data_copy_s *device_copies[]; /* this array allocated according to the number of devices @@ -87,22 +89,50 @@ PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_data_copy_t); #define PARSEC_DATA_GET_COPY(DATA, DEVID) \ ((DATA)->device_copies[(DEVID)]) + +int parsec_data_release_self_contained_data(parsec_data_t* data); /** * Decrease the refcount of this copy of the data. If the refcount reach * 0 the upper level is in charge of cleaning up and releasing all content * of the copy. 
*/ -#define PARSEC_DATA_COPY_RELEASE(DATA) \ +#if 0 +#define PARSEC_DATA_COPY_RELEASE(COPY) \ do { \ - PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Release data copy %p at %s:%d", (DATA), __FILE__, __LINE__); \ - PARSEC_OBJ_RELEASE((DATA)); \ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Release data copy %p at %s:%d", (COPY), __FILE__, __LINE__); \ + PARSEC_OBJ_RELEASE((COPY)); \ + if( (NULL != (COPY)) && (NULL != ((COPY)->original)) ) parsec_data_release_self_contained_data((COPY)->original); \ } while(0) +#define PARSEC_DATA_COPY_RETAIN(COPY) \ + do { \ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Retain data copy %p at %s:%d", (COPY), __FILE__, __LINE__); \ + PARSEC_OBJ_RETAIN((COPY)); \ + } while(0) +#else +static inline void __parsec_data_copy_release(parsec_data_copy_t** copy) +{ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Release data copy %p at %s:%d", *copy, __FILE__, __LINE__); + PARSEC_OBJ_RELEASE(*copy); + if ((NULL != *copy) && (NULL != (*copy)->original) && (1 == (*copy)->super.super.obj_reference_count)) + parsec_data_release_self_contained_data((*copy)->original); +} +#define PARSEC_DATA_COPY_RELEASE(COPY) \ + __parsec_data_copy_release(&(COPY)) + +static inline void __parsec_data_copy_retain(parsec_data_copy_t* copy) +{ + PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Retain data copy %p at %s:%d", copy, __FILE__, __LINE__); + PARSEC_OBJ_RETAIN(copy); +} +#define PARSEC_DATA_COPY_RETAIN(COPY) \ + __parsec_data_copy_retain((COPY)) +#endif /* 0 */ /** * Return the device private pointer for a datacopy. */ -#define PARSEC_DATA_COPY_GET_PTR(DATA) \ - ((DATA) ? (DATA)->device_private : NULL) +#define PARSEC_DATA_COPY_GET_PTR(COPY) \ + ((COPY) ? 
(COPY)->device_private : NULL) /** @} */ diff --git a/parsec/include/parsec/parsec_options.h.in b/parsec/include/parsec/parsec_options.h.in index 85af2db8e..9c397c5fb 100644 --- a/parsec/include/parsec/parsec_options.h.in +++ b/parsec/include/parsec/parsec_options.h.in @@ -71,6 +71,7 @@ /* Communication engine */ #cmakedefine PARSEC_DIST_WITH_MPI +#cmakedefine PARSEC_MPI_IS_GPU_AWARE #cmakedefine PARSEC_DIST_THREAD #cmakedefine PARSEC_DIST_PRIORITIES #cmakedefine PARSEC_DIST_COLLECTIVES diff --git a/parsec/interfaces/dtd/insert_function_internal.h b/parsec/interfaces/dtd/insert_function_internal.h index 38da01ad0..1b38a6c56 100644 --- a/parsec/interfaces/dtd/insert_function_internal.h +++ b/parsec/interfaces/dtd/insert_function_internal.h @@ -417,16 +417,16 @@ void parsec_dtd_fini(); static inline void -parsec_dtd_retain_data_copy( parsec_data_copy_t *data ) +parsec_dtd_retain_data_copy( parsec_data_copy_t *copy ) { - assert( data->super.super.obj_reference_count >= 1 ); - PARSEC_OBJ_RETAIN(data); + assert( copy->super.super.obj_reference_count >= 1 ); + PARSEC_DATA_COPY_RETAIN(copy); } static inline void -parsec_dtd_release_data_copy( parsec_data_copy_t *data ) +parsec_dtd_release_data_copy(parsec_data_copy_t *copy) { - PARSEC_OBJ_RELEASE(data); + PARSEC_DATA_COPY_RELEASE(copy); } diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index f1fffff4d..7005b963e 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -673,6 +673,17 @@ static char* dump_local_assignments( void** elem, void* arg ) if( dos > 0 ) { string_arena_init(info->sa); string_arena_add_string(info->sa, "const int %s = %s%s.value;", def->name, info->holder, def->name); +#if 0 + jdf_expr_t* type_str = jdf_find_property( def->properties, "type", NULL ); + if( NULL == type_str ) { + string_arena_add_string(info->sa, "const int %s = %s%s.value;", def->name, info->holder, def->name); + } else { + 
expr_info_t expr_info = {.sa = info->sa, .prefix = "", .suffix = "", .assignments = "locals"}; + string_arena_add_string(info->sa, "const %s %s = %s%s.value;", + dump_expr((void**)type_str, &expr_info), + def->name, info->holder, def->name); + } +#endif if( dos > 1 ) string_arena_add_string(info->sa, " (void)%s;", def->name); return string_arena_get_string(info->sa); @@ -5577,7 +5588,7 @@ jdf_generate_code_call_initialization(const jdf_t *jdf, const jdf_call_t *call, jdf_generate_code_reshape_input_from_desc(jdf, f, flow, dl, spaces); coutput("%s this_task->data._f_%s.data_out = chunk;\n" - "%s PARSEC_OBJ_RETAIN(chunk);\n", + "%s PARSEC_DATA_COPY_RETAIN(chunk);\n", spaces, flow->varname, spaces); @@ -5603,7 +5614,7 @@ jdf_generate_code_call_initialization(const jdf_t *jdf, const jdf_call_t *call, assert( dl->datatype_local.count != NULL ); string_arena_add_string(sa2, "%s", dump_expr((void**)dl->datatype_local.count, &info)); - coutput("%s chunk = parsec_arena_get_copy(%s->arena, %s, target_device, %s->opaque_dtt);\n" + coutput("%s chunk = parsec_arena_get_new_copy(%s->arena, %s, target_device, %s->opaque_dtt);\n" "%s chunk->original->owner_device = target_device;\n" "%s this_task->data._f_%s.data_out = chunk;\n", spaces, string_arena_get_string(sa), string_arena_get_string(sa2), string_arena_get_string(sa), @@ -5634,12 +5645,19 @@ jdf_generate_code_call_initialization(const jdf_t *jdf, const jdf_call_t *call, /* Code to create & fulfill a reshape promise locally in case this input dependency is typed */ jdf_generate_code_reshape_input_from_dep(jdf, f, flow, dl, spaces); - coutput("%s this_task->data._f_%s.data_out = parsec_data_get_copy(chunk->original, target_device);\n" - "#if defined(PARSEC_PROF_GRAPHER) && defined(PARSEC_PROF_TRACE)\n" + /* TODO: Setting the data_out here is kind of random, especially as some copy of the input flow. 
The only thing + * that would make sense here is to set the data_out to the dep outputs back into the user memory (output + * dep with a target into a data collection), to give the opportunity to the accelerator components to + * do a pushout to the desired location (instead of the current approach that will do a pushout to the + * data_copy on device 0 followed by a memcpy into the desired location). + */ + //coutput("%s this_task->data._f_%s.data_out = parsec_data_get_copy(chunk->original, target_device);\n", + // spaces, flow->varname); + + coutput("#if defined(PARSEC_PROF_GRAPHER) && defined(PARSEC_PROF_TRACE)\n" "%s parsec_prof_grapher_data_input(chunk->original, (parsec_task_t*)this_task, &%s, 0);\n" "#endif\n" "%s }\n", - spaces, flow->varname, spaces, JDF_OBJECT_ONAME( flow ), spaces); } @@ -5713,7 +5731,7 @@ static void jdf_generate_code_call_init_output(const jdf_t *jdf, const jdf_call_ spaces, flow->varname, spaces); - coutput("%s chunk = parsec_arena_get_copy(%s->arena, %s, target_device, %s);\n" + coutput("%s chunk = parsec_arena_get_new_copy(%s->arena, %s, target_device, %s);\n" "%s chunk->original->owner_device = target_device;\n", spaces, string_arena_get_string(sa_arena), string_arena_get_string(sa_count), string_arena_get_string(sa_datatype), spaces); @@ -6497,10 +6515,10 @@ jdf_generate_code_data_lookup(const jdf_t *jdf, * This way, it's only retained once during release_deps. 
*/ coutput(" if( NULL == this_task->repo_entry ){\n" - " this_task->repo_entry = data_repo_lookup_entry_and_create(es, %s_repo, " + " this_task->repo_entry = data_repo_lookup_entry_and_create(es, %s_repo, \n" " %s((const parsec_taskpool_t*)__parsec_tp, (const parsec_assignment_t*)&this_task->locals));\n" - " data_repo_entry_addto_usage_limit(%s_repo, this_task->repo_entry->ht_item.key, 1);" - " this_task->repo_entry ->generator = (void*)this_task; /* for AYU */\n" + " data_repo_entry_addto_usage_limit(%s_repo, this_task->repo_entry->ht_item.key, 1);\n" + " this_task->repo_entry->generator = (void*)this_task; /* for AYU */\n" "#if defined(PARSEC_SIM)\n" " assert(this_task->repo_entry ->sim_exec_date == 0);\n" " this_task->repo_entry ->sim_exec_date = this_task->sim_exec_date;\n" @@ -6510,7 +6528,7 @@ jdf_generate_code_data_lookup(const jdf_t *jdf, jdf_property_get_string(f->properties, JDF_PROP_UD_MAKE_KEY_FN_NAME, NULL), f->fname); - coutput(" /* The reshape repo is the current task repo. */" + coutput(" /* The reshape repo is the current task repo. 
*/\n" " reshape_repo = %s_repo;\n" " reshape_entry_key = %s((const parsec_taskpool_t*)__parsec_tp, (const parsec_assignment_t*)&this_task->locals) ;\n" " reshape_entry = this_task->repo_entry;\n", @@ -7014,6 +7032,12 @@ static void jdf_generate_code_hook(const jdf_t *jdf, output = UTIL_DUMP_LIST(sa, f->dataflow, next, dump_data_initialization_from_data_array, &ai2, "", "", "", ""); if( 0 != strlen(output) ) { + coutput("/* Make sure we have the data_out set to the data_in */\n"); + for( fl = f->dataflow; fl != NULL; fl = fl->next) { + if( fl->flow_flags & JDF_FLOW_TYPE_CTL ) continue; + coutput(" this_task->data._f_%s.data_out = this_task->data._f_%s.data_in;\n", + fl->varname, fl->varname); + } coutput(" /** Declare the variables that will hold the data, and all the accounting for each */\n" "%s\n", output); diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index d57ac904e..2fde0d7d0 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -342,7 +342,7 @@ void parsec_device_dump_gpu_state(parsec_device_gpu_module_t* gpu_device) parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item; parsec_output(parsec_gpu_output_stream, " %d. elem %p flags 0x%x GPU mem %p\n", i, gpu_copy, gpu_copy->flags, gpu_copy->device_private); - parsec_dump_data_copy(gpu_copy); + parsec_data_copy_dump(gpu_copy); i++; }); } @@ -354,7 +354,7 @@ void parsec_device_dump_gpu_state(parsec_device_gpu_module_t* gpu_device) parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item; parsec_output(parsec_gpu_output_stream, " %d. 
elem %p flags 0x%x GPU mem %p\n", i, gpu_copy, gpu_copy->flags, gpu_copy->device_private); - parsec_dump_data_copy(gpu_copy); + parsec_data_copy_dump(gpu_copy); i++; }); } @@ -480,7 +480,7 @@ parsec_device_data_advise(parsec_device_module_t *dev, parsec_data_t *data, int PARSEC_DEBUG_VERBOSE(20, parsec_debug_output, "Retain data copy %p [ref_count %d]", data->device_copies[ data->owner_device ], data->device_copies[ data->owner_device ]->super.super.obj_reference_count); - PARSEC_OBJ_RETAIN(data->device_copies[ data->owner_device ]); + PARSEC_DATA_COPY_RETAIN(data->device_copies[ data->owner_device ]); gpu_task->ec->data[0].data_in = data->device_copies[ data->owner_device ]; gpu_task->ec->data[0].data_out = NULL; gpu_task->ec->data[0].source_repo_entry = NULL; @@ -616,9 +616,8 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device, parsec_warning("GPU[%d:%s] Invalid argument: requesting 0 bytes of memory", gpu_device->super.device_index, gpu_device->super.name); return PARSEC_ERROR; - } else { - alloc_size = number_blocks * eltsize; } + alloc_size = number_blocks * eltsize; } else { /* number_blocks == -1 means memory_percentage is used */ alloc_size = (memory_percentage * initial_free_mem) / 100; @@ -670,7 +669,7 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device, gpu_elem->flags |= PARSEC_DATA_FLAG_PARSEC_OWNED; gpu_elem->device_index = gpu_device->super.device_index; mem_elem_per_gpu++; - PARSEC_OBJ_RETAIN(gpu_elem); + PARSEC_DATA_COPY_RETAIN(gpu_elem); PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU[%d:%s] Retain and insert GPU copy %p [ref_count %d] in LRU", gpu_device->super.device_index, gpu_device->super.name, gpu_elem, gpu_elem->super.obj_reference_count); @@ -733,12 +732,22 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->device_private, gpu_copy->super.super.obj_reference_count, original, (NULL != 
original ? original->dc : NULL)); assert( gpu_copy->device_index == gpu_device->super.device_index ); - - if( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state ) { - parsec_warning("GPU[%d:%s] still OWNS the master memory copy for data %d and it is discarding it!", - gpu_device->super.device_index, gpu_device->super.name, original->key); + if( NULL == gpu_copy->device_private ) { + PARSEC_DEBUG_VERBOSE(35, parsec_gpu_output_stream, + "GPU[%d:%s] copy %p is dangling without private data. This is OK.", + gpu_device->super.device_index, gpu_device->super.name, (void*)gpu_copy); + goto release_and_continue; + } + if( NULL == gpu_copy->original ) { + PARSEC_DEBUG_VERBOSE(35, parsec_gpu_output_stream, + "GPU[%d:%s] copy %p detached from a data but not yet reclaimed!", + gpu_device->super.device_index, gpu_device->super.name, (void*)gpu_copy); + } + if (PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state) { + parsec_warning("GPU[%d:%s] still OWNS the master memory copy for data %d (%p) and it is discarding it!", + gpu_device->super.device_index, gpu_device->super.name, original->key, (void*)gpu_copy->device_private); } - assert(0 != (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); + assert(0 != (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED)); #if defined(PARSEC_GPU_ALLOC_PER_TILE) gpu_device->memory_free( gpu_copy->device_private ); @@ -762,11 +771,12 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de #endif gpu_copy->device_private = NULL; + release_and_continue: /* At this point the data copies should have no attachment to a data_t. Thus, * before we get here (aka below parsec_fini), the destructor of the data * collection must have been called, releasing all the copies. 
*/ - PARSEC_OBJ_RELEASE(gpu_copy); assert(NULL == gpu_copy); + PARSEC_DATA_COPY_RELEASE(gpu_copy); assert(NULL == gpu_copy); } } @@ -826,6 +836,87 @@ parsec_device_memory_release( parsec_device_gpu_module_t* gpu_device ) return PARSEC_SUCCESS; } +int +parsec_device_get_copy( parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t** dc ) +{ + char task_name[] = "unknown"; + parsec_gpu_data_copy_t *gpu_mem_lru_cycling = NULL, *lru_gpu_elem; + /* Get the head of the LRU, assuming it has no readers and mark it as used, using the same mechanism as + * the GPU to GPU tranfers. Once the communication into this copy completes, the task will get into + * the GPU queues, and the data will be reattributed accordingly to this GPU. + */ + find_another_data: + lru_gpu_elem = (parsec_gpu_data_copy_t*)parsec_list_pop_front(&gpu_device->gpu_mem_lru); + if( NULL == lru_gpu_elem ) { + /* nothing available on the GPU. Let the upper level know about this */ + *dc = NULL; + return PARSEC_ERR_OUT_OF_RESOURCE; + } + if( 0 != lru_gpu_elem->readers ) { + PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, + "GPU[%d:%s]:%s: Drop LRU-retrieved GPU copy %p [readers %d, ref_count %d] original %p", + gpu_device->super.device_index, gpu_device->super.name, task_name, + lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original); + /* We do not add the copy back into the LRU. This means that for now this copy is not + * tracked via the LRU (despite being only used in read mode) and instead is dangling + * on other tasks. Thus, it will eventually need to be added back into the LRU when + * current task using it completes. + */ + goto find_another_data; + } + /* It's also possible that the ref_count of that element is bigger than 1 + * In that case, it's because some task completion did not execute yet, and + * we need to keep it in the list until it reaches 1. 
+ */ + if( lru_gpu_elem->super.super.obj_reference_count > 1 ) { + /* It's also possible (although unlikely) that we livelock here: + * if gpu_mem_lru has *only* elements with readers == 0 but + * ref_count > 1, then we might pop/push forever. We save the + * earliest element found and if we see it again it means we + * run over the entire list without finding a suitable replacement. + * We need to make progress on something else. This remains safe for as long as the + * LRU is only modified by a single thread (in this case the current thread). + */ + PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, + "GPU[%d:%s]:%s: Push back LRU-retrieved GPU copy %p [readers %d, ref_count %d] original %p", + gpu_device->super.device_index, gpu_device->super.name, task_name, + lru_gpu_elem, lru_gpu_elem->readers, lru_gpu_elem->super.super.obj_reference_count, lru_gpu_elem->original); + assert(0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); + parsec_list_push_back(&gpu_device->gpu_mem_lru, &lru_gpu_elem->super); + goto find_another_data; + } + if( gpu_mem_lru_cycling == lru_gpu_elem ) { + PARSEC_DEBUG_VERBOSE(2, parsec_gpu_output_stream, + "GPU[%d:%s]: Cycle detected on allocating memory for %s", + gpu_device->super.device_index, gpu_device->super.name, task_name); + *dc = NULL; /* did our best but failed to find a data. Return and allocate it onto another device. */ + return PARSEC_ERR_OUT_OF_RESOURCE; + } + /* detect cycles to have an opportunity to stop */ + gpu_mem_lru_cycling = (NULL == gpu_mem_lru_cycling) ? lru_gpu_elem : gpu_mem_lru_cycling; /* update the cycle detector */ + + parsec_data_t* master = lru_gpu_elem->original; + if (NULL == master ) { + /* This copy has been detached by the CPU once it has been consumed (by the communication engine), + * there is no device memory associated with it, we can safely release the CPU copy. 
+ */ + assert(1 == lru_gpu_elem->super.super.obj_reference_count); + PARSEC_OBJ_RELEASE(lru_gpu_elem); + goto find_another_data; + } + parsec_atomic_lock(&master->lock); + if ( lru_gpu_elem->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ) { + /* can't reuse, it is drained right now by another device */ + parsec_atomic_unlock(&master->lock); + goto find_another_data; + } + parsec_data_copy_detach(master, lru_gpu_elem, gpu_device->super.device_index); + parsec_atomic_wmb(); + *dc = lru_gpu_elem; + parsec_atomic_unlock(&master->lock); + return PARSEC_SUCCESS; +} + /** * Try to find memory space to move all data on the GPU. We attach a device_elem to * a memory_elem as soon as a device_elem is available. If we fail to find enough @@ -863,7 +954,10 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, assert( flow && (flow->flow_index == i) ); /* Skip CTL flows only */ - if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) continue; + if(PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & flow->flow_flags)) { + gpu_task->flow_nb_elts[i] = 0; /* assume there is nothing to transfer to the GPU */ + continue; + } PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU[%d:%s]:%s: Investigating flow %s:%d", @@ -871,7 +965,16 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, temp_loc[i] = NULL; if (this_task->data[i].data_in == NULL) continue; - + /* if the input data is already on this device there is nothing else to do */ + if( gpu_device->super.device_index == this_task->data[i].data_in->device_index ) { + PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, + "GPU[%d:%s]:%s: Flow %s:%i was already on the device %p%s", + gpu_device->super.device_index, gpu_device->super.name, task_name, + flow->name, i, gpu_elem, + this_task->data[i].data_in->data_transfer_status == PARSEC_DATA_STATUS_UNDER_TRANSFER ? 
" [in transfer]" : ""); + this_task->data[i].data_out = this_task->data[i].data_in; + continue; + } master = this_task->data[i].data_in->original; parsec_atomic_lock(&master->lock); gpu_elem = PARSEC_DATA_GET_COPY(master, gpu_device->super.device_index); @@ -926,6 +1029,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, copy_readers_update = 0; assert(0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ); gpu_elem->device_private = zone_malloc(gpu_device->memory, gpu_task->flow_nb_elts[i]); + gpu_elem->arena_chunk = (parsec_arena_chunk_t *)gpu_device->memory; if( NULL == gpu_elem->device_private ) { #endif @@ -947,6 +1051,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, for( j = 0; j <= i; j++ ) { /* This flow could be a control flow */ if( NULL == temp_loc[j] ) continue; + this_task->data[j].data_out = gpu_elem; /* reset the data out */ /* This flow could be non-parsec-owned, in which case we can't reclaim it */ if( 0 == (temp_loc[j]->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, @@ -957,7 +1062,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, parsec_list_push_front(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)temp_loc[j]); } #if !defined(PARSEC_GPU_ALLOC_PER_TILE) - PARSEC_OBJ_RELEASE(gpu_elem); + PARSEC_DATA_COPY_RELEASE(gpu_elem); #endif parsec_atomic_unlock(&master->lock); return PARSEC_HOOK_RETURN_NEXT; @@ -1115,7 +1220,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, "GPU[%d:%s]:%s: Release LRU-retrieved GPU copy %p [ref_count %d: must be 1]", gpu_device->super.device_index, gpu_device->super.name, task_name, lru_gpu_elem, lru_gpu_elem->super.super.obj_reference_count); - PARSEC_OBJ_RELEASE(lru_gpu_elem); + PARSEC_DATA_COPY_RELEASE(lru_gpu_elem); assert( NULL == lru_gpu_elem ); goto malloc_data; } @@ -1187,8 +1292,8 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, 
parsec_gpu_exec_stream_t *gpu_stream) { int ret; - parsec_data_copy_t * source; - parsec_data_copy_t * dest; + parsec_data_copy_t * src_copy; + parsec_data_copy_t * dst_copy; parsec_device_gpu_module_t *src_dev; parsec_device_gpu_module_t *dst_dev; parsec_task_t *task = gtask->ec; @@ -1197,11 +1302,10 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, for(int i = 0; i < task->task_class->nb_flows; i++) { if( !(flow_mask & (1U << i)) ) continue; - source = gtask->sources[i]; - assert(source->device_private != NULL); - dest = task->data[i].data_out; - src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); - dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); + src_copy = gtask->sources[i]; + dst_copy = task->data[i].data_out; + src_dev = (parsec_device_gpu_module_t *)parsec_mca_device_get(src_copy->device_index); + dst_dev = (parsec_device_gpu_module_t *)parsec_mca_device_get(dst_copy->device_index); if(src_dev->super.type == dst_dev->super.type) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); @@ -1210,13 +1314,12 @@ parsec_default_gpu_stage_in(parsec_gpu_task_t *gtask, dir = parsec_device_gpu_transfer_direction_h2d; } - count = (source->original->nb_elts <= dest->original->nb_elts) ? - source->original->nb_elts : dest->original->nb_elts; - ret = dst_dev->memcpy_async( dst_dev, gpu_stream, - dest->device_private, - source->device_private, - count, - dir ); + count = (src_copy->original->nb_elts <= dst_copy->original->nb_elts) ? 
src_copy->original->nb_elts : dst_copy->original->nb_elts; + ret = dst_dev->memcpy_async(dst_dev, gpu_stream, + dst_copy->device_private, + src_copy->device_private, + count, + dir); if(PARSEC_SUCCESS != ret) return PARSEC_HOOK_RETURN_ERROR; } @@ -1238,8 +1341,8 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, parsec_gpu_exec_stream_t *gpu_stream) { int ret; - parsec_data_copy_t * source; - parsec_data_copy_t * dest; + parsec_data_copy_t * src_copy; + parsec_data_copy_t * dst_copy; parsec_device_gpu_module_t *dst_dev, *src_dev; parsec_task_t *task = gtask->ec; size_t count; @@ -1247,14 +1350,14 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, int i; for(i = 0; i < task->task_class->nb_flows; i++){ if(flow_mask & (1U << i)){ - source = task->data[i].data_out; - dest = source->original->device_copies[0]; - dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dest->device_index); - src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(source->device_index); - - count = (source->original->nb_elts <= dest->original->nb_elts) ? source->original->nb_elts : - dest->original->nb_elts; - if( src_dev->super.type == dst_dev->super.type ) { + src_copy = task->data[i].data_out; + dst_copy = src_copy->original->device_copies[0]; + dst_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(dst_copy->device_index); + src_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get(src_copy->device_index); + assert(dst_copy->data_transfer_status != PARSEC_DATA_STATUS_UNDER_TRANSFER); + dst_copy->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER; + count = (src_copy->original->nb_elts <= dst_copy->original->nb_elts) ? 
src_copy->original->nb_elts : dst_copy->original->nb_elts; + if( (src_dev->super.type & PARSEC_DEV_ANY_TYPE) == (dst_dev->super.type & PARSEC_DEV_ANY_TYPE) ) { assert( src_dev->peer_access_mask & (1 << dst_dev->super.device_index) ); dir = parsec_device_gpu_transfer_direction_d2d; } else { @@ -1267,8 +1370,8 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, } } ret = src_dev->memcpy_async( src_dev, gpu_stream, - dest->device_private, - source->device_private, + dst_copy->device_private, + src_copy->device_private, count, dir ); if(PARSEC_SUCCESS != ret) { @@ -1306,7 +1409,24 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, "GPU[%d:%s]: Prefetch task %p is staging in", gpu_device->super.device_index, gpu_device->super.name, gpu_task); } - + if( NULL == gpu_elem ) { + if( candidate->device_index == gpu_device->super.device_index ) { + /* the candidate is already located on the GPU, no transfer should be necessary but let's do the bookkeeping */ + if( (PARSEC_FLOW_ACCESS_WRITE & type) && (gpu_task->task_type != PARSEC_GPU_TASK_TYPE_PREFETCH) ) { + candidate->version++; + parsec_list_item_ring_chop((parsec_list_item_t *)candidate); + PARSEC_LIST_ITEM_SINGLETON(candidate); + } + if( PARSEC_FLOW_ACCESS_READ & type ) { + parsec_atomic_fetch_add_int32(&candidate->readers, 1); + } + return PARSEC_HOOK_RETURN_DONE; + } + parsec_warning("GPU[%d:%s]:\t device_data_stage_in without a proper data_out on the device " + "and with a data_in (%p) located on another device %d", + gpu_device->super.device_index, gpu_device->super.name, + candidate, candidate->device_index); + } parsec_atomic_lock( &original->lock ); gpu_task->sources[flow->flow_index] = candidate; /* default source for the transfer */ @@ -1677,6 +1797,7 @@ parsec_device_callback_complete_push(parsec_device_gpu_module_t *gpu_device, /* We also don't push back non-parsec-owned copies */ if(NULL != task->data[i].data_out && 0 == (task->data[i].data_out->flags & 
PARSEC_DATA_FLAG_PARSEC_OWNED)) continue; + if( gpu_device->super.device_index == task->data[i].data_in->device_index ) continue; flow = gtask->flow[i]; assert( flow ); @@ -2183,7 +2304,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, /* If the gpu copy is not owned by parsec, we don't manage it at all */ if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; original = gpu_copy->original; - rc = gpu_task->stage_out? gpu_task->stage_out(gpu_task, (1U << i), gpu_stream): PARSEC_SUCCESS; + rc = gpu_task->stage_out ? gpu_task->stage_out(gpu_task, (1U << i), gpu_stream): PARSEC_SUCCESS; if(PARSEC_SUCCESS != rc) { parsec_warning( "GPU[%d:%s]: gpu_task->stage_out from device rc=%d @%s:%d\n" "\tdata %s <<%p>> -> <<%p>>\n", @@ -2221,6 +2342,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, assert( this_task->data[i].data_in == NULL || original == this_task->data[i].data_in->original ); +#if 0 if( (gpu_task->task_type != PARSEC_GPU_TASK_TYPE_D2D_COMPLETE) && !(flow->flow_flags & PARSEC_FLOW_ACCESS_WRITE) ) { /* Do not propagate GPU copies to successors (temporary solution) */ this_task->data[i].data_out = original->device_copies[0]; @@ -2232,6 +2354,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, this_task->data[i].data_out, this_task->data[i].data_out->super.super.obj_reference_count, original); } +#endif parsec_atomic_lock(&original->lock); if( flow->flow_flags & PARSEC_FLOW_ACCESS_READ ) { int current_readers = parsec_atomic_fetch_sub_int32(&gpu_copy->readers, 1) - 1; @@ -2305,7 +2428,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, #endif /* Move the data back into main memory */ rc = gpu_task->stage_out? 
gpu_task->stage_out(gpu_task, (1U << flow->flow_index), gpu_stream): PARSEC_SUCCESS; - if(PARSEC_SUCCESS != rc) { + if( PARSEC_SUCCESS != rc ) { parsec_warning( "GPU[%d:%s]: gpu_task->stage_out from device rc=%d @%s:%d\n" "\tdata %s <<%p>> -> <<%p>>\n", gpu_device->super.device_index, gpu_device->super.name, rc, __func__, __LINE__, @@ -2344,8 +2467,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, parsec_gpu_task_t *gpu_task ) { parsec_task_t *this_task = gpu_task->ec; - parsec_gpu_data_copy_t *gpu_copy, *cpu_copy; - parsec_data_t *original; + parsec_gpu_data_copy_t *gpu_copy; int i; #if defined(PARSEC_DEBUG_NOISIER) @@ -2370,45 +2492,60 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, } gpu_copy = this_task->data[i].data_out; - original = gpu_copy->original; - cpu_copy = original->device_copies[0]; /* If it is a copy managed by the user, don't bother either */ if( 0 == (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; +#if 0 + parsec_data_t *original = gpu_copy->original; + parsec_gpu_data_copy_t *cpu_copy = original->device_copies[0]; + if( this_task->data[i].data_in == this_task->data[i].data_out ) { + /** + * There might be a race condition here. We can't assume the first CPU + * version is the corresponding CPU copy, as a new CPU-bound data + * might have been created meanwhile. + * + * WARNING: For now we always forward the cpu_copy to the next task, to + * do that, we lie to the engine by updating the CPU copy to the same + * status than the GPU copy without updating the data itself. Thus, the + * cpu copy is really invalid. this is related to Issue #88, and the + * fact that: + * - we don't forward the gpu copy as output + * - we always take a cpu copy as input, so it has to be in the + * same state as the GPU to prevent an extra data movement. 
+ */ + assert( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state ); + gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + cpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; - /** - * There might be a race condition here. We can't assume the first CPU - * version is the corresponding CPU copy, as a new CPU-bound data - * might have been created meanwhile. - * - * WARNING: For now we always forward the cpu_copy to the next task, to - * do that, we lie to the engine by updating the CPU copy to the same - * status than the GPU copy without updating the data itself. Thus, the - * cpu copy is really invalid. this is related to Issue #88, and the - * fact that: - * - we don't forward the gpu copy as output - * - we always take a cpu copy as input, so it has to be in the - * same state as the GPU to prevent an extra data movement. - */ - assert( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state ); - gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; - cpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; - - cpu_copy->version = gpu_copy->version; - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, - "GPU[%d:%s]: CPU copy %p [ref_count %d] gets the same version %d as GPU copy %p [ref_count %d]", - gpu_device->super.device_index, gpu_device->super.name, - cpu_copy, cpu_copy->super.super.obj_reference_count, cpu_copy->version, gpu_copy, gpu_copy->super.super.obj_reference_count); - - /** - * Let's lie to the engine by reporting that working version of this - * data is now on the CPU. 
- */ - this_task->data[i].data_out = cpu_copy; + cpu_copy->version = gpu_copy->version; + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, + "GPU[%d:%s]: CPU copy %p [ref_count %d] gets the same version %d as GPU copy %p [ref_count %d]", + gpu_device->super.device_index, gpu_device->super.name, + cpu_copy, cpu_copy->super.super.obj_reference_count, cpu_copy->version, gpu_copy, gpu_copy->super.super.obj_reference_count); - assert( 0 <= gpu_copy->readers ); + /** + * Let's lie to the engine by reporting that working version of this + * data is now on the CPU. + */ + this_task->data[i].data_out = cpu_copy; + } +#endif + assert(0 <= gpu_copy->readers); if( gpu_task->pushout & (1 << i) ) { + parsec_data_t *original = gpu_copy->original; + parsec_gpu_data_copy_t *cpu_copy = original->device_copies[0]; + /* Update the CPU copy to reflect the current status */ + assert(cpu_copy->version < gpu_copy->version); + cpu_copy->version = gpu_copy->version; + cpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; + assert(PARSEC_DATA_STATUS_UNDER_TRANSFER == cpu_copy->data_transfer_status); + cpu_copy->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; + if( 0 == (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY) ) { + /* Report the CPU copy as the output of the task. */ + this_task->data[i].data_out = cpu_copy; + } PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, "GPU copy %p [ref_count %d] moved to the read LRU in %s", gpu_copy, gpu_copy->super.super.obj_reference_count, __func__); diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h index fa25b87a3..bb9ff93ea 100644 --- a/parsec/mca/device/device_gpu.h +++ b/parsec/mca/device/device_gpu.h @@ -66,7 +66,7 @@ typedef int (parsec_stage_in_function_t)(parsec_gpu_task_t *gtask, /* Function type to transfer data from the GPU device. 
* Transfer transfer the contiguous bytes from - * task->data[i].data_in to task->data[i].data_out. + * task->data[i].data_out to the copy on device 0. * * @param[in] task parsec_task_t containing task->data[i].data_in, task->data[i].data_out. * @param[in] flow_mask indicating task flows for which to transfer. diff --git a/parsec/parsec.c b/parsec/parsec.c index 49e631dab..387ea9f88 100644 --- a/parsec/parsec.c +++ b/parsec/parsec.c @@ -1799,7 +1799,7 @@ parsec_release_dep_fct(parsec_execution_stream_t *es, * Check that we don't forward a NULL data to someone else. This * can be done only on the src node, since the dst node can * check for datatypes without knowing the data yet. - * By checking now, we allow for the data to be created any time bfore we + * By checking now, we allow for the data to be created any time before we * actually try to transfer it. */ if( PARSEC_UNLIKELY((data->data == NULL) && @@ -1840,20 +1840,20 @@ parsec_release_dep_fct(parsec_execution_stream_t *es, #ifdef PARSEC_RESHAPE_BEFORE_SEND_TO_REMOTE /* Now everything is a reshaping entry */ /* Check if we need to reshape before sending */ - if(parsec_is_CTL_dep(output->data)){ /* CTL DEP */ + if(parsec_is_CTL_dep(output->data)) { /* CTL DEP */ output->data.data_future = NULL; output->data.repo = NULL; output->data.repo_key = -1; - }else{ + } else { /* Get reshape from whatever repo it has been set up into */ output->data.data_future = (parsec_datacopy_future_t*)target_dc; output->data.repo = target_repo; output->data.repo_key = target_repo_entry->ht_item.key; PARSEC_DEBUG_VERBOSE(4, parsec_debug_output, - "th%d RESHAPE_PROMISE SETUP FOR REMOTE DEPS [%p:%p] for INLINE REMOTE %s fut %p", - es->th_id, output->data.data, (output->data.data)->dtt, - (target_repo == successor_repo? 
"UNFULFILLED" : "FULFILLED"), - output->data.data_future); + "th%d RESHAPE_PROMISE SETUP FOR REMOTE DEPS [%p:%p] for INLINE REMOTE %s fut %p", + es->th_id, output->data.data, (output->data.data)->dtt, + (target_repo == successor_repo? "UNFULFILLED" : "FULFILLED"), + output->data.data_future); } #endif } else { diff --git a/parsec/parsec_internal.h b/parsec/parsec_internal.h index 214d18c75..cbeb5695a 100644 --- a/parsec/parsec_internal.h +++ b/parsec/parsec_internal.h @@ -201,6 +201,9 @@ PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_taskpool_t); #define PARSEC_DEPENDENCIES_STARTUP_TASK ((parsec_dependency_t)(1<<29)) #define PARSEC_DEPENDENCIES_BITMASK (~(PARSEC_DEPENDENCIES_TASK_DONE|PARSEC_DEPENDENCIES_IN_DONE|PARSEC_DEPENDENCIES_STARTUP_TASK)) +#define PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY 0x00000002 +#define PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY 0x00000001 + /** * This structure is used internally by the parsec_dependencies_t structures */ @@ -492,6 +495,12 @@ PARSEC_DECLSPEC extern int parsec_slow_bind_warning; * the scheduler, but can provide a better cache reuse. */ PARSEC_DECLSPEC extern int parsec_runtime_keep_highest_priority_task; +/** + * Global configuration mask allowing or not for the data to be sent or received, + * from or to, GPU memory. It can be an OR between PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY + * and PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY. + */ +PARSEC_DECLSPEC extern int parsec_mpi_allow_gpu_memory_communications; /** * Description of the state of the task. 
It indicates what will be the next diff --git a/parsec/parsec_mpi_funnelled.c b/parsec/parsec_mpi_funnelled.c index c2b84998b..662007e02 100644 --- a/parsec/parsec_mpi_funnelled.c +++ b/parsec/parsec_mpi_funnelled.c @@ -203,6 +203,8 @@ parsec_list_t mpi_funnelled_dynamic_sendreq_fifo; /* ordered non threaded fifo * parsec_list_t mpi_funnelled_dynamic_recvreq_fifo; /* ordered non threaded fifo */ parsec_mempool_t *mpi_funnelled_dynamic_req_mempool = NULL; +int parsec_mpi_allow_gpu_memory_communications = 3; + /* This structure is used to save all the information necessary to * invoke a callback after a MPI_Request is satisfied */ @@ -519,6 +521,14 @@ static int mpi_funneled_init_once(parsec_context_t* context) parsec_param_enable_mpi_overtake = 0; /* Don't allow to be changed */ #endif /* !defined(PARSEC_HAVE_MPI_OVERTAKE) */ +#if !defined(PARSEC_MPI_IS_GPU_AWARE) + parsec_mpi_allow_gpu_memory_communications = 0; +#endif + parsec_mca_param_reg_int_name("mpi", "gpu_aware", + "Enabled if PaRSEC should allow MPI to move data directly from or to GPU memory. Otherwise, all data" + " movements will transit through CPU memory, and will always have a backup copy there. Accepted values " + "are ORed between 1 for receiving into GPU memory and 2 for sending from GPU memory", + false, false, parsec_mpi_allow_gpu_memory_communications, &parsec_mpi_allow_gpu_memory_communications); (void)context; return 0; } @@ -1332,6 +1342,10 @@ parsec_check_overlapping_binding(parsec_context_t *context) #endif } +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) +#include +#endif /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */ + int mpi_no_thread_enable(parsec_comm_engine_t *ce) { @@ -1343,6 +1357,21 @@ mpi_no_thread_enable(parsec_comm_engine_t *ce) if(parsec_ce_mpi_comm == (MPI_Comm)context->comm_ctx) { return PARSEC_SUCCESS; } +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) + /* The communication thread need to have a CUDA context in order to be able to use + * CUDA managed memory. 
It should be enough to just create a context onto the first + * active device. + */ + for (int dev = 0; dev < (int)parsec_nb_devices; dev++) { + parsec_device_module_t *device = parsec_mca_device_get(dev); + if (PARSEC_DEV_CUDA & device->type) { + parsec_device_gpu_module_t *gpu_dev = (parsec_device_gpu_module_t*)device; + gpu_dev->set_device(gpu_dev); + } + } +#endif /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */ + + /* Finish the initialization of the communication engine */ parsec_ce.mem_register = mpi_no_thread_mem_register; parsec_ce.mem_unregister = mpi_no_thread_mem_unregister; diff --git a/parsec/parsec_reshape.c b/parsec/parsec_reshape.c index eb6045647..83aa970bf 100644 --- a/parsec/parsec_reshape.c +++ b/parsec/parsec_reshape.c @@ -50,8 +50,8 @@ void parsec_cleanup_reshape_promise(parsec_base_future_t *future) free(match_data); } if(d_fut->super.tracked_data != NULL){ - parsec_data_copy_t * data = (parsec_data_copy_t*) d_fut->super.tracked_data; - PARSEC_DATA_COPY_RELEASE(data); + parsec_data_copy_t * copy = (parsec_data_copy_t*) d_fut->super.tracked_data; + PARSEC_DATA_COPY_RELEASE(copy); } } @@ -141,7 +141,7 @@ parsec_new_reshape_promise(parsec_dep_data_description_t* data, * going to consume the original data->data in order to reshape it, and * all other successors will use directly the reshaped data instead. */ - PARSEC_OBJ_RETAIN( future_in_data->data ); + PARSEC_DATA_COPY_RETAIN( future_in_data->data ); return data_future; } @@ -331,7 +331,7 @@ parsec_create_reshape_promise(parsec_execution_stream_t *es, } #endif - /* retain the future if it's being reuse. */ + /* retain the future if it's being reused. */ if ( !new_future ) PARSEC_OBJ_RETAIN(data->data_future); /* Set up the reshape promise. */ @@ -439,7 +439,7 @@ parsec_set_up_reshape_promise(parsec_execution_stream_t *es, /* Data has been received with the expected remote type of the * successor contained on data->data->dtt. 
*/ data->local.dst_datatype = data->local.src_datatype = data->data->dtt; - }else{ + } else { /* Packed data because multiple unpacking alternatives at reception. */ const parsec_task_class_t* fct = newcontext->task_class; uint32_t flow_mask = (1U << dep->flow->flow_index) | 0x80000000; /* in flow */ @@ -579,7 +579,7 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, data->data_future = (parsec_datacopy_future_t*)reshape_repo_entry->data[dep_flow_index]; } - if(data->data_future == NULL){ + if(data->data_future == NULL) { parsec_create_reshape_promise(es, data, 0, -1, -1, /* no src dst rank */ @@ -595,9 +595,6 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, char task_name[MAX_TASK_STRLEN]; parsec_task_snprintf(task_name, MAX_TASK_STRLEN, task); - char type_string[MAX_TASK_STRLEN]="UNFULFILLED"; - char orig_string[MAX_TASK_STRLEN]="LOCAL INLINE"; - char type_name_src[MAX_TASK_STRLEN] = "NULL"; char type_name_dst[MAX_TASK_STRLEN] = "NULL"; char type_name_data[MAX_TASK_STRLEN] = "NULL"; @@ -608,12 +605,11 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, PARSEC_DEBUG_VERBOSE(12, parsec_debug_output, "th%d RESHAPE_PROMISE CREATE %s %s [%s:%p:%p -> %p] flow_idx %u fut %p on %s(%p) k%d dtt %s -> %s [data %s]", - es->th_id, type_string, orig_string, task_name, data->data, data->data->dtt, + es->th_id, "UNFULFILLED", "LOCAL INLINE", task_name, data->data, data->data->dtt, data->local.dst_datatype, dep_flow_index, data->data_future, "CURR_REPO", setup_repo, setup_repo_key, type_name_src, type_name_dst, type_name_data); - #endif } @@ -629,7 +625,7 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, es->th_id, *reshape, (*reshape)->dtt, data->data_future); /* reshape completed */ - PARSEC_OBJ_RETAIN(*reshape); + PARSEC_DATA_COPY_RETAIN(*reshape); PARSEC_OBJ_RELEASE(data->data_future); /* Clean up the old stuff on the repo used temporarily to hold * the inline reshape promise. 
@@ -642,7 +638,7 @@ parsec_get_copy_reshape_inline(parsec_execution_stream_t *es, /** * * Routine to obtain a reshaped copy matching the specifications when reading - * a tile from the datacollection. + * a tile from the data collection. * If a reshape needs to be performed, it is done using an inline reshape * promise, i.e., creating and fulfilling a local future promise (only * the current task instance is involved). Each thread accessing the same @@ -769,7 +765,7 @@ parsec_get_copy_reshape_from_dep(parsec_execution_stream_t *es, "th%d RESHAPE_PROMISE OBTAINED [%p:%p] for %s fut %p", es->th_id, *reshape, (*reshape)->dtt, task_string, data->data_future); - PARSEC_OBJ_RETAIN(*reshape); + PARSEC_DATA_COPY_RETAIN(*reshape); PARSEC_OBJ_RELEASE(data->data_future); return PARSEC_HOOK_RETURN_RESHAPE_DONE; diff --git a/parsec/remote_dep.c b/parsec/remote_dep.c index 1f6920cac..fe4b6c5ec 100644 --- a/parsec/remote_dep.c +++ b/parsec/remote_dep.c @@ -492,7 +492,7 @@ int parsec_remote_dep_activate(parsec_execution_stream_t* es, /* This assert is not correct anymore, we don't need an arena to send to a remote * assert(NULL != output->data.remote.arena);*/ assert( !parsec_is_CTL_dep(&output->data) ); - PARSEC_OBJ_RETAIN(output->data.data); + PARSEC_DATA_COPY_RETAIN(output->data.data); } for( array_index = count = 0; count < remote_deps->output[i].count_bits; array_index++ ) { diff --git a/parsec/remote_dep.h b/parsec/remote_dep.h index 931053b7d..4e5c15677 100644 --- a/parsec/remote_dep.h +++ b/parsec/remote_dep.h @@ -87,6 +87,11 @@ struct parsec_dep_data_description_s { */ parsec_datacopy_future_t *data_future; + /* If we can extract a preferred location for the incoming data set it + * here, otherwise the memory for the incoming data will be allocated + * on the main memory (device 0). 
+ */ + int32_t preferred_device; #ifdef PARSEC_RESHAPE_BEFORE_SEND_TO_REMOTE /* Keeping current repo & key to be able to consume when * the "remote" successors (aka the communication engine) diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index a06a2088c..7c6aba035 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -142,10 +142,6 @@ parsec_execution_stream_t parsec_comm_es = { static void remote_dep_mpi_put_start(parsec_execution_stream_t* es, dep_cmd_item_t* item); static void remote_dep_mpi_get_start(parsec_execution_stream_t* es, parsec_remote_deps_t* deps); -static void remote_dep_mpi_get_end(parsec_execution_stream_t* es, - int idx, - parsec_remote_deps_t* deps); - static int remote_dep_mpi_get_end_cb(parsec_comm_engine_t *ce, parsec_ce_tag_t tag, @@ -565,25 +561,34 @@ void parsec_remote_dep_memcpy(parsec_execution_stream_t* es, item->cmd.memcpy.destination = dst; item->cmd.memcpy.layout = data->local; - PARSEC_OBJ_RETAIN(src); + PARSEC_DATA_COPY_RETAIN(src); remote_dep_inc_flying_messages(tp); parsec_dequeue_push_back(&dep_cmd_queue, (parsec_list_item_t*) item); } static inline parsec_data_copy_t* -remote_dep_copy_allocate(parsec_dep_type_description_t* data) +remote_dep_copy_allocate(parsec_dep_type_description_t* data, int preferred_device) { parsec_data_copy_t* dc; if( NULL == data->arena ) { assert(0 == data->dst_count); return NULL; } - dc = parsec_arena_get_copy(data->arena, data->dst_count, 0, data->dst_datatype); - - dc->coherency_state = PARSEC_DATA_COHERENCY_EXCLUSIVE; - PARSEC_DEBUG_VERBOSE(20, parsec_comm_output_stream, "MPI:\tMalloc new remote tile %p size %" PRIu64 " count = %" PRIu64 " displ = %" PRIi64 " %p", - dc, data->arena->elem_size, data->dst_count, data->dst_displ, data->arena); + /* Go and allocate on the preferred device. If that fails, fall back and allocate a + * copy on the main memory (device 0), and parsec will transfer the data as needed + * for all tasks executing on acclerators. 
+ */ + dc = parsec_arena_get_new_copy(data->arena, data->dst_count, preferred_device, data->dst_datatype); + PARSEC_DATA_COPY_RETAIN(dc); + /* don't use preferred_device, it might not be the location where the data copy resides */ + parsec_data_start_transfer_ownership_to_copy(dc->original, dc->device_index, PARSEC_FLOW_ACCESS_WRITE); + if (dc->device_index != preferred_device) { + PARSEC_DEBUG_VERBOSE(5, parsec_comm_output_stream, "MPI:\tFail to allocate tile on device %d and instead allocate on device %d\n", + preferred_device, dc->device_index); + } + PARSEC_DEBUG_VERBOSE(5, parsec_comm_output_stream, "MPI:\tMalloc new temporary tile [dev %d] copy %p size %" PRIu64 " count = %" PRIu64 " displ = %" PRIi64 " %p", + dc->device_index, dc, data->arena->elem_size, data->dst_count, data->dst_displ, data->arena); return dc; } @@ -598,7 +603,7 @@ static inline parsec_data_copy_t* reshape_copy_allocate(parsec_dep_type_description_t* data) { parsec_data_copy_t* dc; - dc = remote_dep_copy_allocate(data); + dc = remote_dep_copy_allocate(data, 0 /* default device */); parsec_data_start_transfer_ownership_to_copy(dc->original, 0, @@ -608,7 +613,7 @@ reshape_copy_allocate(parsec_dep_type_description_t* data) /** * - * Routine to fulfilled a reshape promise by the current thread + * Fulfill a reshape promise by the current thread * (when MPI_THREAD_MULTIPLE) or delegate the reshaping to the communication * thread. * Routine set as callback when initializing a future. @@ -657,9 +662,9 @@ void parsec_local_reshape_cb(parsec_base_future_t *future, ... 
) } if(src_pack_size != dst_pack_size){ parsec_warning("parsec_local_reshape: reshape requested between dtt with different packed size fut %p dtt [%p:%s = sz(%d) -> %p:%s= sz(%d)]", - future, - dt->local->src_datatype, type_name_src, src_pack_size, - dt->local->dst_datatype, type_name_dst, dst_pack_size); + future, + dt->local->src_datatype, type_name_src, src_pack_size, + dt->local->dst_datatype, type_name_dst, dst_pack_size); } #endif @@ -779,13 +784,19 @@ remote_dep_mpi_retrieve_datatype(parsec_execution_stream_t *eu, return PARSEC_ITERATE_STOP; } if(old_dtt != PARSEC_DATATYPE_NULL) { + /* multiple input deps from the same predecessor exists. Be careful on what format + * to receive the data. It would not make sense to receive different amount, but + * it is legal to receive them with different type signatures. In this case, ignore + * the datatype, and instead fall back into a packed format (aka bytes) and the + * entire length of the incomming data. + */ if(old_dtt != output->data.remote.dst_datatype) { #if defined(PARSEC_DEBUG_NOISIER) char type_name_src[MAX_TASK_STRLEN] = "NULL"; char type_name_dst[MAX_TASK_STRLEN] = "NULL"; int len; - if(old_dtt!=PARSEC_DATATYPE_NULL) MPI_Type_get_name(old_dtt, type_name_src, &len); - if(output->data.remote.dst_datatype!=PARSEC_DATATYPE_NULL) MPI_Type_get_name(output->data.remote.dst_datatype, type_name_dst, &len); + if(old_dtt != PARSEC_DATATYPE_NULL) MPI_Type_get_name(old_dtt, type_name_src, &len); + if(output->data.remote.dst_datatype != PARSEC_DATATYPE_NULL) MPI_Type_get_name(output->data.remote.dst_datatype, type_name_dst, &len); PARSEC_DEBUG_VERBOSE(30, parsec_comm_output_stream, "MPI: retrieve dtt for %s [dep_datatype_index %x] DTT: old %s new %s (%p) --> PACKED", newcontext->task_class->name, dep->dep_datatype_index, type_name_src, type_name_dst, output->data.remote.dst_datatype); #endif @@ -793,7 +804,6 @@ remote_dep_mpi_retrieve_datatype(parsec_execution_stream_t *eu, parsec_ce.pack_size(&parsec_ce, 
output->data.remote.dst_count, output->data.remote.dst_datatype, &dsize); output->data.remote.src_count = output->data.remote.dst_count = dsize; output->data.remote.src_datatype = output->data.remote.dst_datatype = PARSEC_DATATYPE_PACKED; - return PARSEC_ITERATE_STOP; } } @@ -808,6 +818,18 @@ remote_dep_mpi_retrieve_datatype(parsec_execution_stream_t *eu, newcontext->task_class->name, dep->dep_datatype_index, type_name_src, type_name_dst, output->data.remote.dst_datatype); } #endif + /* Predict where the incoming temporary should be located, by using the data_affinity. + * This only works is the task affinity is linked to the output location of the task, which + * is mostly true for owner-compute type of algorithms. + */ + output->data.preferred_device = 0; /* the default is CPU memory (aka. device 0) */ + if (NULL != fct->data_affinity ) { + parsec_data_ref_t dref; + fct->data_affinity(newcontext, &dref); + parsec_data_t* data = dref.dc->data_of_key(dref.dc, dref.key); + output->data.preferred_device = (-1 != data->preferred_device) ? 
+ data->preferred_device : data->owner_device; + } return PARSEC_ITERATE_CONTINUE; } @@ -989,6 +1011,12 @@ remote_dep_release_incoming(parsec_execution_stream_t* es, task.data[target->flow_index].source_repo_entry = NULL; task.data[target->flow_index].data_in = origin->output[i].data.data; task.data[target->flow_index].data_out = origin->output[i].data.data; + if( NULL != origin->output[i].data.data ) { /* nothing for control flows */ + /* The data has been fully received, mark the copy accordingly */ + task.data[target->flow_index].data_in->coherency_state = PARSEC_DATA_COHERENCY_OWNED; + task.data[target->flow_index].data_in->flags &= ~PARSEC_DATA_FLAG_TRANSIT; /* not in transit anymore */ + task.data[target->flow_index].data_in->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; + } } #ifdef PARSEC_DIST_COLLECTIVES @@ -1436,7 +1464,7 @@ static int local_dep_nothread_reshape(parsec_execution_stream_t* es, * once all successors have consumed the future, in case it is needed * as an input for nested futures. 
*/ - PARSEC_OBJ_RETAIN(cmd->memcpy.source); + PARSEC_DATA_COPY_RETAIN(cmd->memcpy.source); int rc = remote_dep_nothread_memcpy(es, item); assert(MPI_SUCCESS == rc); @@ -1649,7 +1677,7 @@ remote_dep_mpi_put_start(parsec_execution_stream_t* es, deps->output[k].data.data = reshape_data; - PARSEC_OBJ_RETAIN(reshape_data); + PARSEC_DATA_COPY_RETAIN(reshape_data); PARSEC_DATA_COPY_RELEASE(old_data);/*old data has been retained for remote communication*/ PARSEC_OBJ_RELEASE(deps->output[k].data.data_future); @@ -1854,7 +1882,8 @@ static void remote_dep_mpi_recv_activate(parsec_execution_stream_t* es, if((length - (*position)) >= (int)data_sizes[ds_idx]) { assert(NULL == data_desc->data); /* we do not support in-place tiles now, make sure it doesn't happen yet */ if(NULL == data_desc->data) { - data_desc->data = remote_dep_copy_allocate(type_desc); + /* if we have to unpack the data onto this new copy we might want to allocated it on the CPU */ + data_desc->data = remote_dep_copy_allocate(type_desc, 0); } #ifndef PARSEC_PROF_DRY_DEP PARSEC_DEBUG_VERBOSE(10, parsec_comm_output_stream, @@ -2080,9 +2109,13 @@ static void remote_dep_mpi_get_start(parsec_execution_stream_t* es, /* prepare the local receiving data */ assert(NULL == deps->output[k].data.data); /* we do not support in-place tiles now, make sure it doesn't happen yet */ if(NULL == deps->output[k].data.data) { - deps->output[k].data.data = remote_dep_copy_allocate(&deps->output[k].data.remote); + int best_device = (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY) ? 
deps->output[k].data.preferred_device : 0; + deps->output[k].data.data = remote_dep_copy_allocate(&deps->output[k].data.remote, best_device); } - dtt = deps->output[k].data.remote.dst_datatype; + /* Mark the data under tranfer */ + deps->output[k].data.data->data_transfer_status = PARSEC_DATA_STATUS_UNDER_TRANSFER; + deps->output[k].data.data->flags |= PARSEC_DATA_FLAG_TRANSIT; + dtt = deps->output[k].data.remote.dst_datatype; nbdtt = deps->output[k].data.remote.dst_count; /* We have the remote mem_handle. @@ -2159,14 +2192,6 @@ static void remote_dep_mpi_get_start(parsec_execution_stream_t* es, } } -static void remote_dep_mpi_get_end(parsec_execution_stream_t* es, - int idx, - parsec_remote_deps_t* deps) -{ - /* The ref on the data will be released below */ - remote_dep_release_incoming(es, deps, (1U<es_profile, MPI_Data_pldr_ek, callback_data->event_id); #endif /* PARSEC_PROF_TRACE */ - remote_dep_mpi_get_end(es, callback_data->k, deps); + remote_dep_release_incoming(es, deps, (1U << callback_data->k)); parsec_ce.mem_unregister(&callback_data->memory_handle); parsec_thread_mempool_free(parsec_remote_dep_cb_data_mempool->thread_mempools, callback_data); @@ -2239,6 +2264,7 @@ int remote_dep_ce_reconfigure(parsec_context_t* context) * execution stream to parsec_comm_es. */ parsec_set_my_execution_stream(&parsec_comm_es); } + return PARSEC_SUCCESS; } diff --git a/parsec/utils/zone_malloc.c b/parsec/utils/zone_malloc.c index 81dad5ccd..700c2dade 100644 --- a/parsec/utils/zone_malloc.c +++ b/parsec/utils/zone_malloc.c @@ -83,7 +83,10 @@ void *zone_malloc(zone_malloc_t *gdata, size_t size) current_segment = SEGMENT_AT_TID(gdata, current_tid); if( NULL == current_segment ) { /* Maybe there is a free slot in the beginning. 
Let's cycle at least once before we bail out */ - if( 0 != cycled_through ) break; + if( 0 != cycled_through ) { + parsec_atomic_unlock(&gdata->lock); + return NULL; + } current_tid = 0; cycled_through = 1; current_segment = SEGMENT_AT_TID(gdata, current_tid); diff --git a/parsec/utils/zone_malloc.h b/parsec/utils/zone_malloc.h index 3529834e0..6e012e3da 100644 --- a/parsec/utils/zone_malloc.h +++ b/parsec/utils/zone_malloc.h @@ -9,7 +9,6 @@ #include "parsec/parsec_config.h" #include "parsec/sys/atomic.h" - #include #include diff --git a/tests/apps/haar_tree/project_dyn.jdf b/tests/apps/haar_tree/project_dyn.jdf index a34a9cf80..878b74489 100644 --- a/tests/apps/haar_tree/project_dyn.jdf +++ b/tests/apps/haar_tree/project_dyn.jdf @@ -135,7 +135,7 @@ static int my_project_dyn_startup(parsec_execution_stream_t * es, __parsec_proje new_task->data._f_RL.data_out = NULL; new_task->data._f_NODE.source_repo = NULL; new_task->data._f_NODE.source_repo_entry = NULL; - chunk = parsec_arena_get_copy(__tp->super.arenas_datatypes[PARSEC_project_dyn_DEFAULT_ADT_IDX].arena, 1, 0, __tp->super.arenas_datatypes[PARSEC_project_dyn_DEFAULT_ADT_IDX].opaque_dtt); + chunk = parsec_arena_get_new_copy(__tp->super.arenas_datatypes[PARSEC_project_dyn_DEFAULT_ADT_IDX].arena, 1, 0, __tp->super.arenas_datatypes[PARSEC_project_dyn_DEFAULT_ADT_IDX].opaque_dtt); chunk->original->owner_device = 0; new_task->data._f_NODE.data_out = chunk; new_task->data._f_NODE.data_in = chunk; diff --git a/tests/runtime/cuda/rtt.jdf b/tests/runtime/cuda/rtt.jdf index 28fe93176..a7b3a6cdd 100644 --- a/tests/runtime/cuda/rtt.jdf +++ b/tests/runtime/cuda/rtt.jdf @@ -13,7 +13,7 @@ k = 0 .. NT-1 : A(0, k % WS) RW T <- (k == 0) ? A(0, k % WS) : T PING(k-1) - -> (k < NT) ? T PING(k+1) : A(0, k % WS) + -> (k < NT) ? 
T PING(k+1) ; 0 diff --git a/tests/runtime/cuda/rtt_main.c b/tests/runtime/cuda/rtt_main.c index e3c212907..0d2f5ae41 100644 --- a/tests/runtime/cuda/rtt_main.c +++ b/tests/runtime/cuda/rtt_main.c @@ -60,40 +60,15 @@ __parsec_rtt_destructor(parsec_rtt_taskpool_t *rtt_tp) PARSEC_OBJ_CLASS_INSTANCE(parsec_rtt_taskpool_t, parsec_taskpool_t, NULL, __parsec_rtt_destructor); -parsec_taskpool_t *rtt_New(parsec_context_t *ctx, size_t size, int roundtrips) +parsec_taskpool_t *rtt_New(parsec_context_t *ctx, parsec_matrix_block_cyclic_t *dcA, + parsec_datatype_t block, int roundtrips) { parsec_rtt_taskpool_t *tp = NULL; - parsec_datatype_t block; - size_t mb = sqrt(size), nb = size / mb; - - if (mb <= 0) { - fprintf(stderr, "To work, RTT must do at least one round time trip of at least one byte\n"); - return (parsec_taskpool_t *)tp; - } - - parsec_matrix_block_cyclic_t* dcA = (parsec_matrix_block_cyclic_t *)calloc(1, sizeof(parsec_matrix_block_cyclic_t)); - parsec_matrix_block_cyclic_init(dcA, PARSEC_MATRIX_BYTE, PARSEC_MATRIX_TILE, - ctx->my_rank, - mb, nb, - mb, ctx->nb_nodes * nb, - 0, 0, - mb, ctx->nb_nodes * nb, - 1, ctx->nb_nodes, 1, 1, - 0, 0); - dcA->mat = parsec_data_allocate((size_t)dcA->super.nb_local_tiles * - (size_t)dcA->super.bsiz * - (size_t)parsec_datadist_getsizeoftype(dcA->super.mtype)); - parsec_data_collection_set_key((parsec_data_collection_t *)dcA, "A"); - - /* Initialize and place the dcA */ - parsec_apply(ctx, PARSEC_MATRIX_FULL, - (parsec_tiled_matrix_t *)dcA, - (parsec_tiled_matrix_unary_op_t)matrix_init_ops, NULL); tp = parsec_rtt_new((parsec_data_collection_t*)dcA, roundtrips, ctx->nb_nodes); + tp->arenas_datatypes[PARSEC_rtt_DEFAULT_ADT_IDX].opaque_dtt = block; ptrdiff_t lb, extent; - parsec_type_create_contiguous(mb*nb, parsec_datatype_uint8_t, &block); parsec_type_extent(block, &lb, &extent); parsec_arena_datatype_construct(&tp->arenas_datatypes[PARSEC_rtt_DEFAULT_ADT_IDX], @@ -144,26 +119,33 @@ int main(int argc, char *argv[]) } argv[argc] = 
NULL; #if defined(DISTRIBUTED) + { + int provided; + MPI_Init_thread(NULL, NULL, MPI_THREAD_SERIALIZED, &provided); + } + MPI_Comm_size(MPI_COMM_WORLD, &size); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) - extern char **environ; - char *value; - asprintf(&value, "%d", nb_gpus); - parsec_setenv_mca_param("device_cuda_enabled", value, &environ); - free(value); - value = NULL; - if (0xFF != gpu_mask) { + if (0xFF == gpu_mask) { + extern char **environ; + MPI_Comm local_comm; + int local_rank, local_size; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, + MPI_INFO_NULL, &local_comm); + MPI_Comm_rank(local_comm, &local_rank); + MPI_Comm_size(local_comm, &local_size); + MPI_Comm_free(&local_comm); + int gpu_mask = 0; + for (int i = 0; i < nb_gpus; i++) { + gpu_mask |= ((1 << local_rank) << i); + } + char *value; asprintf(&value, "%d", gpu_mask); parsec_setenv_mca_param("device_cuda_mask", value, &environ); free(value); value = NULL; } #endif - { - int provided; - MPI_Init_thread(NULL, NULL, MPI_THREAD_SERIALIZED, &provided); - } - MPI_Comm_size(MPI_COMM_WORLD, &size); - MPI_Comm_rank(MPI_COMM_WORLD, &rank); #endif /* DISTRIBUTED */ if( 0 == rank ) { printf("Running %d tests of %d steps RTT with a data of size %zu\n", @@ -178,9 +160,6 @@ int main(int argc, char *argv[]) parsec_warning("This test can only run if at least one GPU device is present"); exit(-PARSEC_ERR_DEVICE); } - if( do_sleep ) { - sleep(do_sleep); - } cuda_device_index = (int *)malloc(parsec_nb_devices * sizeof(int)); cuda_device_index_len = 0; for (int dev = 0; dev < (int)parsec_nb_devices; dev++) { @@ -190,12 +169,44 @@ int main(int argc, char *argv[]) } } + parsec_datatype_t block; + size_t mb = sqrt(msg_size), nb = msg_size / mb; + if (mb <= 0) { + fprintf(stderr, "To work, RTT must do at least one round time trip of at least one byte\n"); + exit(-1); + } + + parsec_matrix_block_cyclic_t* dcA = (parsec_matrix_block_cyclic_t *)calloc(1, 
sizeof(parsec_matrix_block_cyclic_t)); + parsec_matrix_block_cyclic_init(dcA, PARSEC_MATRIX_BYTE, PARSEC_MATRIX_TILE, + parsec->my_rank, + mb, nb, + mb, parsec->nb_nodes * nb, + 0, 0, + mb, parsec->nb_nodes * nb, + 1, parsec->nb_nodes, 1, 1, + 0, 0); + dcA->mat = parsec_data_allocate((size_t)dcA->super.nb_local_tiles * + (size_t)dcA->super.bsiz * + (size_t)parsec_datadist_getsizeoftype(dcA->super.mtype)); + parsec_data_collection_set_key((parsec_data_collection_t *)dcA, "A"); + + parsec_type_create_contiguous(mb * nb, parsec_datatype_uint8_t, &block); + + /* Initialize and place the dcA */ + parsec_apply(parsec, PARSEC_MATRIX_FULL, + (parsec_tiled_matrix_t *)dcA, + (parsec_tiled_matrix_unary_op_t)matrix_init_ops, NULL); + + + if( do_sleep ) { + sleep(do_sleep); + } #if defined(PARSEC_HAVE_MPI) MPI_Barrier(MPI_COMM_WORLD); #endif /* defined(PARSEC_HAVE_MPI) */ gettimeofday(&tstart, NULL); for( int test_id = 0; test_id < nb_runs; test_id++ ) { - tp = rtt_New(parsec, msg_size, loops); + tp = rtt_New(parsec, dcA, block, loops); if( NULL != tp ) { parsec_context_add_taskpool(parsec, tp); parsec_context_start(parsec); From 0505b25128560109a9143e35bafd6c13b26a6e22 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 7 Aug 2024 19:56:30 -0700 Subject: [PATCH 04/18] Mostly improvement to the debuging output. Name the data_t allocated for temporaries allowing developers to track them through the execution. Add the keys to all outputs (tasks and copies). 
Signed-off-by: George Bosilca --- parsec/arena.c | 9 +++++++++ parsec/class/info.c | 2 ++ parsec/data.c | 5 +++-- parsec/mca/device/device_gpu.c | 28 +++++++++++++++------------- parsec/parsec.c | 14 ++++++++++++-- parsec/remote_dep_mpi.c | 1 + parsec/utils/debug.h | 2 +- parsec/utils/zone_malloc.c | 5 +---- 8 files changed, 44 insertions(+), 22 deletions(-) diff --git a/parsec/arena.c b/parsec/arena.c index 73428930e..3fc19f561 100644 --- a/parsec/arena.c +++ b/parsec/arena.c @@ -238,6 +238,10 @@ int parsec_arena_allocate_device_private(parsec_data_copy_t *copy, #include "parsec/utils/zone_malloc.h" #include "mca/device/device_gpu.h" +#if defined(PARSEC_DEBUG) +static int64_t parsec_countable_incoming_message = 0xF000000000000000; +#endif /* defined(PARSEC_DEBUG) */ + static inline parsec_data_copy_t * parsec_arena_internal_copy_new(parsec_arena_t *arena, parsec_data_t *data, @@ -251,6 +255,11 @@ parsec_arena_internal_copy_new(parsec_arena_t *arena, if( NULL == ldata ) { return NULL; } +#if defined(PARSEC_DEBUG) + /* Name the data with a default key to facilitate debuging */ + ldata->key = (uint64_t)parsec_atomic_fetch_inc_int64(&parsec_countable_incoming_message); + ldata->key |= ((uint64_t)device) << 56; +#endif /* defined(PARSEC_DEBUG) */ } if( 0 == device ) { copy = parsec_data_copy_new(ldata, device, dtt, diff --git a/parsec/class/info.c b/parsec/class/info.c index 789f77f0d..4afbd6d38 100644 --- a/parsec/class/info.c +++ b/parsec/class/info.c @@ -313,6 +313,8 @@ void *parsec_info_get(parsec_info_object_array_t *oa, parsec_info_id_t iid) if(NULL == ie->constructor) return ret; nio = ie->constructor(oa->cons_obj, ie->cons_data); + if( NULL == nio ) + return ret; ret = parsec_info_test_and_set(oa, iid, nio, NULL); if(ret != nio && NULL != ie->destructor) { ie->destructor(nio, ie->des_data); diff --git a/parsec/data.c b/parsec/data.c index 4361e3b7e..b04dde7ae 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -471,10 +471,11 @@ void 
parsec_data_copy_dump(parsec_data_copy_t* copy) if( PARSEC_DATA_COHERENCY_SHARED == copy->coherency_state ) coherency = "shared"; parsec_debug_verbose(0, 0, "%s [%d]: copy %p [ref %d] coherency %s readers %d version %u transit %s flags %s\n" - " older %p orig %p arena %p dev_priv %p\n", + " older %p orig %p [%llx] arena %p dev_priv %p\n", ((NULL != copy->original) && (copy->original->owner_device == copy->device_index)) ? "*" : " ", (int)copy->device_index, copy, copy->super.super.obj_reference_count, coherency, copy->readers, copy->version, tranfer, flags, - (void *)copy->older, (void *)copy->original, (void *)copy->arena_chunk, copy->device_private); + (void *)copy->older, (void *)copy->original, + (NULL != copy->original) ? (unsigned long)copy->original->key : (unsigned long)-1, (void *)copy->arena_chunk, copy->device_private); } void parsec_data_dump(parsec_data_t* data) diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 2fde0d7d0..7a7e1fa1c 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -301,7 +301,7 @@ void parsec_device_dump_exec_stream(parsec_gpu_exec_stream_t* exec_stream) int i; parsec_debug_verbose(0, parsec_gpu_output_stream, - "Dev: GPU stream %d{%p} [events = %d, start = %d, end = %d, executed = %d]", + "Dev: GPU stream %s{%p} [events = %d, start = %d, end = %d, executed = %d]", exec_stream->name, exec_stream, exec_stream->max_events, exec_stream->start, exec_stream->end, exec_stream->executed); for( i = 0; i < exec_stream->max_events; i++ ) { @@ -322,12 +322,12 @@ void parsec_device_dump_gpu_state(parsec_device_gpu_module_t* gpu_device) data_in_dev += gpu_device->super.data_in_from_device[i]; } - parsec_output(parsec_gpu_output_stream, "\n\n"); - parsec_output(parsec_gpu_output_stream, "Device %d:%d (%p) epoch\n", gpu_device->super.device_index, - gpu_device->super.device_index, gpu_device, gpu_device->data_avail_epoch); - parsec_output(parsec_gpu_output_stream, "\tpeer mask %x 
executed tasks with %llu streams %d\n", - gpu_device->peer_access_mask, (unsigned long long)gpu_device->super.executed_tasks, gpu_device->num_exec_streams); - parsec_output(parsec_gpu_output_stream, "\tstats transferred [in: %llu from host %llu from other device out: %llu] required [in: %llu out: %llu]\n", + parsec_output(parsec_gpu_output_stream, + "\n\nDevice %s:%d (%p) epoch %zu\n" + "\tpeer mask %x executed tasks %llu streams %d\n" + "\tstats transferred [in: %llu from host %llu from other device out: %llu] required [in: %llu out: %llu]\n", + gpu_device->super.name, gpu_device->super.device_index, gpu_device, gpu_device->data_avail_epoch, + gpu_device->peer_access_mask, (unsigned long long)gpu_device->super.executed_tasks, gpu_device->num_exec_streams, (unsigned long long)data_in_host, (unsigned long long)data_in_dev, (unsigned long long)gpu_device->super.data_out_to_host, (unsigned long long)gpu_device->super.required_data_in, (unsigned long long)gpu_device->super.required_data_out); @@ -1051,7 +1051,7 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, for( j = 0; j <= i; j++ ) { /* This flow could be a control flow */ if( NULL == temp_loc[j] ) continue; - this_task->data[j].data_out = gpu_elem; /* reset the data out */ + this_task->data[j].data_out = NULL; /* reset the data out */ /* This flow could be non-parsec-owned, in which case we can't reclaim it */ if( 0 == (temp_loc[j]->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) ) continue; PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, @@ -1065,7 +1065,10 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device, PARSEC_DATA_COPY_RELEASE(gpu_elem); #endif parsec_atomic_unlock(&master->lock); - return PARSEC_HOOK_RETURN_NEXT; + if( data_avail_epoch ) { /* update the memory epoch */ + gpu_device->data_avail_epoch++; + } + return PARSEC_HOOK_RETURN_AGAIN; } PARSEC_LIST_ITEM_SINGLETON(lru_gpu_elem); @@ -1409,7 +1412,7 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* 
gpu_device, "GPU[%d:%s]: Prefetch task %p is staging in", gpu_device->super.device_index, gpu_device->super.name, gpu_task); } - if( NULL == gpu_elem ) { + if( gpu_elem == candidate ) { /* data already located in the right place */ if( candidate->device_index == gpu_device->super.device_index ) { /* the candidate is already located on the GPU, no transfer should be necessary but let's do the bookkeeping */ if( (PARSEC_FLOW_ACCESS_WRITE & type) && (gpu_task->task_type != PARSEC_GPU_TASK_TYPE_PREFETCH) ) { @@ -2182,7 +2185,7 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, gpu_task->last_data_check_epoch = gpu_device->data_avail_epoch; return ret; } - + gpu_task->last_status = 0; /* mark the task as clean */ for( i = 0; i < this_task->task_class->nb_flows; i++ ) { flow = gpu_task->flow[i]; @@ -2211,11 +2214,10 @@ parsec_device_kernel_push( parsec_device_gpu_module_t *gpu_device, return ret; } } - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: Push task %s DONE", gpu_device->super.device_index, gpu_device->super.name, - parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) ); + parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task)); gpu_task->complete_stage = parsec_device_callback_complete_push; #if defined(PARSEC_PROF_TRACE) gpu_task->prof_key_end = -1; /* We do not log that event as the completion of this task */ diff --git a/parsec/parsec.c b/parsec/parsec.c index 387ea9f88..68980f9ad 100644 --- a/parsec/parsec.c +++ b/parsec/parsec.c @@ -1943,10 +1943,20 @@ parsec_task_snprintf( char* str, size_t size, task->locals[i].value ); if( index >= size ) return str; } - index += snprintf(str + index, size - index, "]<%d>", task->priority ); + index += snprintf(str + index, size - index, "]<%d> keys = {", task->priority ); + if( index >= size ) return str; + for( i = 0; i < tc->nb_flows; i++ ) { + char *prefix = (i == 0) ? 
"" : ", "; + if ((NULL == task->data[i].data_in) || (NULL == task->data[i].data_in->original)) + index += snprintf(str + index, size - index, "%s*", prefix); + else + index += snprintf(str + index, size - index, "%s%lx", prefix, task->data[i].data_in->original->key); + if( index >= size ) return str; + } + index += snprintf(str + index, size - index, "}" ); if( index >= size ) return str; if( NULL != task->taskpool ) { - index += snprintf(str + index, size - index, "{%u}", task->taskpool->taskpool_id ); + index += snprintf(str + index, size - index, " {tp: %u}", task->taskpool->taskpool_id ); if( index >= size ) return str; } return str; diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 7c6aba035..b513b4df9 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -80,6 +80,7 @@ remote_dep_cmd_to_string(remote_dep_wire_activate_t* origin, if( NULL == task.task_class ) return snprintf(str, len, "UNKNOWN_of_TASKCLASS_%d", origin->task_class_id), str; memcpy(&task.locals, origin->locals, sizeof(parsec_assignment_t) * task.task_class->nb_locals); task.priority = 0xFFFFFFFF; + for(int i = 0; i < task.task_class->nb_flows; task.data[i++].data_in = NULL); return parsec_task_snprintf(str, len, &task); } diff --git a/parsec/utils/debug.h b/parsec/utils/debug.h index 6f3e53231..80290ae8b 100644 --- a/parsec/utils/debug.h +++ b/parsec/utils/debug.h @@ -160,7 +160,7 @@ extern void (*parsec_weaksym_exit)(int status); #else /* defined(PARSEC_DEBUG_NOISIER) */ #define PARSEC_DEBUG_VERBOSE(...) do{} while(0) -#endif /* defined(PARSEC_DEBUG_VERBOSE) */ +#endif /* defined(PARSEC_DEBUG_NOISIER) */ /** $brief To check if any parsec function returned error. 
*/ diff --git a/parsec/utils/zone_malloc.c b/parsec/utils/zone_malloc.c index 700c2dade..81dad5ccd 100644 --- a/parsec/utils/zone_malloc.c +++ b/parsec/utils/zone_malloc.c @@ -83,10 +83,7 @@ void *zone_malloc(zone_malloc_t *gdata, size_t size) current_segment = SEGMENT_AT_TID(gdata, current_tid); if( NULL == current_segment ) { /* Maybe there is a free slot in the beginning. Let's cycle at least once before we bail out */ - if( 0 != cycled_through ) { - parsec_atomic_unlock(&gdata->lock); - return NULL; - } + if( 0 != cycled_through ) break; current_tid = 0; cycled_through = 1; current_segment = SEGMENT_AT_TID(gdata, current_tid); From 9f0c172c5801d0839123a95fa42a63f2c621c308 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Wed, 2 Oct 2024 17:40:40 -0400 Subject: [PATCH 05/18] Correctly initialize an unlock atomic lock. Signed-off-by: George Bosilca --- parsec/utils/zone_malloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/parsec/utils/zone_malloc.c b/parsec/utils/zone_malloc.c index 81dad5ccd..ca78c546d 100644 --- a/parsec/utils/zone_malloc.c +++ b/parsec/utils/zone_malloc.c @@ -33,6 +33,7 @@ zone_malloc_t* zone_malloc_init(void* base_ptr, int _max_segment, size_t _unit_s zone_malloc_error("Cannot manage an empty memory region\n"); return NULL; } + parsec_atomic_lock_t temp_unlock = PARSEC_ATOMIC_UNLOCKED; gdata = (zone_malloc_t*)malloc( sizeof(zone_malloc_t) ); gdata->base = base_ptr; From 1468d6ac3afe2a7caa6947684995ef00cf6d4d40 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Fri, 11 Oct 2024 13:51:23 -0400 Subject: [PATCH 06/18] gpu-datain: when we are executing a CPU body we need to retrieve the CPU copy if we are passed-in a GPU copy, and we need to retain/release the copies that we are swapping --- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 2 +- parsec/mca/device/device_gpu.c | 2 +- parsec/parsec.c | 13 +++++--- parsec/scheduling.c | 38 +++++++++++++++++++--- 4 files changed, 45 insertions(+), 10 deletions(-) diff --git 
a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index 7005b963e..c0867495e 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -4467,7 +4467,7 @@ static void jdf_generate_startup_hook( const jdf_t *jdf ) " if(NULL == device) continue;\n" " if(NULL != device->taskpool_register)\n" " if( PARSEC_SUCCESS != device->taskpool_register(device, (parsec_taskpool_t*)__parsec_tp) ) {\n" - " parsec_debug_verbose(5, parsec_debug_output, \"Device %%s refused to register taskpool %%p\", device->name, __parsec_tp);\n" + " parsec_debug_verbose(10, parsec_debug_output, \"Device %%s refused to register taskpool %%p\", device->name, __parsec_tp);\n" " __parsec_tp->super.super.devices_index_mask &= ~(1 << device->device_index);\n" " continue;\n" " }\n" diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index 7a7e1fa1c..f67b42dca 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -2456,7 +2456,7 @@ parsec_device_kernel_pop( parsec_device_gpu_module_t *gpu_device, PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: Pop %s DONE (return %d data epoch %"PRIu64")", gpu_device->super.device_index, gpu_device->super.name, - parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task), return_code, gpu_device->data_avail_epoch ); + parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task), return_code < 0 ? return_code: how_many, gpu_device->data_avail_epoch ); return (return_code < 0 ? 
return_code : how_many); } diff --git a/parsec/parsec.c b/parsec/parsec.c index 68980f9ad..5b35ac7ba 100644 --- a/parsec/parsec.c +++ b/parsec/parsec.c @@ -215,6 +215,10 @@ static void __parsec_task_constructor(parsec_task_t* task) { task->selected_chore = -1; task->load = 0; task->status = PARSEC_TASK_STATUS_NONE; +#if defined(PARSEC_DEBUG_NOISIER) + /* used during task_snprintf for non-fully initialized task_t */ + memset(&task->data, 0, MAX_PARAM_COUNT * sizeof(parsec_data_pair_t)); +#endif } /* @@ -1226,7 +1230,7 @@ int parsec_fini( parsec_context_t** pcontext ) #endif /* PARSEC_PROF_TRACE */ /* PAPI SDE needs to process the shutdown before resources exposed to it are freed. - * This includes scheduling resources, so SDE needs to be finalized before the + * This includes scheduling resources, so SDE needs to be finalized before the * computation threads leave */ PARSEC_PAPI_SDE_FINI(); @@ -1553,7 +1557,7 @@ parsec_update_deps_with_counter(parsec_taskpool_t *tp, (void)origin; (void)origin_flow; (void)dest_flow; - + if( 0 == *deps ) { dep_new_value = parsec_check_IN_dependencies_with_counter(tp, task) - 1; if( parsec_atomic_cas_int32( deps, 0, dep_new_value ) == 1 ) @@ -1716,7 +1720,7 @@ parsec_release_local_OUT_dependencies(parsec_execution_stream_t* es, PARSEC_COPY_EXECUTION_CONTEXT(new_context, task); PARSEC_AYU_ADD_TASK(new_context); - PARSEC_DEBUG_VERBOSE(6, parsec_debug_output, + PARSEC_DEBUG_VERBOSE(7, parsec_debug_output, "%s becomes ready from %s on thread %d:%d, with mask 0x%04x", tmp1, parsec_task_snprintf(tmp2, MAX_TASK_STRLEN, origin), @@ -2095,6 +2099,7 @@ int parsec_taskpool_reserve_id( parsec_taskpool_t* tp ) tp->taskpool_id = idx; assert( NOTASKPOOL == taskpool_array[idx] ); parsec_atomic_unlock( &taskpool_array_lock ); + PARSEC_DEBUG_VERBOSE(5, parsec_debug_output, "Taskpool %s received id %d", tp->taskpool_name, tp->taskpool_id); return idx; } @@ -2217,7 +2222,7 @@ int parsec_taskpool_enable(parsec_taskpool_t* tp, } if( 0 != distributed ) { - 
PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Register a new taskpool %p: %d with the comm engine", tp, tp->taskpool_id); + PARSEC_DEBUG_VERBOSE(10, parsec_debug_output, "Register a new taskpool %s: %d with the comm engine", tp->taskpool_name, tp->taskpool_id); (void)parsec_remote_dep_new_taskpool(tp); } return PARSEC_HOOK_RETURN_DONE; diff --git a/parsec/scheduling.c b/parsec/scheduling.c index e665173e7..5df7a10b9 100644 --- a/parsec/scheduling.c +++ b/parsec/scheduling.c @@ -141,11 +141,41 @@ int __parsec_execute( parsec_execution_stream_t* es, * If we run get_best_device, the caller core is available to run a task, so directly using time_estimate with a 0 base is accurate. */ parsec_atomic_fetch_add_int64(&task->selected_device->device_load, task->load); } + else { + /* for every flow, check if we received a GPU copy, and revert to the + * CPU copy since this is a CPU hook. Note: at this point copy versions + * should be synchronized already from a pushout. */ + for( int i = 0; i < task->task_class->nb_flows; i++ ) { + /* Make sure data_in is not NULL */ + if( NULL == task->data[i].data_in ) continue; + if( PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & task->task_class->in[i]->flow_flags) ) continue; /* control flow */ + + parsec_data_copy_t* copy = task->data[i].data_in; + if(parsec_mca_device_is_gpu(copy->device_index)) { + assert(copy->coherency_state == PARSEC_DATA_COHERENCY_SHARED); + assert(NULL != copy->original && NULL != copy->original->device_copies[0]); + assert(copy->version == copy->original->device_copies[0]->version); + task->data[i].data_in = copy->original->device_copies[0]; + PARSEC_DATA_COPY_RETAIN(task->data[i].data_in); + PARSEC_DATA_COPY_RELEASE(copy); + } + } + } - PARSEC_DEBUG_VERBOSE(5, parsec_debug_output, "Thread %d of VP %d Execute %s chore %d device %d:%s", - es->th_id, es->virtual_process->vp_id, - tmp, task->selected_chore, - task->selected_device->device_index, task->selected_device->name); +#if 
defined(PARSEC_DEBUG_NOISIER) + char tmp2[4096]; int len = 0; + len += snprintf(&tmp2[len], 4096-len, "Thread %2d:%d\tExecute %s\tchore[%d] dev %d:%s", + es->th_id, es->virtual_process->vp_id, + tmp, task->selected_chore, + task->selected_device->device_index, task->selected_device->name); + for( int i = 0; parsec_debug_verbose > 5 && i < task->task_class->nb_flows; i++ ) { + parsec_data_copy_t* copy = task->data[i].data_in; + if( NULL == copy ) continue; /* Make sure data_in is not NULL */ + if( PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & task->task_class->in[i]->flow_flags) ) continue; /* control flow */ + len += snprintf(&tmp2[len], 4096-len, "\n\t Data[%d] key %lx\ton dev:%d version:%d readers:%d ptr:%p", i, copy->original->key, copy->device_index, copy->version, copy->readers, copy->device_private); + } + PARSEC_DEBUG_VERBOSE(5, parsec_debug_output, "%s", tmp2); +#endif parsec_hook_t *hook = tc->incarnations[task->selected_chore].hook; assert( NULL != hook ); From f3c9a0d7617efcca0824e5974469d031c981bb8d Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Fri, 11 Oct 2024 13:52:34 -0400 Subject: [PATCH 07/18] gpu-datain: pure distribution collections have no data_of/data_of_key --- parsec/remote_dep_mpi.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index b513b4df9..7b8ff7464 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -827,9 +827,11 @@ remote_dep_mpi_retrieve_datatype(parsec_execution_stream_t *eu, if (NULL != fct->data_affinity ) { parsec_data_ref_t dref; fct->data_affinity(newcontext, &dref); - parsec_data_t* data = dref.dc->data_of_key(dref.dc, dref.key); - output->data.preferred_device = (-1 != data->preferred_device) ? - data->preferred_device : data->owner_device; + if(NULL != dref.dc->data_of_key) { + parsec_data_t* data = dref.dc->data_of_key(dref.dc, dref.key); + output->data.preferred_device = (-1 != data->preferred_device) ? 
+ data->preferred_device : data->owner_device; + } } return PARSEC_ITERATE_CONTINUE; } From 30742bcb51ffe35e768adfadea7bf0c3aa45dae6 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Mon, 14 Oct 2024 13:14:30 -0400 Subject: [PATCH 08/18] Use lock initializers instead of static temps --- parsec/utils/zone_malloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/parsec/utils/zone_malloc.c b/parsec/utils/zone_malloc.c index ca78c546d..81dad5ccd 100644 --- a/parsec/utils/zone_malloc.c +++ b/parsec/utils/zone_malloc.c @@ -33,7 +33,6 @@ zone_malloc_t* zone_malloc_init(void* base_ptr, int _max_segment, size_t _unit_s zone_malloc_error("Cannot manage an empty memory region\n"); return NULL; } - parsec_atomic_lock_t temp_unlock = PARSEC_ATOMIC_UNLOCKED; gdata = (zone_malloc_t*)malloc( sizeof(zone_malloc_t) ); gdata->base = base_ptr; From d49fe0ed4076b15260285b86148ead212d115008 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Wed, 16 Oct 2024 11:35:04 -0400 Subject: [PATCH 09/18] gpu-datain: when the RO data-in is from D2D we need to increase its readers --- parsec/mca/device/device_gpu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index f67b42dca..fe16e759d 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -1506,10 +1506,13 @@ parsec_device_data_stage_in( parsec_device_gpu_module_t* gpu_device, gpu_device->super.device_index, gpu_device->super.name, task_data->data_in, task_data->data_in->super.super.obj_reference_count, original); if( gpu_device->super.type == candidate_dev->super.type ) { if( gpu_device->peer_access_mask & (1 << candidate_dev->super.device_index) ) { - /* We can directly do D2D, so let's skip the selection */ - PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream, - "GPU[%d:%s]:\tskipping candidate lookup: data_in copy %p on %s has PEER ACCESS", - gpu_device->super.device_index, gpu_device->super.name, 
task_data->data_in, candidate_dev->super.name); + /* We have a candidate for the d2d transfer. */ + int readers = parsec_atomic_fetch_inc_int32(&candidate->readers); + /* the data-in should not be in the lru so we can skip the coordination protocol with the owner of the candidate being repurposed. */ + assert( readers >= 0 /* repurposed! */); (void)readers; + PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, + "GPU[%d:%s]:\tData copy %p [ref_count %d] on PaRSEC device %s is the data_in candidate and has PEER ACCESS; increasing its readers to %d", + gpu_device->super.device_index, gpu_device->super.name, candidate, candidate->super.super.obj_reference_count, candidate_dev->super.name, candidate->readers+1); goto src_selected; } } From 7c2535189ebe88c2f0c9f1d23c3f07baa6bb299a Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Wed, 16 Oct 2024 18:45:04 -0400 Subject: [PATCH 10/18] gpu-datain: add a configure option and change some namings --- configure | 8 ++++++-- parsec/data.c | 5 +---- parsec/include/parsec/sys/atomic-c11.h | 2 +- parsec/mca/device/device_gpu.c | 2 +- parsec/parsec_internal.h | 7 +++++-- parsec/remote_dep_mpi.c | 2 +- 6 files changed, 15 insertions(+), 11 deletions(-) diff --git a/configure b/configure index e9c05ca8f..e23f88ebc 100755 --- a/configure +++ b/configure @@ -103,7 +103,8 @@ cat <super.obj_reference_count != data->nb_copies) return 0; parsec_data_copy_t *copy; - PARSEC_DEBUG_VERBOSE(1, parsec_debug_output, "Examine the status of data %p with %d copies and refcounts at %s:%d\n", - data, data->nb_copies, __FILE__, __LINE__); /* this data is only referenced by it's own copies. If these copies are also only referenced by * data, then we can release them all. 
*/ @@ -607,8 +605,7 @@ int parsec_data_release_self_contained_data(parsec_data_t *data) if( copy->super.super.obj_reference_count > 1 ) return 0; } - PARSEC_DEBUG_VERBOSE(1, parsec_debug_output, "Force the release of data %p at %s:%d", - copy, __FILE__, __LINE__); + PARSEC_DEBUG_VERBOSE(90, parsec_debug_output, "Release copy %p from self-contained data %p", copy, data); for( uint32_t i = 0; i < parsec_nb_devices; i++) { if (NULL == (copy = data->device_copies[i])) continue; assert(1 == copy->super.super.obj_reference_count); diff --git a/parsec/include/parsec/sys/atomic-c11.h b/parsec/include/parsec/sys/atomic-c11.h index 000825c89..c507d4c00 100644 --- a/parsec/include/parsec/sys/atomic-c11.h +++ b/parsec/include/parsec/sys/atomic-c11.h @@ -191,7 +191,7 @@ static_assert(sizeof(volatile int) >= sizeof(volatile atomic_flag), "The type size for atomic_flag is larger than expected. Please report this error to PaRSEC developpers." "You may compile without C11 atomic support (-DSUPPORT_C11=OFF in cmake) to fallback on other atomic types."); -# define PARSEC_ATOMIC_HAS_ATOMIC_INIT +#define PARSEC_ATOMIC_HAS_ATOMIC_INIT ATOMIC_STATIC_INLINE void parsec_atomic_lock_init( parsec_atomic_lock_t* atomic_lock ) { diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index fe16e759d..b89f3ba4d 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -2547,7 +2547,7 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; assert(PARSEC_DATA_STATUS_UNDER_TRANSFER == cpu_copy->data_transfer_status); cpu_copy->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; - if( 0 == (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY) ) { + if( 0 == (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_SEND_GPU_MEMORY) ) { /* Report the CPU copy as the output of the task. 
*/ this_task->data[i].data_out = cpu_copy; } diff --git a/parsec/parsec_internal.h b/parsec/parsec_internal.h index cbeb5695a..b836b67f4 100644 --- a/parsec/parsec_internal.h +++ b/parsec/parsec_internal.h @@ -201,8 +201,11 @@ PARSEC_DECLSPEC PARSEC_OBJ_CLASS_DECLARATION(parsec_taskpool_t); #define PARSEC_DEPENDENCIES_STARTUP_TASK ((parsec_dependency_t)(1<<29)) #define PARSEC_DEPENDENCIES_BITMASK (~(PARSEC_DEPENDENCIES_TASK_DONE|PARSEC_DEPENDENCIES_IN_DONE|PARSEC_DEPENDENCIES_STARTUP_TASK)) -#define PARSEC_RUNTIME_SEND_FROM_GPU_MEMORY 0x00000002 -#define PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY 0x00000001 +/* Mask denoting if we can send and receive communication (e.g., MPI) directly into + * GPU buffers. + */ +#define PARSEC_RUNTIME_SEND_GPU_MEMORY 0x00000002 +#define PARSEC_RUNTIME_RECV_GPU_MEMORY 0x00000001 /** * This structure is used internally by the parsec_dependencies_t structures diff --git a/parsec/remote_dep_mpi.c b/parsec/remote_dep_mpi.c index 7b8ff7464..a776fa410 100644 --- a/parsec/remote_dep_mpi.c +++ b/parsec/remote_dep_mpi.c @@ -2112,7 +2112,7 @@ static void remote_dep_mpi_get_start(parsec_execution_stream_t* es, /* prepare the local receiving data */ assert(NULL == deps->output[k].data.data); /* we do not support in-place tiles now, make sure it doesn't happen yet */ if(NULL == deps->output[k].data.data) { - int best_device = (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_RECV_FROM_GPU_MEMORY) ? deps->output[k].data.preferred_device : 0; + int best_device = (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_RECV_GPU_MEMORY) ? 
deps->output[k].data.preferred_device : 0; deps->output[k].data.data = remote_dep_copy_allocate(&deps->output[k].data.remote, best_device); } /* Mark the data under tranfer */ From 320d62b857d9b53519c1bd172668bed16c0d5403 Mon Sep 17 00:00:00 2001 From: Thomas Heraul Date: Wed, 30 Oct 2024 19:25:54 +0000 Subject: [PATCH 11/18] When managing CPU-only tasks that received a GPU data copy, skip output-only flows, for which checking if they are control flows segfaults --- parsec/scheduling.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parsec/scheduling.c b/parsec/scheduling.c index 5df7a10b9..29fdd9284 100644 --- a/parsec/scheduling.c +++ b/parsec/scheduling.c @@ -142,10 +142,12 @@ int __parsec_execute( parsec_execution_stream_t* es, parsec_atomic_fetch_add_int64(&task->selected_device->device_load, task->load); } else { - /* for every flow, check if we received a GPU copy, and revert to the + /* For every input flow, check if we received a GPU copy, and revert to the * CPU copy since this is a CPU hook. Note: at this point copy versions * should be synchronized already from a pushout. 
*/ for( int i = 0; i < task->task_class->nb_flows; i++ ) { + /* Ignore output-only flows */ + if( NULL == task->task_class->in[i] ) continue; /* Make sure data_in is not NULL */ if( NULL == task->data[i].data_in ) continue; if( PARSEC_FLOW_ACCESS_NONE == (PARSEC_FLOW_ACCESS_MASK & task->task_class->in[i]->flow_flags) ) continue; /* control flow */ From 87ca1b2a77b5ed93e31f3933343c4e23851370e1 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Mon, 27 Jan 2025 16:36:12 -0500 Subject: [PATCH 12/18] mpigpu: fix a number of crashes in release_self_contained_data --- parsec/data.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/parsec/data.c b/parsec/data.c index f0a2a0a25..db7dda41e 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012-2024 The University of Tennessee and The University + * Copyright (c) 2012-2025 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. */ @@ -595,20 +595,21 @@ parsec_data_destroy( parsec_data_t *data ) int parsec_data_release_self_contained_data(parsec_data_t *data) { - if (data->super.obj_reference_count != data->nb_copies) return 0; - parsec_data_copy_t *copy; + int32_t nb_copies = data->nb_copies; + if (data->super.obj_reference_count != nb_copies) return 0; + parsec_data_copy_t *copy = NULL; /* this data is only referenced by it's own copies. If these copies are also only referenced by * data, then we can release them all. 
*/ for( uint32_t i = 0; i < parsec_nb_devices; i++) { if (NULL == (copy = data->device_copies[i])) continue; - if( copy->super.super.obj_reference_count > 1 ) + if( copy->super.super.obj_reference_count > 1 || copy->readers > 0 ) return 0; } PARSEC_DEBUG_VERBOSE(90, parsec_debug_output, "Release copy %p from self-contained data %p", copy, data); for( uint32_t i = 0; i < parsec_nb_devices; i++) { if (NULL == (copy = data->device_copies[i])) continue; - assert(1 == copy->super.super.obj_reference_count); + assert(1 == copy->super.super.obj_reference_count && 0 == copy->readers); if( 0 == copy->device_index ) { PARSEC_OBJ_RELEASE(copy); assert(NULL == copy); @@ -622,7 +623,9 @@ int parsec_data_release_self_contained_data(parsec_data_t *data) copy->device_private = NULL; copy->arena_chunk = NULL; } + if (0 == --nb_copies) return 1; /* we deallocate the data_t during the copy_release/detach of the last copy, so we need to stop now */ } - return 1; + parsec_warning("Release copy %p from self-contained data %p had %d more copies than present in device_copies", copy, data, nb_copies); + return 0; } From 7d2941ab8a21cc5b29b5f1cf2ca35c0c78ed3132 Mon Sep 17 00:00:00 2001 From: Aurelien Bouteiller Date: Wed, 26 Feb 2025 14:20:31 -0500 Subject: [PATCH 13/18] testing for dev_index==0 is not always true, use the mca_device_is_gpu accessor --- parsec/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parsec/data.c b/parsec/data.c index db7dda41e..6b2bf9027 100644 --- a/parsec/data.c +++ b/parsec/data.c @@ -610,7 +610,7 @@ int parsec_data_release_self_contained_data(parsec_data_t *data) for( uint32_t i = 0; i < parsec_nb_devices; i++) { if (NULL == (copy = data->device_copies[i])) continue; assert(1 == copy->super.super.obj_reference_count && 0 == copy->readers); - if( 0 == copy->device_index ) { + if (!parsec_mca_device_is_gpu(copy->device_index)) { PARSEC_OBJ_RELEASE(copy); assert(NULL == copy); } else { From 5edcf6044168919ed80538823041730ea84dceb4 Mon Sep 
17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 27 Mar 2025 11:44:10 -0400 Subject: [PATCH 14/18] Use PRIx64 when printing a parsec_data_key_t Mismatch in printf type sizes might lead to segmentation faults. Signed-off-by: Joseph Schuchart --- parsec/parsec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parsec/parsec.c b/parsec/parsec.c index 5b35ac7ba..2f8d3252e 100644 --- a/parsec/parsec.c +++ b/parsec/parsec.c @@ -15,6 +15,7 @@ #include #include #include +#include #if defined(PARSEC_HAVE_GEN_H) #include #endif /* defined(PARSEC_HAVE_GEN_H) */ @@ -1954,7 +1955,7 @@ parsec_task_snprintf( char* str, size_t size, if ((NULL == task->data[i].data_in) || (NULL == task->data[i].data_in->original)) index += snprintf(str + index, size - index, "%s*", prefix); else - index += snprintf(str + index, size - index, "%s%lx", prefix, task->data[i].data_in->original->key); + index += snprintf(str + index, size - index, "%s%"PRIx64, prefix, task->data[i].data_in->original->key); if( index >= size ) return str; } index += snprintf(str + index, size - index, "}" ); From 237b9d2abef34659690f38a579b69ddc95aeb9ac Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 13 Feb 2026 07:06:58 +0200 Subject: [PATCH 15/18] Add support for CUDA 13. Signed-off-by: George Bosilca --- parsec/mca/device/cuda/device_cuda_module.c | 25 ++++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/parsec/mca/device/cuda/device_cuda_module.c b/parsec/mca/device/cuda/device_cuda_module.c index 9120bf027..c74c3f7cb 100644 --- a/parsec/mca/device/cuda/device_cuda_module.c +++ b/parsec/mca/device/cuda/device_cuda_module.c @@ -2,7 +2,7 @@ * Copyright (c) 2010-2025 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. 
*/ #include "parsec/parsec_config.h" @@ -411,9 +411,8 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) parsec_device_gpu_module_t* gpu_device; parsec_device_module_t* device; cudaError_t cudastatus; - int show_caps_index, show_caps = 0, j, k; + int show_caps_index, show_caps = 0, j, k, freqHz; char *szName; - uint64_t freqHz; double fp16, fp32, fp64, tf32; struct cudaDeviceProp prop; @@ -431,10 +430,16 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) szName = prop.name; major = prop.major; minor = prop.minor; - freqHz = prop.clockRate * 1000; /* clockRate is in KHz */ + /* up to CUDA 12.3, clockRate is in KHz */ +#if CUDART_VERSION >= 12030 + cudaDeviceGetAttribute(&freqHz, cudaDevAttrClockRate, dev_id); + computemode = cudaComputeModeDefault; /* default value until I figure out how to retrieve it */ +#else + freqHz = (int)prop.clockRate * 1000; /* clockRate is in KHz */ + computemode = prop.computeMode; +#endif concurrency = prop.concurrentKernels; streaming_multiprocessor = prop.multiProcessorCount; - computemode = prop.computeMode; // We use calloc because we need some fields to be zero-initialized to ensure graceful handling of errors cuda_device = (parsec_device_cuda_module_t*)calloc(1, sizeof(parsec_device_cuda_module_t)); @@ -587,6 +592,14 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) } if( show_caps ) { + int memoryClockRateKHz, memoryBusWidth; +#if CUDART_VERSION >= 12030 + cudaDeviceGetAttribute(&memoryClockRateKHz, cudaDevAttrMemoryClockRate, dev_id); + cudaDeviceGetAttribute(&memoryBusWidth, cudaDevAttrGlobalMemoryBusWidth, dev_id); +#else + memoryClockRateKHz = (int)prop.memoryClockRate; + memoryBusWidth = prop.memoryBusWidth; +#endif parsec_inform("Dev GPU %10s : %s %.0fGB [pci %x:%x.%x]\n" "\tFrequency (GHz) : %.2f\t[SM: %d | Capabilities: %d.%d | Concurency %s | ComputeMode %d]\n" "\tPeak Tflop/s %-5s : fp64: %-8.3f fp32: %-8.3f fp16: %-8.3f tf32: %-8.3f\n" @@ -595,7 
+608,7 @@ parsec_cuda_module_init( int dev_id, parsec_device_module_t** module ) freqHz*1e-9f, streaming_multiprocessor, cuda_device->major, cuda_device->minor, (concurrency == 1)? "yes": "no", computemode, device->gflops_guess? "GUESS": "", fp64*1e-3, fp32*1e-3, fp16*1e-3, tf32*1e-3, - 2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6, prop.memoryClockRate*1e-6, prop.memoryBusWidth, gpu_device->mem_block_size*gpu_device->mem_nb_blocks/1024.f/1024.f/1024.f); + 2.0*memoryClockRateKHz*(memoryBusWidth/8)/1.0e6, memoryClockRateKHz*1e-6, memoryBusWidth, gpu_device->mem_block_size*gpu_device->mem_nb_blocks/1024.f/1024.f/1024.f); } *module = device; From 45e0618cd75988e6aa7e32f3e91b40ca7f5d2ff4 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 13 Feb 2026 07:07:44 +0200 Subject: [PATCH 16/18] Allow command line arguments for the stress test. Add missing dependency library. Signed-off-by: George Bosilca --- tests/runtime/cuda/CMakeLists.txt | 1 + tests/runtime/cuda/stress_main.c | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/tests/runtime/cuda/CMakeLists.txt b/tests/runtime/cuda/CMakeLists.txt index fdfbf9f89..1f79f5e9b 100644 --- a/tests/runtime/cuda/CMakeLists.txt +++ b/tests/runtime/cuda/CMakeLists.txt @@ -29,5 +29,6 @@ if(PARSEC_HAVE_CUDA) # Testing for communications to and from GPU parsec_addtest_executable(C cuda_rtt SOURCES rtt.c rtt_main.c) target_include_directories(cuda_rtt PRIVATE $<$:${CMAKE_CURRENT_SOURCE_DIR}>) + target_link_libraries(cuda_rtt PRIVATE m) target_ptg_sources(cuda_rtt PRIVATE "rtt.jdf") endif(PARSEC_HAVE_CUDA) diff --git a/tests/runtime/cuda/stress_main.c b/tests/runtime/cuda/stress_main.c index 371587b88..36008203c 100644 --- a/tests/runtime/cuda/stress_main.c +++ b/tests/runtime/cuda/stress_main.c @@ -15,12 +15,36 @@ #include #endif +#include +#include + int main(int argc, char *argv[]) { parsec_context_t *parsec = NULL; parsec_taskpool_t *tp; int size = 1; int rank = 0; + int 
tile_size = 1024; int depth = 80; int ch; + + /* Parse -n (tile size) and -d (depth) before parsec_init */ + while ((ch = getopt(argc, argv, "n:d:")) != -1) { + switch (ch) { + case 'n': + tile_size = atoi(optarg); + break; + case 'd': + depth = atoi(optarg); + break; + } + } + + /* Shift argv in place to remove our options so parsec_init does not see them */ + for (int i = 1; i <= argc - optind; i++) { + argv[i] = argv[optind + i - 1]; + } + argc = argc - optind + 1; #if defined(DISTRIBUTED) { @@ -33,7 +57,7 @@ int main(int argc, char *argv[]) parsec = parsec_init(-1, &argc, &argv); - tp = testing_stress_New(parsec, 80, 1024); + tp = testing_stress_New(parsec, depth, tile_size); if( NULL != tp ) { parsec_context_add_taskpool(parsec, tp); parsec_context_start(parsec); From 42b881a138078472645fa8ccfd0f6480f55b9096 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 13 Feb 2026 07:09:02 +0200 Subject: [PATCH 17/18] Always propagate the CPU copy to successors. This disables the communications from GPU memory, but it is necessary for a proper tracking of reference counts on the data copies. At this point I don't think we can fiddle with the code to allow for device copy propagation, too many corner cases for it to be workable. Instead, we should rethink the entire data copy framework, and allow tasks to fetch their inputs from devices as needed. Signed-off-by: George Bosilca --- parsec/mca/device/device_gpu.c | 41 +++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c index b89f3ba4d..20c688bdc 100644 --- a/parsec/mca/device/device_gpu.c +++ b/parsec/mca/device/device_gpu.c @@ -3,7 +3,7 @@ * Copyright (c) 2021-2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. - * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. + * Copyright (c) 2024-2026 NVIDIA Corporation. All rights reserved. 
*/ #include "parsec/parsec_config.h" @@ -839,7 +839,9 @@ parsec_device_memory_release( parsec_device_gpu_module_t* gpu_device ) int parsec_device_get_copy( parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t** dc ) { +#if defined(PARSEC_DEBUG_NOISIER) char task_name[] = "unknown"; +#endif /* defined(PARSEC_DEBUG_NOISIER) */ parsec_gpu_data_copy_t *gpu_mem_lru_cycling = NULL, *lru_gpu_elem; /* Get the head of the LRU, assuming it has no readers and mark it as used, using the same mechanism as * the GPU to GPU tranfers. Once the communication into this copy completes, the task will get into @@ -1365,10 +1367,10 @@ parsec_default_gpu_stage_out(parsec_gpu_task_t *gtask, dir = parsec_device_gpu_transfer_direction_d2d; } else { dir = parsec_device_gpu_transfer_direction_d2h; - if (dest->device_private == NULL && dest->alloc_cb != NULL) { - dest->alloc_cb(dest, 0); // allocate on host + if (dst_copy->device_private == NULL && dst_copy->alloc_cb != NULL) { + dst_copy->alloc_cb(dst_copy, 0); // allocate on host } - if (dest->device_private == NULL) { + if (dst_copy->device_private == NULL) { return PARSEC_HOOK_RETURN_ERROR; } } @@ -2476,11 +2478,11 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, int i; #if defined(PARSEC_DEBUG_NOISIER) - char tmp[MAX_TASK_STRLEN]; + char task_str[MAX_TASK_STRLEN]; + (void)parsec_task_snprintf(task_str, MAX_TASK_STRLEN, this_task); PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, "GPU[%d:%s]: Epilog of %s", - gpu_device->super.device_index, gpu_device->super.name, - parsec_task_snprintf(tmp, MAX_TASK_STRLEN, this_task) ); + gpu_device->super.device_index, gpu_device->super.name, task_str); #endif for( i = 0; i < this_task->task_class->nb_flows; i++ ) { @@ -2523,9 +2525,9 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, cpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; cpu_copy->version = gpu_copy->version; - PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream, - "GPU[%d:%s]: CPU 
copy %p [ref_count %d] gets the same version %d as GPU copy %p [ref_count %d]", - gpu_device->super.device_index, gpu_device->super.name, + PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, + "GPU[%d:%s]: %s: CPU copy %p [ref_count %d] gets the same version %d as GPU copy %p [ref_count %d]", + gpu_device->super.device_index, gpu_device->super.name, task_str, cpu_copy, cpu_copy->super.super.obj_reference_count, cpu_copy->version, gpu_copy, gpu_copy->super.super.obj_reference_count); /** @@ -2547,19 +2549,32 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device, gpu_copy->coherency_state = PARSEC_DATA_COHERENCY_SHARED; assert(PARSEC_DATA_STATUS_UNDER_TRANSFER == cpu_copy->data_transfer_status); cpu_copy->data_transfer_status = PARSEC_DATA_STATUS_COMPLETE_TRANSFER; - if( 0 == (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_SEND_GPU_MEMORY) ) { + /* This condition should be checked more strictly. It is one thing to be able to send the + * data directly from the GPU via the communication engine, but if we have local successors + * that are not executing on the GPU (aka. the pushout has been set) we need to propagate + * the CPU copy instead. + */ + if( 1 || (0 == (parsec_mpi_allow_gpu_memory_communications & PARSEC_RUNTIME_SEND_GPU_MEMORY)) ) { /* Report the CPU copy as the output of the task. 
*/ this_task->data[i].data_out = cpu_copy; + PARSEC_DEBUG_VERBOSE(100, parsec_gpu_output_stream, + "GPU[%d:%s]: %s: GPU copy %p [ref_count %d] replaced by the CPU copy %p [ref_count %d] in %s", + gpu_device->super.device_index, gpu_device->super.name, task_str, + gpu_copy, gpu_copy->super.super.obj_reference_count, + cpu_copy, cpu_copy->super.super.obj_reference_count, __func__); + } PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, - "GPU copy %p [ref_count %d] moved to the read LRU in %s", + "GPU[%d:%s]: %s: GPU copy %p [ref_count %d] moved to the read LRU in %s", + gpu_device->super.device_index, gpu_device->super.name, task_str, gpu_copy, gpu_copy->super.super.obj_reference_count, __func__); parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy); PARSEC_LIST_ITEM_SINGLETON(gpu_copy); parsec_list_push_back(&gpu_device->gpu_mem_lru, (parsec_list_item_t*)gpu_copy); } else { + /* No need to detach the GPU copy; it does not belong to any lists because it was owned by the task */ PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream, - "GPU copy %p [ref_count %d] moved to the owned LRU in %s", + "GPU[%d:%s]: %s: GPU copy %p [ref_count %d] moved to the owned LRU in %s", + gpu_device->super.device_index, gpu_device->super.name, task_str, gpu_copy, gpu_copy->super.super.obj_reference_count, __func__); parsec_list_push_back(&gpu_device->gpu_mem_owned_lru, (parsec_list_item_t*)gpu_copy); } From ae5581fbdd901993641b09b27bd4a1e4cd4269c7 Mon Sep 17 00:00:00 2001 From: George Bosilca Date: Fri, 13 Feb 2026 08:46:20 +0200 Subject: [PATCH 18/18] Set the data_in to NULL before using the temporary task. I'm not sure I understand how this worked so far, because the temporary stack-based task we use to prepare the successors is barely initialized, not enough to call parsec_task_snprintf on it. The least info we need to set is to NULLify the data_in for all task class's flows. 
Signed-off-by: George Bosilca --- parsec/interfaces/ptg/ptg-compiler/jdf2c.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c index c0867495e..8076679f1 100644 --- a/parsec/interfaces/ptg/ptg-compiler/jdf2c.c +++ b/parsec/interfaces/ptg/ptg-compiler/jdf2c.c @@ -7373,7 +7373,8 @@ static char *jdf_dump_context_assignment(string_arena_t *sa_open, prefix, indent(nbopen), parsec_get_name(jdf, targetf, "task_t"), parsec_get_name(jdf, targetf, "task_t"), var); string_arena_add_string(sa_open, "%s%s%s.task_class = __parsec_tp->super.super.task_classes_array[%s_%s.task_class_id];\n", prefix, indent(nbopen), var, jdf_basename, targetf->fname); - + string_arena_add_string(sa_open, "%s%sfor (int _i = 0; _i < %s.task_class->nb_flows; %s.data[_i].data_in = NULL, _i++);\n", + prefix, indent(nbopen), var, var); nbparam_given = 0; for(el = call->parameters; el != NULL; el = el->next) { nbparam_given++; @@ -7405,7 +7406,7 @@ static char *jdf_dump_context_assignment(string_arena_t *sa_open, dep_ld = jdf_expr_lv_next(dep->local_defs, dep_ld); continue; /* This local define was already issued as part of the dep */ } - string_arena_add_string(sa_open, "%s%s int %s;\n", prefix, indent(nbopen), ld->alias); + string_arena_add_string(sa_open, "%s%sint %s;\n", prefix, indent(nbopen), ld->alias); if(JDF_RANGE == ld->op) { string_arena_add_string(sa_open, "%s%sfor( %s = %s;", @@ -7443,11 +7444,11 @@ static char *jdf_dump_context_assignment(string_arena_t *sa_open, */ assert(el == NULL); string_arena_add_string(sa_open, - "%s%s const int %s_%s = %s;\n", + "%s%sconst int %s_%s = %s;\n", prefix, indent(nbopen), targetf->fname, vl->name, dump_expr((void**)vl->expr, &dest_info)); - string_arena_add_string(sa_open, "%s%s assert(&%s.locals[%d].value == &ncc->locals.%s.value);\n", + string_arena_add_string(sa_open, "%s%sassert(&%s.locals[%d].value == &ncc->locals.%s.value);\n", 
prefix, indent(nbopen), var, i, vl->name); - string_arena_add_string(sa_open, "%s%s ncc->locals.%s.value = %s_%s;\n", + string_arena_add_string(sa_open, "%s%sncc->locals.%s.value = %s_%s;\n", prefix, indent(nbopen), vl->name, targetf->fname, vl->name); } else { @@ -7504,7 +7505,7 @@ static char *jdf_dump_context_assignment(string_arena_t *sa_open, if( JDF_RANGE == el->op ) { string_arena_add_string(sa_open, - "%s%s int %s_%s;\n", + "%s%sint %s_%s;\n", prefix, indent(nbopen), targetf->fname, nl->name); string_arena_add_string(sa_open, @@ -7517,7 +7518,7 @@ static char *jdf_dump_context_assignment(string_arena_t *sa_open, nbopen++; } else { string_arena_add_string(sa_open, - "%s%s const int %s_%s = %s;\n", + "%s%sconst int %s_%s = %s;\n", prefix, indent(nbopen), targetf->fname, nl->name, dump_expr((void**)el, &local_info)); }