From f7fb83c79b9e24e2ca0836030e148f88c8a44829 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Mon, 25 Mar 2024 20:33:13 -0700 Subject: [PATCH 01/16] add cudaDeviceSynchronize for NCCL --- allgather.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/allgather.cu b/allgather.cu index cf1a882..5953041 100644 --- a/allgather.cu +++ b/allgather.cu @@ -158,6 +158,7 @@ int main(int argc, char *argv[]) { MPI_CHECK(MPI_Wait(&request, &status)); #elif defined(USE_NCCL) NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); + cudaDeviceSynchronize(); #elif defined(USE_RCCL) // TODO: fix later rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); @@ -177,6 +178,7 @@ int main(int argc, char *argv[]) { MPI_CHECK(MPI_Wait(&request, &status)); #elif defined(USE_NCCL) NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); + cudaDeviceSynchronize(); #elif defined(USE_RCCL) // TODO: fix later rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); From 453a397d2c609f41eac92012e9564ef6e1fa8ed6 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 30 Mar 2024 14:15:33 -0700 Subject: [PATCH 02/16] add allreduce code --- Makefile | 9 ++- allreduce.cu | 219 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 225 insertions(+), 3 deletions(-) create mode 100644 allreduce.cu diff --git a/Makefile b/Makefile index df453b4..231e499 100644 --- a/Makefile +++ b/Makefile @@ -5,14 +5,17 @@ CC = cc INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -all: allgather.x +all: allgather.x allreduce.x allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu +allreduce.x: allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allreduce.x allreduce.cu + clean: - rm -f allgather.x + rm -f allgather.x allreduce.x diff --git a/allreduce.cu b/allreduce.cu new file mode 100644 index 0000000..062b120 --- /dev/null +++ b/allreduce.cu @@ -0,0 +1,219 @@ +/* \file allreduce.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details. 
+ * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include + +#ifdef USE_CUDA + #include + #include +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif defined(USE_RCCL) + #include "rccl.h" +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%d'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(nv_bfloat16 *data, int size) { + for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { + data[i] = __float2bfloat16((float)i); + } +} + +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { + nv_bfloat16* in = (nv_bfloat16*) invec; + nv_bfloat16* inout = (nv_bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) + inout[i] = __hadd(in[i], inout[i]); +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int min_msg_size = atoi(argv[2]); + int max_msg_size = atoi(argv[3]); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Initialize GPU context + cudaGetDeviceCount(&num_gpus_per_node); + cudaSetDevice((my_rank % num_gpus_per_node)); + + int local_data_size = max_msg_size; // Size of local data + int global_data_size = local_data_size; // Size of global data + + nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); + nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + // Allocate memory on GPU + nv_bfloat16 *d_local_data, *d_global_data; + CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + // define custom reduce operation for nv_bfloat16 types + MPI_Op CUSTOM_SUM; + MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); + + #elif USE_NCCL + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. 
*/ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + + #elif defined(USE_RCCL) + // TODO: fix later + rcclComm_t rccl_comm; + rcclCommInitRank(&comm, num_gpus, 0, rccl_root); + #endif + + // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(nv_bfloat16); + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + cudaDeviceSynchronize(); + #elif defined(USE_RCCL) + // TODO: fix later + rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + cudaDeviceSynchronize(); + #elif defined(USE_RCCL) + // TODO: fix later + rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + + #ifdef USE_NCCL + ncclCommDestroy(nccl_comm); + #elif defined(USE_RCCL) + rcclCommDestroy(rccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +} + From f516fa01dc6a1fe40f255216da98f0b235a1ede4 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 30 Mar 2024 16:40:26 -0700 Subject: [PATCH 03/16] add reduce scatter code --- Makefile | 7 +- reduce_scatter.cu | 226 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+), 2 deletions(-) create mode 100644 reduce_scatter.cu diff --git a/Makefile b/Makefile index 231e499..973364d 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -all: allgather.x allreduce.x +all: allgather.x allreduce.x reduce_scatter.x allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu @@ -17,5 +17,8 @@ allgather.x: allgather.cu allreduce.x: 
allreduce.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allreduce.x allreduce.cu +reduce_scatter.x: reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce_scatter.x reduce_scatter.cu + clean: - rm -f allgather.x allreduce.x + rm -f allgather.x allreduce.x reduce_scatter.x diff --git a/reduce_scatter.cu b/reduce_scatter.cu new file mode 100644 index 0000000..9ed9e53 --- /dev/null +++ b/reduce_scatter.cu @@ -0,0 +1,226 @@ +/* \file reduce_scatter.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details. + * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include + +#ifdef USE_CUDA + #include + #include +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif defined(USE_RCCL) + #include "rccl.h" +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%d'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(nv_bfloat16 *data, int size) { + for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { + data[i] = __float2bfloat16((float)i); + } +} + +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { + nv_bfloat16* in = (nv_bfloat16*) invec; + nv_bfloat16* inout = (nv_bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) + inout[i] = __hadd(in[i], inout[i]); +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int min_msg_size = atoi(argv[2]); + int max_msg_size = atoi(argv[3]); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Initialize GPU context + cudaGetDeviceCount(&num_gpus_per_node); + cudaSetDevice((my_rank % num_gpus_per_node)); + + int local_data_size = max_msg_size; // Size of local data + int global_data_size = local_data_size; // Size of global data + + nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); + nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + // Allocate memory on GPU + nv_bfloat16 *d_local_data, *d_global_data; + CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + 
MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + // define custom reduce operation for nv_bfloat16 types + MPI_Op CUSTOM_SUM; + MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); + + #elif USE_NCCL + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. */ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + + #elif defined(USE_RCCL) + // TODO: fix later + rcclComm_t rccl_comm; + rcclCommInitRank(&comm, num_gpus, 0, rccl_root); + #endif + + // init recvcounts to send an equal portion of data from the reduce operation + int num_elements = local_data_size / sizeof(nv_bfloat16); + int portion = num_elements / num_pes; + int *recvcounts = (int*) malloc(sizeof(int) * num_pes); + for (int i = 0; i < num_pes; i++) + recvcounts[i] = portion; + + // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(nv_bfloat16); + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) + NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); + cudaDeviceSynchronize(); + #elif defined(USE_RCCL) + // TODO: fix later + rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) + NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); + cudaDeviceSynchronize(); + #elif defined(USE_RCCL) + // TODO: fix later + rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + + #ifdef USE_NCCL + ncclCommDestroy(nccl_comm); + #elif defined(USE_RCCL) + rcclCommDestroy(rccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +} + From 23e9f5cea25b1c430ea0cfb3a0b5977a4ed27ff0 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 30 Mar 2024 16:43:04 -0700 Subject: [PATCH 
04/16] remove duplicate commit --- allreduce.cu | 219 --------------------------------------------------- 1 file changed, 219 deletions(-) delete mode 100644 allreduce.cu diff --git a/allreduce.cu b/allreduce.cu deleted file mode 100644 index 062b120..0000000 --- a/allreduce.cu +++ /dev/null @@ -1,219 +0,0 @@ -/* \file allreduce.cu - * Copyright 2024 Parallel Software and Systems Group, University of Maryland. - * See the top-level LICENSE file for details. - * - * SPDX-License-Identifier: MIT - */ - -#include -#include -#include - -#ifdef USE_CUDA - #include - #include -#endif - -#ifdef USE_NCCL - #include "nccl.h" -#elif defined(USE_RCCL) - #include "rccl.h" -#endif - -#define NUM_WARMUP_ITERATIONS 5 - -#define MPI_CHECK(cmd) do { \ - int e = cmd; \ - if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ - __FILE__,__LINE__, e); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -#define CUDA_CHECK(cmd) do { \ - cudaError_t e = cmd; \ - if(e != cudaSuccess) { \ - printf("CUDA error %s:%d: %s\n", \ - __FILE__, __LINE__, cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -#define NCCL_CHECK(cmd) do { \ - ncclResult_t e = cmd; \ - if (e != ncclSuccess) { \ - printf("NCCL error %s:%d %s\n", \ - __FILE__, __LINE__, ncclGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -void initializeData(nv_bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { - data[i] = __float2bfloat16((float)i); - } -} - -void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { - nv_bfloat16* in = (nv_bfloat16*) invec; - nv_bfloat16* inout = (nv_bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) - inout[i] = __hadd(in[i], inout[i]); -} - -int main(int argc, char *argv[]) { - if (argc != 5) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return EXIT_FAILURE; - } - - int num_gpus = atoi(argv[1]); - int min_msg_size = atoi(argv[2]); - int max_msg_size = atoi(argv[3]); - int iterations = atoi(argv[4]); - - if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { - fprintf(stderr, "Invalid input parameters.\n"); - return EXIT_FAILURE; - } - - int my_rank, num_pes; - int num_gpus_per_node; - int msg_count; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_pes); - - if (num_pes != num_gpus) { - fprintf(stderr, "Number of processes must match number of GPUs.\n"); - MPI_Finalize(); - return EXIT_FAILURE; - } - - // Initialize GPU context - cudaGetDeviceCount(&num_gpus_per_node); - cudaSetDevice((my_rank % num_gpus_per_node)); - - int local_data_size = max_msg_size; // Size of local data - int global_data_size = local_data_size; // Size of global data - - nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); - nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); - - // Initialize local data - initializeData(local_data, local_data_size); - - // Allocate memory on GPU - nv_bfloat16 *d_local_data, *d_global_data; - CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); - CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); - - // Copy local data to GPU - CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); - - #ifdef USE_MPI - // create 2-byte datatype (send raw, un-interpreted bytes) - MPI_Datatype mpi_type_bfloat16; - MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); - MPI_Type_commit(&mpi_type_bfloat16); - - // define custom reduce 
operation for nv_bfloat16 types - MPI_Op CUSTOM_SUM; - MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); - - #elif USE_NCCL - ncclUniqueId nccl_comm_id; - ncclComm_t nccl_comm; - - if (my_rank == 0) { - /* Generates an Id to be used in ncclCommInitRank. */ - ncclGetUniqueId(&nccl_comm_id); - } - - /* distribute nccl_comm_id to all ranks */ - MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, - 0, MPI_COMM_WORLD)); - - /* Create a new NCCL communicator */ - NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); - - #elif defined(USE_RCCL) - // TODO: fix later - rcclComm_t rccl_comm; - rcclCommInitRank(&comm, num_gpus, 0, rccl_root); - #endif - - // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather - double total_time, start_time; - MPI_Request request; - MPI_Status status; - - // Print benchmark results - if (my_rank == 0) { - printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); - printf("Number of iterations: %d\n", iterations); - } - fflush(NULL); - - for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { - msg_count = msg_size / sizeof(nv_bfloat16); - // warmup iterations - for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { - #ifdef USE_MPI - MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, - CUSTOM_SUM, MPI_COMM_WORLD, &request)); - - MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) - NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); - #endif - } - - if(msg_size >= 8388608) - iterations = 20; - - MPI_Barrier(MPI_COMM_WORLD); - start_time = MPI_Wtime(); - for (int i = 0; i < iterations; ++i) { - #ifdef USE_MPI - MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, - CUSTOM_SUM, MPI_COMM_WORLD, &request)); - - MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) - NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); - #endif - } - MPI_Barrier(MPI_COMM_WORLD); - total_time = MPI_Wtime() - start_time; - if (my_rank == 0) - printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); - } - - // Cleanup - free(local_data); - free(global_data); - CUDA_CHECK(cudaFree(d_local_data)); - CUDA_CHECK(cudaFree(d_global_data)); - - #ifdef USE_NCCL - ncclCommDestroy(nccl_comm); - #elif defined(USE_RCCL) - rcclCommDestroy(rccl_comm); - #endif - - MPI_Finalize(); - return EXIT_SUCCESS; -} - From 7ff3fb503e4549ca170ea9ee08d763a90eb55584 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 30 Mar 2024 16:47:59 -0700 Subject: [PATCH 05/16] fix Makefile --- Makefile | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 973364d..7c01696 100644 --- a/Makefile +++ b/Makefile @@ -9,16 +9,13 @@ CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -all: allgather.x allreduce.x reduce_scatter.x +all: allgather.x reduce_scatter.x 
allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu -allreduce.x: allreduce.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allreduce.x allreduce.cu - reduce_scatter.x: reduce_scatter.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce_scatter.x reduce_scatter.cu clean: - rm -f allgather.x allreduce.x reduce_scatter.x + rm -f allgather.x reduce_scatter.x From b9e882437578691448c9748aa67caae643db422a Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 00:38:59 -0400 Subject: [PATCH 06/16] add ROCm and RCCL code for reduce-scatter --- reduce_scatter.cu | 98 +++++++++++++++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 34 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 9ed9e53..8f851d4 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -8,16 +8,20 @@ #include #include #include - #ifdef USE_CUDA - #include #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #include + #include + #include + #define bfloat16 hip_bfloat16 #endif #ifdef USE_NCCL #include "nccl.h" -#elif defined(USE_RCCL) - #include "rccl.h" +#elif USE_RCCL + #include #endif #define NUM_WARMUP_ITERATIONS 5 @@ -40,6 +44,16 @@ } \ } while(0) +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well #define NCCL_CHECK(cmd) do { \ ncclResult_t e = cmd; \ if (e != ncclSuccess) { \ @@ -49,9 +63,14 @@ } \ } while(0) -void initializeData(nv_bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { +void initializeData(bfloat16 *data, int size) { + for (int i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif } } @@ -93,26 +112,36 @@ int main(int argc, char *argv[]) { } // Initialize GPU context + #if USE_CUDA cudaGetDeviceCount(&num_gpus_per_node); cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif int local_data_size = max_msg_size; // Size of local data int global_data_size = local_data_size; // Size of global data - nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); - nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); // Initialize local data initializeData(local_data, local_data_size); - // Allocate memory on GPU - nv_bfloat16 *d_local_data, *d_global_data; + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); - // Copy local data to GPU CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + #ifdef USE_MPI // create 2-byte datatype (send raw, un-interpreted bytes) MPI_Datatype mpi_type_bfloat16; @@ -123,7 +152,7 @@ int main(int argc, char *argv[]) { MPI_Op CUSTOM_SUM; MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); - 
#elif USE_NCCL + #elif defined(USE_NCCL) || defined(USE_RCCL) ncclUniqueId nccl_comm_id; ncclComm_t nccl_comm; @@ -136,17 +165,12 @@ int main(int argc, char *argv[]) { MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, 0, MPI_COMM_WORLD)); - /* Create a new NCCL communicator */ + /* Create a new NCCL/RCCL communicator */ NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); - - #elif defined(USE_RCCL) - // TODO: fix later - rcclComm_t rccl_comm; - rcclCommInitRank(&comm, num_gpus, 0, rccl_root); #endif // init recvcounts to send an equal portion of data from the reduce operation - int num_elements = local_data_size / sizeof(nv_bfloat16); + int num_elements = local_data_size / sizeof(bfloat16); int portion = num_elements / num_pes; int *recvcounts = (int*) malloc(sizeof(int) * num_pes); for (int i = 0; i < num_pes; i++) recvcounts[i] = portion; @@ -166,7 +190,7 @@ int main(int argc, char *argv[]) { fflush(NULL); for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { - msg_count = msg_size / sizeof(nv_bfloat16); + msg_count = msg_size / sizeof(bfloat16); // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { #ifdef USE_MPI @@ -174,12 +198,14 @@ int main(int argc, char *argv[]) { CUSTOM_SUM, MPI_COMM_WORLD, &request)); MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); #endif } @@ -194,12 +220,14 @@ int main(int argc, char *argv[]) { CUSTOM_SUM, MPI_COMM_WORLD, &request)); MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); #endif } MPI_Barrier(MPI_COMM_WORLD); @@ -211,16 +239,18 @@ int main(int argc, char *argv[]) { // Cleanup free(local_data); free(global_data); + #ifdef USE_CUDA CUDA_CHECK(cudaFree(d_local_data)); CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif - #ifdef USE_NCCL + #if defined(USE_NCCL) || defined(USE_RCCL) ncclCommDestroy(nccl_comm); - #elif defined(USE_RCCL) - rcclCommDestroy(rccl_comm); #endif MPI_Finalize(); return EXIT_SUCCESS; } - From 686be82807a62cb66f3ba91fe055c9644b2d4442 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 00:57:19 -0400 Subject: [PATCH 07/16] revert Makefile to original --- Makefile | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 7c01696..11f5145 100644 --- a/Makefile +++ b/Makefile @@ -5,17 +5,14 @@ CC = cc INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI +CFLAGS = -std=c++11 -O2 
-target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -all: allgather.x reduce_scatter.x +all: allgather.x allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu -reduce_scatter.x: reduce_scatter.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce_scatter.x reduce_scatter.cu - clean: - rm -f allgather.x reduce_scatter.x + rm -f allgather.x From ec4fdedfe0e26a4344a778e0b9ec7fd0ed8985ab Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 00:58:42 -0400 Subject: [PATCH 08/16] revert Makefile to original --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 11f5145..df453b4 100644 --- a/Makefile +++ b/Makefile @@ -15,4 +15,4 @@ allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu clean: - rm -f allgather.x + rm -f allgather.x From 60e6911eb66fed80565a0fba1f59f932163fd194 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 01:40:18 -0400 Subject: [PATCH 09/16] update custom bf16 sum function --- reduce_scatter.cu | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 8f851d4..5db2b60 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -75,10 +75,15 @@ void initializeData(bfloat16 *data, int size) { } void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { - nv_bfloat16* in = (nv_bfloat16*) invec; - nv_bfloat16* inout = (nv_bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) + bfloat16* in = (bfloat16*) invec; + bfloat16* inout = (bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) { + #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); + #elif USE_ROCM + inout[i] = in[i] + inout[i]; + #endif + } } int main(int argc, char *argv[]) { From a67570e98862f1e62ac850d295cffa9a3fc79206 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 01:46:47 -0400 Subject: [PATCH 10/16] fix indents --- reduce_scatter.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 5db2b60..b667c01 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -76,10 +76,10 @@ void initializeData(bfloat16 *data, int size) { void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { bfloat16* in = (bfloat16*) invec; - bfloat16* inout = (bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) { + bfloat16* inout = (bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) { #ifdef USE_CUDA - inout[i] = __hadd(in[i], inout[i]); + inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM inout[i] = in[i] + inout[i]; #endif From 63bb696da18e45195885cd465cb54af3930b77b9 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 13:47:23 -0700 Subject: [PATCH 11/16] change to int64_t for global/local data size --- reduce_scatter.cu | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index b667c01..820cf4f 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -8,6 +8,8 @@ #include #include #include +#include + #ifdef USE_CUDA #include #define bfloat16 nv_bfloat16 @@ -27,9 +29,9 @@ #define NUM_WARMUP_ITERATIONS 5 #define MPI_CHECK(cmd) do { \ - int e = cmd; \ + int64_t e = cmd; \ if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ + printf("Failed: MPI error %s:%d '%ld'\n", \ __FILE__,__LINE__, e); \ 
exit(EXIT_FAILURE); \ } \ @@ -63,8 +65,8 @@ } \ } while(0) -void initializeData(bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(bfloat16)); ++i) { +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { #ifdef USE_CUDA data[i] = __float2bfloat16((float)i); #elif USE_ROCM @@ -74,10 +76,10 @@ void initializeData(bfloat16 *data, int size) { } } -void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { +void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) { bfloat16* in = (bfloat16*) invec; bfloat16* inout = (bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) { + for (int64_t i = 0; i < *len; i++) { #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM @@ -93,8 +95,8 @@ int main(int argc, char *argv[]) { } int num_gpus = atoi(argv[1]); - int min_msg_size = atoi(argv[2]); - int max_msg_size = atoi(argv[3]); + int64_t min_msg_size = atoi(argv[2]); + int64_t max_msg_size = atoi(argv[3]); int iterations = atoi(argv[4]); if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { @@ -125,8 +127,13 @@ int main(int argc, char *argv[]) { hipSetDevice((my_rank % num_gpus_per_node)); #endif - int local_data_size = max_msg_size; // Size of local data - int global_data_size = local_data_size; // Size of global data + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size; // Size of global data + + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } bfloat16 *local_data = (bfloat16*)malloc(local_data_size); bfloat16 *global_data = (bfloat16*)malloc(global_data_size); @@ -189,12 +196,12 @@ int main(int argc, char *argv[]) { // Print benchmark results if (my_rank == 0) { printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); printf("Number of iterations: %d\n", iterations); } fflush(NULL); - for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { msg_count = msg_size / sizeof(bfloat16); // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { @@ -238,7 +245,7 @@ int main(int argc, char *argv[]) { MPI_Barrier(MPI_COMM_WORLD); total_time = MPI_Wtime() - start_time; if (my_rank == 0) - printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); } // Cleanup From 0a33166c8ed7059a422d3bdda3c4804604a9f849 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 14:33:25 -0700 Subject: [PATCH 12/16] revert type change for custom sum --- reduce_scatter.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 820cf4f..f824072 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -76,10 +76,10 @@ void initializeData(bfloat16 *data, int64_t size) { } } -void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) { +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { bfloat16* in = (bfloat16*) invec; bfloat16* inout = (bfloat16*) inoutvec; - for (int64_t i = 0; i < *len; i++) { + for (int i 
= 0; i < *len; i++) { #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM From 74cfdd8c32a705f2a74d82a9ae5ec230060aa317 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 17:59:18 -0700 Subject: [PATCH 13/16] change atoi to strtoll --- reduce_scatter.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index f824072..1853aed 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -95,8 +95,8 @@ int main(int argc, char *argv[]) { } int num_gpus = atoi(argv[1]); - int64_t min_msg_size = atoi(argv[2]); - int64_t max_msg_size = atoi(argv[3]); + int64_t min_msg_size = strtoll(argv[2], NULL, 10); + int64_t max_msg_size = strtoll(argv[3], NULL, 10); int iterations = atoi(argv[4]); if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { From 089423eb99a05f058a173543003abab3e744e698 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Sun, 14 Apr 2024 04:44:50 -0700 Subject: [PATCH 14/16] add hip directive --- reduce_scatter.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 1853aed..99fc950 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -14,6 +14,7 @@ #include #define bfloat16 nv_bfloat16 #elif USE_ROCM + #define __HIP_PLATFORM_AMD__ #include #include #include From 31dd4952078b16bb81b49cc5f68152731a99992b Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Fri, 26 Apr 2024 10:56:04 -0700 Subject: [PATCH 15/16] update --- reduce_scatter.cu | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 99fc950..c60236e 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -30,9 +30,9 @@ #define NUM_WARMUP_ITERATIONS 5 #define MPI_CHECK(cmd) do { \ - int64_t e = cmd; \ + int64_t e = cmd; \ if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%ld'\n", \ + printf("Failed: MPI error %s:%d '%ld'\n", \ __FILE__,__LINE__, e); \ exit(EXIT_FAILURE); \ } \ @@ -47,11 +47,11 @@ } \ } while(0) -#define HIP_CHECK(cmd) do { \ - hipError_t e = cmd; \ - if(e != hipSuccess) { \ - printf("HIP error %s:%d: %s\n", \ - __FILE__, __LINE__, hipGetErrorString(e)); \ +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ exit(EXIT_FAILURE); \ } \ } while(0) @@ -189,7 +189,7 @@ int main(int argc, char *argv[]) { for (int i = 0; i < num_pes; i++) recvcounts[i] = portion; - // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather + // Perform MPI_Ireduce_scatter, NCCL reduce_scatter, or RCCL reduce_scatter double total_time, start_time; MPI_Request request; MPI_Status status; @@ -222,9 +222,6 @@ int main(int argc, char *argv[]) { #endif } - if(msg_size >= 8388608) - iterations = 20; - MPI_Barrier(MPI_COMM_WORLD); start_time = MPI_Wtime(); for (int i = 0; i < iterations; ++i) { From b1ec98a82773dff0bffdde3ff72e7762eaa0e4cf Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Thu, 11 Jul 2024 18:18:59 -0700 Subject: [PATCH 16/16] fix bug --- reduce_scatter.cu | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index c60236e..238bf23 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -182,12 +182,9 @@ int main(int argc, char *argv[]) { NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); #endif - // init recvcounts to send an equal portion of data from the 
reduce operation - int num_elements = local_data_size / sizeof(bfloat16); - int portion = num_elements / num_pes; + // init recvcounts, which stores the portion of the reduced data that each process receives int *recvcounts = (int*) malloc(sizeof(int) * num_pes); int portion; // Perform MPI_Ireduce_scatter, NCCL reduce_scatter, or RCCL reduce_scatter double total_time, start_time; MPI_Request request; MPI_Status status; @@ -204,6 +201,11 @@ int main(int argc, char *argv[]) { for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { msg_count = msg_size / sizeof(bfloat16); + + portion = msg_count / num_pes; + for (int i = 0; i < num_pes; i++) + recvcounts[i] = portion; + // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { #ifdef USE_MPI
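
Taken together, the synchronization added in PATCH 01 and the recvcounts fix in PATCH 16 define the measurement pattern the series settles on for the NCCL path: derive the per-rank receive count from the current message size rather than from the maximum buffer size, launch the collective on the default stream, and block on cudaDeviceSynchronize() before reading the timer, since NCCL collectives return before the operation has completed. Below is a minimal self-contained sketch of that pattern, condensed from reduce_scatter.cu as it stands after PATCH 16. Only the NCCL/CUDA branch is shown, error checking is elided, the helper name time_reduce_scatter is introduced here purely for illustration, and the include lines name the standard headers these APIs are declared in, not necessarily the repository's exact include list.

    #include <stdint.h>
    #include <mpi.h>
    #include <cuda_runtime.h>
    #include <cuda_bf16.h>
    #include <nccl.h>

    // Average per-iteration latency of ncclReduceScatter for one message size.
    // d_local_data / d_global_data are device buffers of at least msg_size
    // bytes; nccl_comm was set up with ncclCommInitRank as in the patches above.
    static double time_reduce_scatter(const nv_bfloat16 *d_local_data,
                                      nv_bfloat16 *d_global_data,
                                      int64_t msg_size, int iterations,
                                      ncclComm_t nccl_comm) {
        int num_pes;
        MPI_Comm_size(MPI_COMM_WORLD, &num_pes);

        // PATCH 16 fix: the per-rank portion follows the current message
        // size, not the size of the allocated buffer.
        size_t msg_count = msg_size / sizeof(nv_bfloat16);
        size_t portion = msg_count / num_pes;

        MPI_Barrier(MPI_COMM_WORLD);
        double start_time = MPI_Wtime();
        for (int i = 0; i < iterations; ++i) {
            ncclReduceScatter(d_local_data, d_global_data, portion,
                              ncclBfloat16, ncclSum, nccl_comm, /*stream=*/0);
            // The collective is asynchronous; synchronize so the timed window
            // covers the completed operation (the sync the series adds after
            // each collective).
            cudaDeviceSynchronize();
        }
        MPI_Barrier(MPI_COMM_WORLD);
        return (MPI_Wtime() - start_time) / iterations;
    }

Synchronizing inside the loop times each collective to completion; hoisting cudaDeviceSynchronize() out of the loop would instead measure the throughput of back-to-back queued collectives, which is a different quantity from the per-operation latency these benchmarks report.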