From f7fb83c79b9e24e2ca0836030e148f88c8a44829 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Mon, 25 Mar 2024 20:33:13 -0700 Subject: [PATCH 01/16] add cudaDeviceSynchronize for NCCL --- allgather.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/allgather.cu b/allgather.cu index cf1a882..5953041 100644 --- a/allgather.cu +++ b/allgather.cu @@ -158,6 +158,7 @@ int main(int argc, char *argv[]) { MPI_CHECK(MPI_Wait(&request, &status)); #elif defined(USE_NCCL) NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); + cudaDeviceSynchronize(); #elif defined(USE_RCCL) // TODO: fix later rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); @@ -177,6 +178,7 @@ int main(int argc, char *argv[]) { MPI_CHECK(MPI_Wait(&request, &status)); #elif defined(USE_NCCL) NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); + cudaDeviceSynchronize(); #elif defined(USE_RCCL) // TODO: fix later rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); From 453a397d2c609f41eac92012e9564ef6e1fa8ed6 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 30 Mar 2024 14:15:33 -0700 Subject: [PATCH 02/16] add allreduce code --- Makefile | 9 ++- allreduce.cu | 219 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 225 insertions(+), 3 deletions(-) create mode 100644 allreduce.cu diff --git a/Makefile b/Makefile index df453b4..231e499 100644 --- a/Makefile +++ b/Makefile @@ -5,14 +5,17 @@ CC = cc INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -all: allgather.x +all: allgather.x allreduce.x allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu +allreduce.x: allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allreduce.x allreduce.cu + clean: - rm -f allgather.x + rm -f allgather.x allreduce.x diff --git a/allreduce.cu b/allreduce.cu new file mode 100644 index 0000000..062b120 --- /dev/null +++ b/allreduce.cu @@ -0,0 +1,219 @@ +/* \file allreduce.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details. 
+ * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include + +#ifdef USE_CUDA + #include + #include +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif defined(USE_RCCL) + #include "rccl.h" +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%d'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(nv_bfloat16 *data, int size) { + for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { + data[i] = __float2bfloat16((float)i); + } +} + +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { + nv_bfloat16* in = (nv_bfloat16*) invec; + nv_bfloat16* inout = (nv_bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) + inout[i] = __hadd(in[i], inout[i]); +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int min_msg_size = atoi(argv[2]); + int max_msg_size = atoi(argv[3]); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Initialize GPU context + cudaGetDeviceCount(&num_gpus_per_node); + cudaSetDevice((my_rank % num_gpus_per_node)); + + int local_data_size = max_msg_size; // Size of local data + int global_data_size = local_data_size; // Size of global data + + nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); + nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + // Allocate memory on GPU + nv_bfloat16 *d_local_data, *d_global_data; + CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + // define custom reduce operation for nv_bfloat16 types + MPI_Op CUSTOM_SUM; + MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); + + #elif USE_NCCL + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. 
*/ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + + #elif defined(USE_RCCL) + // TODO: fix later + rcclComm_t rccl_comm; + rcclCommInitRank(&comm, num_gpus, 0, rccl_root); + #endif + + // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(nv_bfloat16); + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + cudaDeviceSynchronize(); + #elif defined(USE_RCCL) + // TODO: fix later + rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + cudaDeviceSynchronize(); + #elif defined(USE_RCCL) + // TODO: fix later + rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + + #ifdef USE_NCCL + ncclCommDestroy(nccl_comm); + #elif defined(USE_RCCL) + rcclCommDestroy(rccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +} + From f516fa01dc6a1fe40f255216da98f0b235a1ede4 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 30 Mar 2024 16:40:26 -0700 Subject: [PATCH 03/16] add reduce scatter code --- Makefile | 7 +- reduce_scatter.cu | 226 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 231 insertions(+), 2 deletions(-) create mode 100644 reduce_scatter.cu diff --git a/Makefile b/Makefile index 231e499..973364d 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -all: allgather.x allreduce.x +all: allgather.x allreduce.x reduce_scatter.x allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu @@ -17,5 +17,8 @@ allgather.x: allgather.cu allreduce.x: 
allreduce.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allreduce.x allreduce.cu +reduce_scatter.x: reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce_scatter.x reduce_scatter.cu + clean: - rm -f allgather.x allreduce.x + rm -f allgather.x allreduce.x reduce_scatter.x diff --git a/reduce_scatter.cu b/reduce_scatter.cu new file mode 100644 index 0000000..9ed9e53 --- /dev/null +++ b/reduce_scatter.cu @@ -0,0 +1,226 @@ +/* \file reduce_scatter.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details. + * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include + +#ifdef USE_CUDA + #include + #include +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif defined(USE_RCCL) + #include "rccl.h" +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%d'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(nv_bfloat16 *data, int size) { + for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { + data[i] = __float2bfloat16((float)i); + } +} + +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { + nv_bfloat16* in = (nv_bfloat16*) invec; + nv_bfloat16* inout = (nv_bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) + inout[i] = __hadd(in[i], inout[i]); +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int min_msg_size = atoi(argv[2]); + int max_msg_size = atoi(argv[3]); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Initialize GPU context + cudaGetDeviceCount(&num_gpus_per_node); + cudaSetDevice((my_rank % num_gpus_per_node)); + + int local_data_size = max_msg_size; // Size of local data + int global_data_size = local_data_size; // Size of global data + + nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); + nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + // Allocate memory on GPU + nv_bfloat16 *d_local_data, *d_global_data; + CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + 
MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + // define custom reduce operation for nv_bfloat16 types + MPI_Op CUSTOM_SUM; + MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); + + #elif USE_NCCL + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. */ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + + #elif defined(USE_RCCL) + // TODO: fix later + rcclComm_t rccl_comm; + rcclCommInitRank(&comm, num_gpus, 0, rccl_root); + #endif + + // init recvcounts to send an equal portion of data from the reduce operation + int num_elements = local_data_size / sizeof(nv_bfloat16); + int portion = num_elements / num_pes; + int *recvcounts = (int*) malloc(sizeof(int) * num_pes); + for (int i = 0; i < num_pes; i++) + recvcounts[i] = portion; + + // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(nv_bfloat16); + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) + NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); + cudaDeviceSynchronize(); + #elif defined(USE_RCCL) + // TODO: fix later + rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) + NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); + cudaDeviceSynchronize(); + #elif defined(USE_RCCL) + // TODO: fix later + rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + + #ifdef USE_NCCL + ncclCommDestroy(nccl_comm); + #elif defined(USE_RCCL) + rcclCommDestroy(rccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +} + From 23e9f5cea25b1c430ea0cfb3a0b5977a4ed27ff0 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 30 Mar 2024 16:43:04 -0700 Subject: [PATCH 
04/16] remove duplicate commit --- allreduce.cu | 219 --------------------------------------------------- 1 file changed, 219 deletions(-) delete mode 100644 allreduce.cu diff --git a/allreduce.cu b/allreduce.cu deleted file mode 100644 index 062b120..0000000 --- a/allreduce.cu +++ /dev/null @@ -1,219 +0,0 @@ -/* \file allreduce.cu - * Copyright 2024 Parallel Software and Systems Group, University of Maryland. - * See the top-level LICENSE file for details. - * - * SPDX-License-Identifier: MIT - */ - -#include -#include -#include - -#ifdef USE_CUDA - #include - #include -#endif - -#ifdef USE_NCCL - #include "nccl.h" -#elif defined(USE_RCCL) - #include "rccl.h" -#endif - -#define NUM_WARMUP_ITERATIONS 5 - -#define MPI_CHECK(cmd) do { \ - int e = cmd; \ - if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ - __FILE__,__LINE__, e); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -#define CUDA_CHECK(cmd) do { \ - cudaError_t e = cmd; \ - if(e != cudaSuccess) { \ - printf("CUDA error %s:%d: %s\n", \ - __FILE__, __LINE__, cudaGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -#define NCCL_CHECK(cmd) do { \ - ncclResult_t e = cmd; \ - if (e != ncclSuccess) { \ - printf("NCCL error %s:%d %s\n", \ - __FILE__, __LINE__, ncclGetErrorString(e)); \ - exit(EXIT_FAILURE); \ - } \ -} while(0) - -void initializeData(nv_bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { - data[i] = __float2bfloat16((float)i); - } -} - -void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { - nv_bfloat16* in = (nv_bfloat16*) invec; - nv_bfloat16* inout = (nv_bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) - inout[i] = __hadd(in[i], inout[i]); -} - -int main(int argc, char *argv[]) { - if (argc != 5) { - fprintf(stderr, "Usage: %s \n", argv[0]); - return EXIT_FAILURE; - } - - int num_gpus = atoi(argv[1]); - int min_msg_size = atoi(argv[2]); - int max_msg_size = atoi(argv[3]); - int iterations = atoi(argv[4]); - - if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { - fprintf(stderr, "Invalid input parameters.\n"); - return EXIT_FAILURE; - } - - int my_rank, num_pes; - int num_gpus_per_node; - int msg_count; - - MPI_Init(&argc, &argv); - MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - MPI_Comm_size(MPI_COMM_WORLD, &num_pes); - - if (num_pes != num_gpus) { - fprintf(stderr, "Number of processes must match number of GPUs.\n"); - MPI_Finalize(); - return EXIT_FAILURE; - } - - // Initialize GPU context - cudaGetDeviceCount(&num_gpus_per_node); - cudaSetDevice((my_rank % num_gpus_per_node)); - - int local_data_size = max_msg_size; // Size of local data - int global_data_size = local_data_size; // Size of global data - - nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); - nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); - - // Initialize local data - initializeData(local_data, local_data_size); - - // Allocate memory on GPU - nv_bfloat16 *d_local_data, *d_global_data; - CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); - CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); - - // Copy local data to GPU - CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); - - #ifdef USE_MPI - // create 2-byte datatype (send raw, un-interpreted bytes) - MPI_Datatype mpi_type_bfloat16; - MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); - MPI_Type_commit(&mpi_type_bfloat16); - - // define custom reduce 
operation for nv_bfloat16 types - MPI_Op CUSTOM_SUM; - MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); - - #elif USE_NCCL - ncclUniqueId nccl_comm_id; - ncclComm_t nccl_comm; - - if (my_rank == 0) { - /* Generates an Id to be used in ncclCommInitRank. */ - ncclGetUniqueId(&nccl_comm_id); - } - - /* distribute nccl_comm_id to all ranks */ - MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, - 0, MPI_COMM_WORLD)); - - /* Create a new NCCL communicator */ - NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); - - #elif defined(USE_RCCL) - // TODO: fix later - rcclComm_t rccl_comm; - rcclCommInitRank(&comm, num_gpus, 0, rccl_root); - #endif - - // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather - double total_time, start_time; - MPI_Request request; - MPI_Status status; - - // Print benchmark results - if (my_rank == 0) { - printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); - printf("Number of iterations: %d\n", iterations); - } - fflush(NULL); - - for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { - msg_count = msg_size / sizeof(nv_bfloat16); - // warmup iterations - for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { - #ifdef USE_MPI - MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, - CUSTOM_SUM, MPI_COMM_WORLD, &request)); - - MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) - NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); - #endif - } - - if(msg_size >= 8388608) - iterations = 20; - - MPI_Barrier(MPI_COMM_WORLD); - start_time = MPI_Wtime(); - for (int i = 0; i < iterations; ++i) { - #ifdef USE_MPI - MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, - CUSTOM_SUM, MPI_COMM_WORLD, &request)); - - MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) - NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); - #endif - } - MPI_Barrier(MPI_COMM_WORLD); - total_time = MPI_Wtime() - start_time; - if (my_rank == 0) - printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); - } - - // Cleanup - free(local_data); - free(global_data); - CUDA_CHECK(cudaFree(d_local_data)); - CUDA_CHECK(cudaFree(d_global_data)); - - #ifdef USE_NCCL - ncclCommDestroy(nccl_comm); - #elif defined(USE_RCCL) - rcclCommDestroy(rccl_comm); - #endif - - MPI_Finalize(); - return EXIT_SUCCESS; -} - From 7ff3fb503e4549ca170ea9ee08d763a90eb55584 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 30 Mar 2024 16:47:59 -0700 Subject: [PATCH 05/16] fix Makefile --- Makefile | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 973364d..7c01696 100644 --- a/Makefile +++ b/Makefile @@ -9,16 +9,13 @@ CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -all: allgather.x allreduce.x reduce_scatter.x +all: allgather.x reduce_scatter.x 
allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu -allreduce.x: allreduce.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allreduce.x allreduce.cu - reduce_scatter.x: reduce_scatter.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce_scatter.x reduce_scatter.cu clean: - rm -f allgather.x allreduce.x reduce_scatter.x + rm -f allgather.x reduce_scatter.x From b9e882437578691448c9748aa67caae643db422a Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 00:38:59 -0400 Subject: [PATCH 06/16] add ROCm and RCCL code for reduce-scatter --- reduce_scatter.cu | 98 +++++++++++++++++++++++++++++++---------------- 1 file changed, 64 insertions(+), 34 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 9ed9e53..8f851d4 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -8,16 +8,20 @@ #include #include #include - #ifdef USE_CUDA - #include #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #include + #include + #include + #define bfloat16 hip_bfloat16 #endif #ifdef USE_NCCL #include "nccl.h" -#elif defined(USE_RCCL) - #include "rccl.h" +#elif USE_RCCL + #include #endif #define NUM_WARMUP_ITERATIONS 5 @@ -40,6 +44,16 @@ } \ } while(0) +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well #define NCCL_CHECK(cmd) do { \ ncclResult_t e = cmd; \ if (e != ncclSuccess) { \ @@ -49,9 +63,14 @@ } \ } while(0) -void initializeData(nv_bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { +void initializeData(bfloat16 *data, int size) { + for (int i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif } } @@ -93,26 +112,36 @@ int main(int argc, char *argv[]) { } // Initialize GPU context + #if USE_CUDA cudaGetDeviceCount(&num_gpus_per_node); cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif int local_data_size = max_msg_size; // Size of local data int global_data_size = local_data_size; // Size of global data - nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); - nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); // Initialize local data initializeData(local_data, local_data_size); - // Allocate memory on GPU - nv_bfloat16 *d_local_data, *d_global_data; + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); - // Copy local data to GPU CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + #ifdef USE_MPI // create 2-byte datatype (send raw, un-interpreted bytes) MPI_Datatype mpi_type_bfloat16; @@ -123,7 +152,7 @@ int main(int argc, char *argv[]) { MPI_Op CUSTOM_SUM; MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); - 
#elif USE_NCCL + #elif defined(USE_NCCL) || defined(USE_RCCL) ncclUniqueId nccl_comm_id; ncclComm_t nccl_comm; @@ -136,17 +165,12 @@ int main(int argc, char *argv[]) { MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, 0, MPI_COMM_WORLD)); - /* Create a new NCCL communicator */ + /* Create a new NCCL/RCCL communicator */ NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); - - #elif defined(USE_RCCL) - // TODO: fix later - rcclComm_t rccl_comm; - rcclCommInitRank(&comm, num_gpus, 0, rccl_root); #endif // init recvcounts to send an equal portion of data from the reduce operation - int num_elements = local_data_size / sizeof(nv_bfloat16); + int num_elements = local_data_size / sizeof(bfloat16); int portion = num_elements / num_pes; int *recvcounts = (int*) malloc(sizeof(int) * num_pes); for (int i = 0; i < num_pes; i++) recvcounts[i] = portion; @@ -166,7 +190,7 @@ int main(int argc, char *argv[]) { fflush(NULL); for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { - msg_count = msg_size / sizeof(nv_bfloat16); + msg_count = msg_size / sizeof(bfloat16); // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { #ifdef USE_MPI @@ -174,12 +198,14 @@ int main(int argc, char *argv[]) { CUSTOM_SUM, MPI_COMM_WORLD, &request)); MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); #endif } @@ -194,12 +220,14 @@ int main(int argc, char *argv[]) { CUSTOM_SUM, MPI_COMM_WORLD, &request)); MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); #endif } MPI_Barrier(MPI_COMM_WORLD); @@ -211,16 +239,18 @@ int main(int argc, char *argv[]) { // Cleanup free(local_data); free(global_data); + #ifdef USE_CUDA CUDA_CHECK(cudaFree(d_local_data)); CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif - #ifdef USE_NCCL + #if defined(USE_NCCL) || defined(USE_RCCL) ncclCommDestroy(nccl_comm); - #elif defined(USE_RCCL) - rcclCommDestroy(rccl_comm); #endif MPI_Finalize(); return EXIT_SUCCESS; } - From 686be82807a62cb66f3ba91fe055c9644b2d4442 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 00:57:19 -0400 Subject: [PATCH 07/16] revert Makefile to original --- Makefile | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 7c01696..11f5145 100644 --- a/Makefile +++ b/Makefile @@ -5,17 +5,14 @@ CC = cc INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI +CFLAGS = -std=c++11 -O2 
-target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl -all: allgather.x reduce_scatter.x +all: allgather.x allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu -reduce_scatter.x: reduce_scatter.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce_scatter.x reduce_scatter.cu - clean: - rm -f allgather.x reduce_scatter.x + rm -f allgather.x From ec4fdedfe0e26a4344a778e0b9ec7fd0ed8985ab Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 00:58:42 -0400 Subject: [PATCH 08/16] revert Makefile to original --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 11f5145..df453b4 100644 --- a/Makefile +++ b/Makefile @@ -15,4 +15,4 @@ allgather.x: allgather.cu ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu clean: - rm -f allgather.x + rm -f allgather.x From 60e6911eb66fed80565a0fba1f59f932163fd194 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 01:40:18 -0400 Subject: [PATCH 09/16] update custom bf16 sum function --- reduce_scatter.cu | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 8f851d4..5db2b60 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -75,10 +75,15 @@ void initializeData(bfloat16 *data, int size) { } void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { - nv_bfloat16* in = (nv_bfloat16*) invec; - nv_bfloat16* inout = (nv_bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) + bfloat16* in = (bfloat16*) invec; + bfloat16* inout = (bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) { + #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); + #elif USE_ROCM + inout[i] = in[i] + inout[i]; + #endif + } } int main(int argc, char *argv[]) { From a67570e98862f1e62ac850d295cffa9a3fc79206 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Mon, 1 Apr 2024 01:46:47 -0400 Subject: [PATCH 10/16] fix indents --- reduce_scatter.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 5db2b60..b667c01 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -76,10 +76,10 @@ void initializeData(bfloat16 *data, int size) { void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { bfloat16* in = (bfloat16*) invec; - bfloat16* inout = (bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) { + bfloat16* inout = (bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) { #ifdef USE_CUDA - inout[i] = __hadd(in[i], inout[i]); + inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM inout[i] = in[i] + inout[i]; #endif From 63bb696da18e45195885cd465cb54af3930b77b9 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 13:47:23 -0700 Subject: [PATCH 11/16] change to int64_t for global/local data size --- reduce_scatter.cu | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index b667c01..820cf4f 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -8,6 +8,8 @@ #include #include #include +#include + #ifdef USE_CUDA #include #define bfloat16 nv_bfloat16 @@ -27,9 +29,9 @@ #define NUM_WARMUP_ITERATIONS 5 #define MPI_CHECK(cmd) do { \ - int e = cmd; \ + int64_t e = cmd; \ if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ + printf("Failed: MPI error %s:%d '%ld'\n", \ __FILE__,__LINE__, e); \ 
exit(EXIT_FAILURE); \ } \ @@ -63,8 +65,8 @@ } \ } while(0) -void initializeData(bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(bfloat16)); ++i) { +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { #ifdef USE_CUDA data[i] = __float2bfloat16((float)i); #elif USE_ROCM @@ -74,10 +76,10 @@ void initializeData(bfloat16 *data, int size) { } } -void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { +void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) { bfloat16* in = (bfloat16*) invec; bfloat16* inout = (bfloat16*) inoutvec; - for (int i = 0; i < *len; i++) { + for (int64_t i = 0; i < *len; i++) { #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM @@ -93,8 +95,8 @@ int main(int argc, char *argv[]) { } int num_gpus = atoi(argv[1]); - int min_msg_size = atoi(argv[2]); - int max_msg_size = atoi(argv[3]); + int64_t min_msg_size = atoi(argv[2]); + int64_t max_msg_size = atoi(argv[3]); int iterations = atoi(argv[4]); if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { @@ -125,8 +127,13 @@ int main(int argc, char *argv[]) { hipSetDevice((my_rank % num_gpus_per_node)); #endif - int local_data_size = max_msg_size; // Size of local data - int global_data_size = local_data_size; // Size of global data + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size; // Size of global data + + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } bfloat16 *local_data = (bfloat16*)malloc(local_data_size); bfloat16 *global_data = (bfloat16*)malloc(global_data_size); @@ -189,12 +196,12 @@ int main(int argc, char *argv[]) { // Print benchmark results if (my_rank == 0) { printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); printf("Number of iterations: %d\n", iterations); } fflush(NULL); - for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { msg_count = msg_size / sizeof(bfloat16); // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { @@ -238,7 +245,7 @@ int main(int argc, char *argv[]) { MPI_Barrier(MPI_COMM_WORLD); total_time = MPI_Wtime() - start_time; if (my_rank == 0) - printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); } // Cleanup From 0a33166c8ed7059a422d3bdda3c4804604a9f849 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 14:33:25 -0700 Subject: [PATCH 12/16] revert type change for custom sum --- reduce_scatter.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 820cf4f..f824072 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -76,10 +76,10 @@ void initializeData(bfloat16 *data, int64_t size) { } } -void custom_bf16_sum(void *invec, void *inoutvec, int64_t *len, MPI_Datatype *datatype) { +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { bfloat16* in = (bfloat16*) invec; bfloat16* inout = (bfloat16*) inoutvec; - for (int64_t i = 0; i < *len; i++) { + for (int i 
= 0; i < *len; i++) { #ifdef USE_CUDA inout[i] = __hadd(in[i], inout[i]); #elif USE_ROCM From 74cfdd8c32a705f2a74d82a9ae5ec230060aa317 Mon Sep 17 00:00:00 2001 From: RoastSea8 Date: Sat, 13 Apr 2024 17:59:18 -0700 Subject: [PATCH 13/16] change atoi to strtoll --- reduce_scatter.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index f824072..1853aed 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -95,8 +95,8 @@ int main(int argc, char *argv[]) { } int num_gpus = atoi(argv[1]); - int64_t min_msg_size = atoi(argv[2]); - int64_t max_msg_size = atoi(argv[3]); + int64_t min_msg_size = strtoll(argv[2], NULL, 10); + int64_t max_msg_size = strtoll(argv[3], NULL, 10); int iterations = atoi(argv[4]); if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { From 089423eb99a05f058a173543003abab3e744e698 Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Sun, 14 Apr 2024 04:44:50 -0700 Subject: [PATCH 14/16] add hip directive --- reduce_scatter.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 1853aed..99fc950 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -14,6 +14,7 @@ #include #define bfloat16 nv_bfloat16 #elif USE_ROCM + #define __HIP_PLATFORM_AMD__ #include #include #include From 31dd4952078b16bb81b49cc5f68152731a99992b Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Fri, 26 Apr 2024 10:56:04 -0700 Subject: [PATCH 15/16] update --- reduce_scatter.cu | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index 99fc950..c60236e 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -30,9 +30,9 @@ #define NUM_WARMUP_ITERATIONS 5 #define MPI_CHECK(cmd) do { \ - int64_t e = cmd; \ + int64_t e = cmd; \ if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%ld'\n", \ + printf("Failed: MPI error %s:%d '%ld'\n", \ __FILE__,__LINE__, e); \ exit(EXIT_FAILURE); \ } \ @@ -47,11 +47,11 @@ } \ } while(0) -#define HIP_CHECK(cmd) do { \ - hipError_t e = cmd; \ - if(e != hipSuccess) { \ - printf("HIP error %s:%d: %s\n", \ - __FILE__, __LINE__, hipGetErrorString(e)); \ +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ exit(EXIT_FAILURE); \ } \ } while(0) @@ -189,7 +189,7 @@ int main(int argc, char *argv[]) { for (int i = 0; i < num_pes; i++) recvcounts[i] = portion; - // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather + // Perform MPI_Ireduce_scatter, NCCL reduce_scatter, or RCCL reduce_scatter double total_time, start_time; MPI_Request request; MPI_Status status; @@ -222,9 +222,6 @@ int main(int argc, char *argv[]) { #endif } - if(msg_size >= 8388608) - iterations = 20; - MPI_Barrier(MPI_COMM_WORLD); start_time = MPI_Wtime(); for (int i = 0; i < iterations; ++i) { From b1ec98a82773dff0bffdde3ff72e7762eaa0e4cf Mon Sep 17 00:00:00 2001 From: Aditya Tomar Date: Thu, 11 Jul 2024 18:18:59 -0700 Subject: [PATCH 16/16] fix bug --- reduce_scatter.cu | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/reduce_scatter.cu b/reduce_scatter.cu index c60236e..238bf23 100644 --- a/reduce_scatter.cu +++ b/reduce_scatter.cu @@ -182,12 +182,9 @@ int main(int argc, char *argv[]) { NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); #endif - // init recvcounts to send an equal portion of data from the 
reduce operation - int num_elements = local_data_size / sizeof(bfloat16); - int portion = num_elements / num_pes; + // init recvcounts, which stores the portion of the reduced data that each process receives int *recvcounts = (int*) malloc(sizeof(int) * num_pes); int portion; // Perform MPI_Ireduce_scatter, NCCL reduce_scatter, or RCCL reduce_scatter double total_time, start_time; MPI_Request request; MPI_Status status; @@ -204,6 +201,11 @@ int main(int argc, char *argv[]) { for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { msg_count = msg_size / sizeof(bfloat16); + + portion = msg_count / num_pes; + for (int i = 0; i < num_pes; i++) + recvcounts[i] = portion; + // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { #ifdef USE_MPI
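
Taken together, the synchronization added in PATCH 01 and the recvcounts fix in PATCH 16 define the measurement pattern the series settles on for the NCCL path: derive the per-rank receive count from the current message size rather than from the maximum buffer size, launch the collective on the default stream, and block on cudaDeviceSynchronize() before reading the timer, since NCCL collectives return before the operation has completed. Below is a minimal self-contained sketch of that pattern, condensed from reduce_scatter.cu as it stands after PATCH 16. Only the NCCL/CUDA branch is shown, error checking is elided, the helper name time_reduce_scatter is introduced here purely for illustration, and the include lines name the standard headers these APIs are declared in, not necessarily the repository's exact include list.

    #include <stdint.h>
    #include <mpi.h>
    #include <cuda_runtime.h>
    #include <cuda_bf16.h>
    #include <nccl.h>

    // Average per-iteration latency of ncclReduceScatter for one message size.
    // d_local_data / d_global_data are device buffers of at least msg_size
    // bytes; nccl_comm was set up with ncclCommInitRank as in the patches above.
    static double time_reduce_scatter(const nv_bfloat16 *d_local_data,
                                      nv_bfloat16 *d_global_data,
                                      int64_t msg_size, int iterations,
                                      ncclComm_t nccl_comm) {
        int num_pes;
        MPI_Comm_size(MPI_COMM_WORLD, &num_pes);

        // PATCH 16 fix: the per-rank portion follows the current message
        // size, not the size of the allocated buffer.
        size_t msg_count = msg_size / sizeof(nv_bfloat16);
        size_t portion = msg_count / num_pes;

        MPI_Barrier(MPI_COMM_WORLD);
        double start_time = MPI_Wtime();
        for (int i = 0; i < iterations; ++i) {
            ncclReduceScatter(d_local_data, d_global_data, portion,
                              ncclBfloat16, ncclSum, nccl_comm, /*stream=*/0);
            // The collective is asynchronous; synchronize so the timed window
            // covers the completed operation (the sync the series adds after
            // each collective).
            cudaDeviceSynchronize();
        }
        MPI_Barrier(MPI_COMM_WORLD);
        return (MPI_Wtime() - start_time) / iterations;
    }

Synchronizing inside the loop times each collective to completion; hoisting cudaDeviceSynchronize() out of the loop would instead measure the throughput of back-to-back queued collectives, which is a different quantity from the per-operation latency these benchmarks report.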