Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion apps/c/poisson/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ APP=poisson
MAIN_SRC=poisson

#OPS_GENERATOR_VERBOSE=1
# NOTE(review): DEBUG=1 forces debug builds for this app — confirm it is
# intended to be committed rather than a leftover from local testing.
DEBUG=1
# One assignment only (the stale pre-change TARGETS line is removed) and the
# duplicated "hip mpi_hip" entries are deduplicated — duplicates would make
# each appear twice as a prerequisite of the aggregate targets.
TARGETS=dev_seq dev_mpi seq tiled openmp mpi mpi_tiled mpi_openmp cuda mpi_cuda mpi_cuda_tiled hip mpi_hip mpi_hip_tiled sycl mpi_sycl mpi_sycl_tiled ompoffload mpi_ompoffload mpi_ompoffload_tiled

#include $(OPS_INSTALL_PATH)/../makefiles/Makefile.c_app_legacy
include $(OPS_INSTALL_PATH)/../makefiles/Makefile.c_app
6 changes: 3 additions & 3 deletions makefiles/Makefile.c_app
Original file line number Diff line number Diff line change
Expand Up @@ -268,21 +268,21 @@ $(APP)_hip: Makefile .generated $(OPS_INSTALL_PATH)/c/lib/$(OPS_COMPILER)/libops
@echo ""
@echo "Building ${APP}_hip"
@echo ""
$(HIPCC) $(CXXFLAGS_NOOMP) $(HIPFLAGS) -D$(OPS_COMPILER) -I$(C_OPS_INC) $(OPS_FILES_GEN) -I. ./hip/$(HIP_KERNELS) -L$(C_OPS_LIB) $(OPS_LINK) $(OPS_LIB_HIP) $(HDF5_LIB_SEQ) $(HIP_LINK) $(HIPRAND) -o $(APP)_hip
$(HIPCC) $(CXXFLAGS_NOOMP) $(HIPFLAGS) -D$(OPS_COMPILER) -I$(C_OPS_INC) $(OPS_FILES_GEN) -I. ./hip/$(HIP_KERNELS) -L$(C_OPS_LIB) $(OPS_LINK) $(OPS_LIB_HIP) $(HDF5_LIB_SEQ) $(HIP_LINK) $(HIP_LIB) $(HIPRAND) -o $(APP)_hip

# Link the MPI+HIP variant. The stale duplicate recipe line (identical command
# without $(HIP_LIB)) is removed: it compiled and linked the whole app a second
# time and, running first, could fail for lack of the -L path to the HIP libs.
$(APP)_mpi_hip: Makefile .generated $(OPS_INSTALL_PATH)/c/lib/$(OPS_COMPILER)/libops_mpi_hip.a $(OPS_FILES_GEN) $(HEADERS)
	@echo ""
	@echo ""
	@echo "Building ${APP}_mpi_hip"
	@echo ""
	$(HIPMPICXX) $(CXXFLAGS_NOOMP) $(HIPFLAGS) -D$(OPS_COMPILER) -DOPS_MPI -I$(C_OPS_INC) $(I_MPI_INC) $(OPS_FILES_GEN) -I. ./hip/$(HIP_KERNELS) -L$(C_OPS_LIB) $(OPS_LINK) $(OPS_LIB_MPI_HIP) $(HDF5_LIB_MPI) $(L_MPI_LIB) $(MPI_LINK) $(MPI_HIP_LINK) $(HIP_LINK) $(HIP_LIB) $(HIPRAND) -o $(APP)_mpi_hip

# Link the tiled (lazy-execution) MPI+HIP variant. As with $(APP)_mpi_hip, the
# stale duplicate recipe line lacking $(HIP_LIB) is removed so the link runs
# exactly once and always carries the HIP library search path.
$(APP)_mpi_hip_tiled: Makefile .generated $(OPS_INSTALL_PATH)/c/lib/$(OPS_COMPILER)/libops_mpi_hip.a $(OPS_FILES_GEN) $(HEADERS)
	@echo ""
	@echo ""
	@echo "Building ${APP}_mpi_hip_tiled"
	@echo ""
	$(HIPMPICXX) $(CXXFLAGS_NOOMP) $(HIPFLAGS) -D$(OPS_COMPILER) -DOPS_MPI -DOPS_LAZY -I$(C_OPS_INC) $(I_MPI_INC) $(OPS_FILES_GEN) -I. ./hip/$(HIP_KERNELS) -L$(C_OPS_LIB) $(OPS_LINK) $(OPS_LIB_MPI_HIP) $(HDF5_LIB_MPI) $(L_MPI_LIB) $(MPI_LINK) $(MPI_HIP_LINK) $(HIP_LINK) $(HIP_LIB) $(HIPRAND) -o $(APP)_mpi_hip_tiled


#=====================================================================================================================================================================
Expand Down
10 changes: 9 additions & 1 deletion makefiles/Makefile.hip
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,16 @@ ifdef IEEE
HIPFLAGS += $(HIPIEEE)
endif

# icx: add Intel runtime libraries to the HIP link line — presumably hipcc does
# not pull in libirc/libimf itself when host code was built with oneAPI icx.
ifeq ($(OPS_COMPILER),icx)
HIPFLAGS += -lirc -limf
endif

# gnu: link the MPI library explicitly (note: plain `=` overrides any earlier
# MPI_LINK value for the gnu toolchain).
ifeq ($(OPS_COMPILER),gnu)
MPI_LINK = -lmpi
endif

# Default HIP include and link settings; each uses ?= so the environment or
# command line can override them.
HIP_INC ?= $(HIP_INSTALL_PATH)/include -I$(HIP_INSTALL_PATH)/../include
# The stale pre-change default (bare path, no -L) is removed: with ?= the first
# assignment wins, so keeping it would put a raw directory on the link line and
# the corrected -L form would never take effect (HIP_LIB later gets libraries
# appended, e.g. -lrocm_smi64).
HIP_LIB ?= -L$(HIP_INSTALL_PATH)/lib
MPI_HIP_LINK ?= -L$(MPI_LIB) -lmpi

HIP_LIB += -lrocm_smi64
Expand Down
3 changes: 2 additions & 1 deletion ops/c/include/ops_instance.h
Original file line number Diff line number Diff line change
Expand Up @@ -376,7 +376,7 @@ class OPS_instance {
int OPS_soa;
int OPS_diags;

// CUDA & OpenCL
// CUDA, HIP & OpenCL
int OPS_hybrid_gpu, OPS_gpu_direct;
int OPS_block_size_x;
int OPS_block_size_y;
Expand All @@ -385,6 +385,7 @@ class OPS_instance {
char *OPS_consts_h, *OPS_consts_d, *OPS_reduct_h, *OPS_reduct_d;
int OPS_consts_bytes, OPS_reduct_bytes;
int OPS_cl_device;
bool OPS_uvm_device;
char *ops_halo_buffer;
char *ops_halo_buffer_d;
int ops_halo_buffer_size;
Expand Down
20 changes: 16 additions & 4 deletions ops/c/src/core/ops_device_singlenode_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,12 @@ ops_dat ops_decl_dat_char(ops_block block, int size, int *dat_size, int *base,
// ops_decl_dat_hdf5()
} else {
// Allocate memory immediately
dat->data = (char*) ops_malloc(bytes);
if (dat->block->instance->OPS_uvm_device) {
// If UVM is enabled, we can use the host pointer directly
ops_device_mallochost(block->instance, (void**)&dat->data, bytes);
} else {
dat->data = (char*) ops_malloc(bytes);
}
dat->user_managed = 0;
dat->mem = bytes;
dat->data_d = NULL;
Expand All @@ -79,10 +84,17 @@ ops_dat ops_decl_dat_char(ops_block block, int size, int *dat_size, int *base,
// block->instance->OPS_hybrid_layout ? //TODO: comes in when batching
// block->instance->ops_batch_size : 0);
} else {
ops_device_malloc(block->instance, ( void ** ) &( dat->data_d ), bytes);
ops_device_memset(block->instance, ( void ** ) &( dat->data_d ), 0, bytes);
if (block->instance->OPS_uvm_device){
dat->data_d = dat->data;
}
else
{
// Allocate memory on the device
ops_device_malloc(block->instance, ( void ** ) &( dat->data_d ), bytes);
ops_device_memset(block->instance, ( void ** ) &( dat->data_d ), 0, bytes);
dat->dirty_hd = 2;
}
init_deviceptr = 0;
dat->dirty_hd = 2;
}

if(init_deviceptr)
Expand Down
1 change: 1 addition & 0 deletions ops/c/src/core/ops_instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@ void OPS_instance::init_globals() {
OPS_consts_h=NULL; OPS_consts_d=NULL; OPS_reduct_h=NULL; OPS_reduct_d=NULL;
OPS_consts_bytes = 0; OPS_reduct_bytes = 0;
OPS_cl_device=0;
OPS_uvm_device=false;
ops_halo_buffer = NULL;
ops_halo_buffer_d = NULL;
ops_halo_buffer_size = 0;
Expand Down
20 changes: 16 additions & 4 deletions ops/c/src/core/ops_lib_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -607,7 +607,11 @@ void ops_free_dat_core(ops_dat dat) {
}
}
if(dat->user_managed == 0) {
ops_free(dat->data);
if (OPS_instance::getOPSInstance()->OPS_uvm_device)
{
dat->data = nullptr; // UVM managed memory, no need to free as data_d will be freed
} else
ops_free(dat->data);
dat->data = nullptr;
}
ops_free((char*)dat->name);
Expand Down Expand Up @@ -2269,12 +2273,17 @@ int ops_dat_copy_metadata_core(ops_dat target, ops_dat orig_dat)
// Copy `size` bytes from a host buffer to the device, allocating the device
// buffer on first use.
//   instance: OPS instance (supplies the hybrid-GPU and UVM flags)
//   data_d:   in/out device pointer; set up here when *data_d == NULL
//   data_h:   host source; if data_h or *data_h is NULL the device buffer is
//             zero-filled instead of copied
// The stale unconditional ops_device_malloc (leftover pre-change line) is
// removed: it ran before the UVM branch, so on UVM devices the freshly
// allocated buffer was immediately overwritten by *data_d = *data_h (leak),
// and on non-UVM devices the allocation happened twice in the pasted text.
void ops_cpHostToDevice(OPS_instance *instance, void **data_d, void **data_h, size_t size) {
  if (instance->OPS_hybrid_gpu == 0) return; // CPU-only build: nothing to do
  if ( *data_d == NULL ) {
    if (instance->OPS_uvm_device) {
      // Unified memory: the device pointer aliases the host allocation.
      // NOTE(review): if *data_h is also NULL here, the memset below runs on a
      // NULL pointer — confirm callers never hit that combination.
      *data_d = *data_h;
    } else {
      ops_device_malloc(instance, data_d, size);
    }
  }
  if (data_h == NULL || *data_h == NULL) {
    ops_device_memset(instance, data_d, 0, size);
    return;
  }
  if (instance->OPS_uvm_device) return; // UVM does not need explicit copy
  ops_device_memcpy_h2d(instance, data_d, data_h, size);
}

Expand Down Expand Up @@ -2350,8 +2359,11 @@ void ops_put_data(ops_dat dat) {
size_t bytes = dat->elem_size;
for (int i = 0; i < dat->block->dims; i++)
bytes = bytes * dat->size[i];
ops_device_memcpy_h2d(dat->block->instance, (void**)&dat->data_d, (void**)&dat->data, bytes);
ops_device_sync(dat->block->instance);
if (not dat->block->instance->OPS_uvm_device)
{
ops_device_memcpy_h2d(dat->block->instance, (void**)&dat->data_d, (void**)&dat->data, bytes);
ops_device_sync(dat->block->instance);
}
}

void ops_randomgen_init_host(unsigned int seed, int options, std::mt19937 &ops_rand_gen) {
Expand Down
23 changes: 21 additions & 2 deletions ops/c/src/hip/ops_hip_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,26 @@ void ops_init_device(OPS_instance *instance, const int argc, const char *const a
cutilDeviceInit(instance, argc, argv);
instance->OPS_hybrid_gpu = 1;
//hipSafeCall(instance->ostream(),hipDeviceSetCacheConfig(hipFuncCachePreferL1));
hipDeviceSetCacheConfig(hipFuncCachePreferL1);
hipDeviceProp_t hipDeviceProp;
hipSafeCall(instance->ostream(), hipGetDeviceProperties(&hipDeviceProp, 0));

if (hipDeviceProp.integrated) {
instance->OPS_uvm_device = true;
ops_printf("Using integrated GPU with UVM support\n");
}

hipSafeCall(instance->ostream(), hipDeviceSetCacheConfig(hipFuncCachePreferL1));
}

// Allocate `bytes` of device (GPU) memory and store the pointer in *ptr.
// Errors from hipMalloc are reported/handled by hipSafeCall via the
// instance's output stream.
void ops_device_malloc(OPS_instance *instance, void** ptr, size_t bytes) {
hipSafeCall(instance->ostream(), hipMalloc(ptr, bytes));
}

// Allocate `bytes` of host-accessible memory and store the pointer in *ptr.
// On integrated/UVM devices a plain hipMalloc allocation is visible from the
// host, so it is used directly; otherwise pinned host memory is allocated with
// hipHostMalloc. The stale unconditional hipHostMalloc line (leftover
// pre-change text) is removed — it allocated a pinned buffer that the UVM
// branch then overwrote, leaking it.
void ops_device_mallochost(OPS_instance *instance, void** ptr, size_t bytes) {
  if (instance->OPS_uvm_device)
    hipSafeCall(instance->ostream(), hipMalloc(ptr, bytes));
  else
    hipSafeCall(instance->ostream(), hipHostMalloc(ptr, bytes));
}

void ops_device_free(OPS_instance *instance, void** ptr) {
Expand All @@ -94,10 +105,18 @@ void ops_device_freehost(OPS_instance *instance, void** ptr) {
}

// Copy `size` bytes host-to-device. On UVM devices the host and device
// pointers refer to the same allocation, so no transfer is performed.
void ops_device_memcpy_h2d(OPS_instance *instance, void** to, void **from, size_t size) {
  if (!instance->OPS_uvm_device)
    hipSafeCall(instance->ostream(), hipMemcpy(*to, *from, size, hipMemcpyHostToDevice));
}

// Copy `size` bytes device-to-host. On UVM devices the host and device
// pointers refer to the same allocation, so no transfer is performed.
void ops_device_memcpy_d2h(OPS_instance *instance, void** to, void **from, size_t size) {
  if (!instance->OPS_uvm_device)
    hipSafeCall(instance->ostream(), hipMemcpy(*to, *from, size, hipMemcpyDeviceToHost));
}

Expand Down
31 changes: 24 additions & 7 deletions ops/c/src/mpi/ops_mpi_partition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -459,20 +459,37 @@ void ops_decomp_dats(sub_block *sb) {
if (dat->data == NULL){
if (dat->is_hdf5 == 0) {
// dat->data = (char *)ops_calloc(prod[sb->ndim-1]*dat->elem_size, 1);
dat->data = (char *)ops_malloc(prod[sb->ndim-1]*dat->elem_size*1);
if (dat->block->instance->OPS_hybrid_gpu == 0) // CPU-only target
{
dat->data = (char *)ops_malloc(prod[sb->ndim-1]*dat->elem_size*1);
ops_init_zero(dat->data, prod[sb->ndim-1]*dat->elem_size*1);
else {
ops_device_malloc(dat->block->instance, (void **)&(dat->data_d), prod[sb->ndim-1]*dat->elem_size*1);
ops_device_memset(dat->block->instance, (void **)&(dat->data_d), 0, prod[sb->ndim-1]*dat->elem_size*1);
init_deviceptr = 0; // When device ptr initialized to zero, no need to call HostToDevice copy
dat->dirty_hd = 2; // device dirty bit set to true to trigger DeviceToHost copy
} else {
if (dat->block->instance->OPS_uvm_device){
// UVM enabled, so no need to copy to device
ops_device_malloc(dat->block->instance, (void **)&(dat->data), prod[sb->ndim-1]*dat->elem_size*1);
dat->data_d = dat->data;
init_deviceptr = 0; // When device ptr initialized to host ptr, no need to call HostToDevice copy
} else{
dat->data = (char *)ops_malloc(prod[sb->ndim-1]*dat->elem_size*1);

ops_device_malloc(dat->block->instance, (void **)&(dat->data_d), prod[sb->ndim-1]*dat->elem_size*1);
ops_device_memset(dat->block->instance, (void **)&(dat->data_d), 0, prod[sb->ndim-1]*dat->elem_size*1);
init_deviceptr = 0; // When device ptr initialized to zero, no need to call HostToDevice copy
dat->dirty_hd = 2; // device dirty bit set to true to trigger DeviceToHost copy
}
}
dat->hdf5_file = "none";
dat->mem =
prod[sb->ndim - 1] * dat->elem_size; // this includes the halo sizes
} else {
dat->data = (char *)ops_calloc(prod[sb->ndim - 1] * dat->elem_size, 1);
if (dat->block->instance->OPS_hybrid_gpu == 0) // CPU-only target
{
dat->data = (char *)ops_calloc(prod[sb->ndim - 1] * dat->elem_size, 1);
}
else {
ops_device_mallochost(dat->block->instance, (void **)&(dat->data), prod[sb->ndim - 1] * dat->elem_size);
memset((void*)dat->data, 0, prod[sb->ndim - 1] * dat->elem_size);
}
dat->mem =
prod[sb->ndim - 1] * dat->elem_size; // this includes the halo sizes
if (ops_read_dat_hdf5_dynamic == NULL) {
Expand Down
41 changes: 41 additions & 0 deletions scripts/MI300A_Archer2_OPS_env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Environment setup for building/running OPS with HIP on MI300A (Archer2-style
# module system). Source this file: `. MI300A_Archer2_OPS_env.sh`

export OPS_COMPILER=gnu
export OPS_INSTALL_PATH=~/repos/OPS/ops
export OPS_TRANSLATOR=~/repos/OPS/ops_translator

export USE_HDF5=1

module purge

export AMD_ARCH=MI300A

module load rocm/6.4.0
# module load gcc/base
module load openmpi/5.0.7-ucc1.3.0-ucx1.18.0

export MPI_INSTALL_PATH=/opt/rocmplus-6.4.0/openmpi-5.0.7-ucc-1.3.0-ucx-1.18.0
# export LD_LIBRARY_PATH=$MPI_INSTALL_PATH/lib:$LD_LIBRARY_PATH

# Quoted so the test cannot become malformed if AMD_ARCH is ever unset/empty
if [ "$AMD_ARCH" = "MI300A" ]; then
    export HSA_XNACK=1 # Enable XNACK for MI300A
fi

export ROCM_PATH=/opt/rocm-6.4.0
# export LD_LIBRARY_PATH=$ROCM_PATH/llvm/lib:$LD_LIBRARY_PATH
export HIP_INSTALL_PATH=$ROCM_PATH
export AOMP=$ROCM_PATH/llvm

# NOTE(review): MPICC is set to the C++ wrapper (mpic++) — confirm intended.
export MPICC=mpic++
export MPICPP=mpic++
export MPICXX=mpicxx

# export MPICH_GPU_SUPPORT_ENABLED=1

export HDF5_INSTALL_PATH=/opt/hdf5-v1.14.5/HDF_Group/HDF5/1.14.5
export LD_LIBRARY_PATH=$HDF5_INSTALL_PATH/lib:$LD_LIBRARY_PATH

export PYTHONPATH=$PYTHONPATH:~/repos/opensbli/

# Activate the OPS translator's Python virtual environment
source "$OPS_INSTALL_PATH/../ops_translator/ops_venv/bin/activate"
45 changes: 45 additions & 0 deletions scripts/set_amd_cpu_gpu_devices.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Per-rank CPU/GPU binding wrapper for OpenMPI on AMD nodes:
#   mpirun ... set_amd_cpu_gpu_devices.sh <app> <args...>
# Pins each local rank to a CPU range (GOMP_CPU_AFFINITY) and one GPU
# (ROCR_VISIBLE_DEVICES), then runs the given command.
# Tunables (env): NUM_CPUS, RANK_STRIDE, OMP_STRIDE, NUM_GPUS, GPU_START,
# GPU_STRIDE, OMP_NUM_THREADS.

export global_rank=${OMPI_COMM_WORLD_RANK}
export local_rank=${OMPI_COMM_WORLD_LOCAL_RANK}
export ranks_per_node=${OMPI_COMM_WORLD_LOCAL_SIZE}

# Defaults, applied only when unset/empty (same effect as the [ -z ] guards)
: "${NUM_CPUS:=96}"
: "${RANK_STRIDE:=$(( NUM_CPUS / ranks_per_node ))}"
: "${OMP_STRIDE:=1}"
: "${NUM_GPUS:=4}"
: "${GPU_START:=0}"
: "${GPU_STRIDE:=1}"
# Original assumed the caller exported OMP_NUM_THREADS; default to 1 so the
# cpu_stop arithmetic below cannot fail on an empty value.
: "${OMP_NUM_THREADS:=1}"

# Derive the id tables from the counts instead of hard-coding "seq 0 95" and
# "(0 1 2 3)", so overriding NUM_CPUS/NUM_GPUS stays consistent (defaults give
# the identical lists).
cpu_list=($(seq 0 $(( NUM_CPUS - 1 ))))
gpu_list=($(seq 0 $(( NUM_GPUS - 1 ))))

cpus_per_gpu=$(( NUM_CPUS / NUM_GPUS ))
cpu_start_index=$(( RANK_STRIDE * local_rank + GPU_START * cpus_per_gpu ))
cpu_start=${cpu_list[$cpu_start_index]}
cpu_stop=$(( cpu_start + OMP_NUM_THREADS * OMP_STRIDE - 1 ))

# Ceiling division: spread ranks evenly over the available GPUs
ranks_per_gpu=$(( (ranks_per_node + NUM_GPUS - 1) / NUM_GPUS ))
my_gpu_index=$(( local_rank * GPU_STRIDE / ranks_per_gpu + GPU_START ))
my_gpu=${gpu_list[$my_gpu_index]}

export GOMP_CPU_AFFINITY=$cpu_start-$cpu_stop:$OMP_STRIDE
export ROCR_VISIBLE_DEVICES=$my_gpu

# exec so the application replaces this wrapper (signals from mpirun reach it)
exec "$@"
4 changes: 4 additions & 0 deletions scripts/set_amd_gpu_devices.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Assign each OpenMPI rank its own AMD GPU (in reverse rank order) and run the
# given command:  mpirun ... set_amd_gpu_devices.sh <app> <args...>
# NOTE(review): this uses the *global* size (OMPI_COMM_WORLD_SIZE) with the
# *local* rank; on multi-node jobs OMPI_COMM_WORLD_LOCAL_SIZE looks intended —
# confirm against the target machine.
mygpu=$(( OMPI_COMM_WORLD_SIZE - OMPI_COMM_WORLD_LOCAL_RANK - 1 ))
export ROCR_VISIBLE_DEVICES=$mygpu
# "$@" (not $*): preserves argument word boundaries and avoids glob expansion
exec "$@"
41 changes: 41 additions & 0 deletions source_files/setup_env_gpu_gnu_ompi_MI300A_COSMA.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/bin/bash
# Environment setup for building OPS (GNU + OpenMPI + HIP) on COSMA MI300A.
# Source this file rather than executing it.

# OPS_COMPILER - gnu
export OPS_COMPILER=gnu

export OPS_INSTALL_PATH=$HOME/repos/OPS/ops
module purge

# Compiler
module load gnu_comp/14.1.0
module load openmpi/5.0.3
module load parallel_hdf5/1.14.4

# MPI setting (single leading slash: a path beginning with exactly "//" is
# implementation-defined in POSIX and confuses some tools)
export MPI_INSTALL_PATH=/cosma/local/openmpi/gnu_14.1.0/5.0.3
export LD_LIBRARY_PATH=$MPI_INSTALL_PATH/lib:$LD_LIBRARY_PATH
# export CPLUS_INCLUDE_PATH=$MPI_INSTALL_PATH/include:$CPLUS_INCLUDE_PATH
# export C_INCLUDE_PATH=$MPI_INSTALL_PATH/include:$C_INCLUDE_PATH
# export CPP_INCLUDE_PATH=$MPI_INSTALL_PATH/include:$CPP_INCLUDE_PATH

# NOTE(review): MPICC is set to the C++ wrapper (mpic++) — confirm intended.
export MPICC=mpic++
export MPICPP=mpic++
export MPICXX=mpicxx
# HIP
export AMD_ARCH=MI300A

# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

export ROCM_PATH=/etc/alternatives/rocm
export LD_LIBRARY_PATH=$ROCM_PATH/llvm/lib:$LD_LIBRARY_PATH
export HIP_INSTALL_PATH=$ROCM_PATH
export AOMP=$ROCM_PATH/llvm

# HDF5
unset HDF5_INSTALL_PATH
export HDF5_INSTALL_PATH=/cosma/local/parallel_hdf5/gnu_14.1.0_ompi_5.0.3/1.14.4/
export LD_LIBRARY_PATH=$HDF5_INSTALL_PATH/lib:$LD_LIBRARY_PATH

# Python
module load python/3.9.19
source "$OPS_INSTALL_PATH/../ops_translator/ops_venv/bin/activate"
Loading