diff --git a/apps/c/poisson/Makefile b/apps/c/poisson/Makefile index a9a57c1ed..aafa636b9 100644 --- a/apps/c/poisson/Makefile +++ b/apps/c/poisson/Makefile @@ -26,7 +26,7 @@ APP=poisson MAIN_SRC=poisson #OPS_GENERATOR_VERBOSE=1 -TARGETS=dev_seq dev_mpi seq tiled openmp mpi mpi_tiled mpi_openmp cuda mpi_cuda mpi_cuda_tiled hip mpi_hip mpi_hip_tiled sycl mpi_sycl mpi_sycl_tiled ompoffload mpi_ompoffload mpi_ompoffload_tiled +TARGETS=hip mpi_hip dev_seq dev_mpi seq tiled openmp mpi mpi_tiled mpi_openmp cuda mpi_cuda mpi_cuda_tiled mpi_hip_tiled sycl mpi_sycl mpi_sycl_tiled ompoffload mpi_ompoffload mpi_ompoffload_tiled #include $(OPS_INSTALL_PATH)/../makefiles/Makefile.c_app_legacy include $(OPS_INSTALL_PATH)/../makefiles/Makefile.c_app diff --git a/makefiles/Makefile.c_app b/makefiles/Makefile.c_app index f0bc96299..9e4afcc48 100644 --- a/makefiles/Makefile.c_app +++ b/makefiles/Makefile.c_app @@ -268,21 +268,21 @@ $(APP)_hip: Makefile .generated $(OPS_INSTALL_PATH)/c/lib/$(OPS_COMPILER)/libops @echo "" @echo "Building ${APP}_hip" @echo "" - $(HIPCC) $(CXXFLAGS_NOOMP) $(HIPFLAGS) -D$(OPS_COMPILER) -I$(C_OPS_INC) $(OPS_FILES_GEN) -I. ./hip/$(HIP_KERNELS) -L$(C_OPS_LIB) $(OPS_LINK) $(OPS_LIB_HIP) $(HDF5_LIB_SEQ) $(HIP_LINK) $(HIPRAND) -o $(APP)_hip + $(HIPCC) $(CXXFLAGS_NOOMP) $(HIPFLAGS) -D$(OPS_COMPILER) -I$(C_OPS_INC) $(OPS_FILES_GEN) -I. ./hip/$(HIP_KERNELS) -L$(C_OPS_LIB) $(OPS_LINK) $(OPS_LIB_HIP) $(HDF5_LIB_SEQ) $(HIP_LINK) $(HIP_LIB) $(HIPRAND) -o $(APP)_hip $(APP)_mpi_hip: Makefile .generated $(OPS_INSTALL_PATH)/c/lib/$(OPS_COMPILER)/libops_mpi_hip.a $(OPS_FILES_GEN) $(HEADERS) @echo "" @echo "" @echo "Building ${APP}_mpi_hip" @echo "" - $(HIPMPICXX) $(CXXFLAGS_NOOMP) $(HIPFLAGS) -D$(OPS_COMPILER) -DOPS_MPI -I$(C_OPS_INC) $(I_MPI_INC) $(OPS_FILES_GEN) -I. 
./hip/$(HIP_KERNELS) -L$(C_OPS_LIB) $(OPS_LINK) $(OPS_LIB_MPI_HIP) $(HDF5_LIB_MPI) $(L_MPI_LIB) $(MPI_LINK) $(MPI_HIP_LINK) $(HIP_LINK) $(HIPRAND) -o $(APP)_mpi_hip + $(HIPMPICXX) $(CXXFLAGS_NOOMP) $(HIPFLAGS) -D$(OPS_COMPILER) -DOPS_MPI -I$(C_OPS_INC) $(I_MPI_INC) $(OPS_FILES_GEN) -I. ./hip/$(HIP_KERNELS) -L$(C_OPS_LIB) $(OPS_LINK) $(OPS_LIB_MPI_HIP) $(HDF5_LIB_MPI) $(L_MPI_LIB) $(MPI_LINK) $(MPI_HIP_LINK) $(HIP_LINK) $(HIP_LIB) $(HIPRAND) -o $(APP)_mpi_hip $(APP)_mpi_hip_tiled: Makefile .generated $(OPS_INSTALL_PATH)/c/lib/$(OPS_COMPILER)/libops_mpi_hip.a $(OPS_FILES_GEN) $(HEADERS) @echo "" @echo "" @echo "Building ${APP}_mpi_hip_tiled" @echo "" - $(HIPMPICXX) $(CXXFLAGS_NOOMP) $(HIPFLAGS) -D$(OPS_COMPILER) -DOPS_MPI -DOPS_LAZY -I$(C_OPS_INC) $(I_MPI_INC) $(OPS_FILES_GEN) -I. ./hip/$(HIP_KERNELS) -L$(C_OPS_LIB) $(OPS_LINK) $(OPS_LIB_MPI_HIP) $(HDF5_LIB_MPI) $(L_MPI_LIB) $(MPI_LINK) $(MPI_HIP_LINK) $(HIP_LINK) $(HIPRAND) -o $(APP)_mpi_hip_tiled + $(HIPMPICXX) $(CXXFLAGS_NOOMP) $(HIPFLAGS) -D$(OPS_COMPILER) -DOPS_MPI -DOPS_LAZY -I$(C_OPS_INC) $(I_MPI_INC) $(OPS_FILES_GEN) -I. 
./hip/$(HIP_KERNELS) -L$(C_OPS_LIB) $(OPS_LINK) $(OPS_LIB_MPI_HIP) $(HDF5_LIB_MPI) $(L_MPI_LIB) $(MPI_LINK) $(MPI_HIP_LINK) $(HIP_LINK) $(HIP_LIB) $(HIPRAND) -o $(APP)_mpi_hip_tiled #===================================================================================================================================================================== diff --git a/makefiles/Makefile.hip b/makefiles/Makefile.hip index 9a4af4839..d86550684 100644 --- a/makefiles/Makefile.hip +++ b/makefiles/Makefile.hip @@ -4,8 +4,16 @@ ifdef IEEE HIPFLAGS += $(HIPIEEE) endif +ifeq ($(OPS_COMPILER),icx) +HIPFLAGS += -lirc -limf +endif + +ifeq ($(OPS_COMPILER),gnu) +MPI_LINK = -lmpi +endif + HIP_INC ?= $(HIP_INSTALL_PATH)/include -I$(HIP_INSTALL_PATH)/../include -HIP_LIB ?= $(HIP_INSTALL_PATH)/lib +HIP_LIB ?= -L$(HIP_INSTALL_PATH)/lib MPI_HIP_LINK ?= -L$(MPI_LIB) -lmpi HIP_LIB += -lrocm_smi64 diff --git a/ops/c/include/ops_instance.h b/ops/c/include/ops_instance.h index afcff7440..979b3c649 100644 --- a/ops/c/include/ops_instance.h +++ b/ops/c/include/ops_instance.h @@ -376,7 +376,7 @@ class OPS_instance { int OPS_soa; int OPS_diags; - // CUDA & OpenCL + // CUDA, HIP & OpenCL int OPS_hybrid_gpu, OPS_gpu_direct; int OPS_block_size_x; int OPS_block_size_y; @@ -385,6 +385,7 @@ class OPS_instance { char *OPS_consts_h, *OPS_consts_d, *OPS_reduct_h, *OPS_reduct_d; int OPS_consts_bytes, OPS_reduct_bytes; int OPS_cl_device; + bool OPS_uvm_device; char *ops_halo_buffer; char *ops_halo_buffer_d; int ops_halo_buffer_size; diff --git a/ops/c/src/core/ops_device_singlenode_common.cpp b/ops/c/src/core/ops_device_singlenode_common.cpp index 0a469dc00..b3f9aff34 100644 --- a/ops/c/src/core/ops_device_singlenode_common.cpp +++ b/ops/c/src/core/ops_device_singlenode_common.cpp @@ -67,7 +67,12 @@ ops_dat ops_decl_dat_char(ops_block block, int size, int *dat_size, int *base, // ops_decl_dat_hdf5() } else { // Allocate memory immediately - dat->data = (char*) ops_malloc(bytes); + if 
(dat->block->instance->OPS_uvm_device) { + // If UVM is enabled, we can use the host pointer directly + ops_device_mallochost(block->instance, (void**)&dat->data, bytes); + } else { + dat->data = (char*) ops_malloc(bytes); + } dat->user_managed = 0; dat->mem = bytes; dat->data_d = NULL; @@ -79,10 +84,17 @@ ops_dat ops_decl_dat_char(ops_block block, int size, int *dat_size, int *base, // block->instance->OPS_hybrid_layout ? //TODO: comes in when batching // block->instance->ops_batch_size : 0); } else { - ops_device_malloc(block->instance, ( void ** ) &( dat->data_d ), bytes); - ops_device_memset(block->instance, ( void ** ) &( dat->data_d ), 0, bytes); + if (block->instance->OPS_uvm_device){ + dat->data_d = dat->data; + } + else + { + // Allocate memory on the device + ops_device_malloc(block->instance, ( void ** ) &( dat->data_d ), bytes); + ops_device_memset(block->instance, ( void ** ) &( dat->data_d ), 0, bytes); + dat->dirty_hd = 2; + } init_deviceptr = 0; - dat->dirty_hd = 2; } if(init_deviceptr) diff --git a/ops/c/src/core/ops_instance.cpp b/ops/c/src/core/ops_instance.cpp index 42c85cd9f..982b208b5 100644 --- a/ops/c/src/core/ops_instance.cpp +++ b/ops/c/src/core/ops_instance.cpp @@ -183,6 +183,7 @@ void OPS_instance::init_globals() { OPS_consts_h=NULL; OPS_consts_d=NULL; OPS_reduct_h=NULL; OPS_reduct_d=NULL; OPS_consts_bytes = 0; OPS_reduct_bytes = 0; OPS_cl_device=0; + OPS_uvm_device=false; ops_halo_buffer = NULL; ops_halo_buffer_d = NULL; ops_halo_buffer_size = 0; diff --git a/ops/c/src/core/ops_lib_core.cpp b/ops/c/src/core/ops_lib_core.cpp index 21025d110..8847a6f81 100644 --- a/ops/c/src/core/ops_lib_core.cpp +++ b/ops/c/src/core/ops_lib_core.cpp @@ -607,7 +607,11 @@ void ops_free_dat_core(ops_dat dat) { } } if(dat->user_managed == 0) { - ops_free(dat->data); + if (OPS_instance::getOPSInstance()->OPS_uvm_device) + { + dat->data = nullptr; // UVM managed memory, no need to free as data_d will be freed + } else + ops_free(dat->data); dat->data = 
nullptr; } ops_free((char*)dat->name); @@ -2269,12 +2273,17 @@ int ops_dat_copy_metadata_core(ops_dat target, ops_dat orig_dat) void ops_cpHostToDevice(OPS_instance *instance, void **data_d, void **data_h, size_t size) { if (instance->OPS_hybrid_gpu == 0) return; if ( *data_d == NULL ) { - ops_device_malloc(instance, data_d, size); + if (instance->OPS_uvm_device) { + *data_d = *data_h; + } else { + ops_device_malloc(instance, data_d, size); + } } if (data_h == NULL || *data_h == NULL) { ops_device_memset(instance, data_d, 0, size); return; } + if (instance->OPS_uvm_device) return; // UVM does not need explicit copy ops_device_memcpy_h2d(instance, data_d, data_h, size); } @@ -2350,8 +2359,11 @@ void ops_put_data(ops_dat dat) { size_t bytes = dat->elem_size; for (int i = 0; i < dat->block->dims; i++) bytes = bytes * dat->size[i]; - ops_device_memcpy_h2d(dat->block->instance, (void**)&dat->data_d, (void**)&dat->data, bytes); - ops_device_sync(dat->block->instance); + if (not dat->block->instance->OPS_uvm_device) + { + ops_device_memcpy_h2d(dat->block->instance, (void**)&dat->data_d, (void**)&dat->data, bytes); + ops_device_sync(dat->block->instance); + } } void ops_randomgen_init_host(unsigned int seed, int options, std::mt19937 &ops_rand_gen) { diff --git a/ops/c/src/hip/ops_hip_common.cpp b/ops/c/src/hip/ops_hip_common.cpp index 7233d5e07..cb8371404 100644 --- a/ops/c/src/hip/ops_hip_common.cpp +++ b/ops/c/src/hip/ops_hip_common.cpp @@ -72,7 +72,15 @@ void ops_init_device(OPS_instance *instance, const int argc, const char *const a cutilDeviceInit(instance, argc, argv); instance->OPS_hybrid_gpu = 1; //hipSafeCall(instance->ostream(),hipDeviceSetCacheConfig(hipFuncCachePreferL1)); - hipDeviceSetCacheConfig(hipFuncCachePreferL1); + hipDeviceProp_t hipDeviceProp; + hipSafeCall(instance->ostream(), hipGetDeviceProperties(&hipDeviceProp, 0)); + + if (hipDeviceProp.integrated) { + instance->OPS_uvm_device = true; + ops_printf("Using integrated GPU with UVM support\n"); + } 
+ + hipSafeCall(instance->ostream(), hipDeviceSetCacheConfig(hipFuncCachePreferL1)); } void ops_device_malloc(OPS_instance *instance, void** ptr, size_t bytes) { @@ -80,7 +88,10 @@ void ops_device_malloc(OPS_instance *instance, void** ptr, size_t bytes) { } void ops_device_mallochost(OPS_instance *instance, void** ptr, size_t bytes) { - hipSafeCall(instance->ostream(), hipHostMalloc(ptr, bytes)); + if (instance->OPS_uvm_device) + hipSafeCall(instance->ostream(), hipMalloc(ptr, bytes)); + else + hipSafeCall(instance->ostream(), hipHostMalloc(ptr, bytes)); } void ops_device_free(OPS_instance *instance, void** ptr) { @@ -94,10 +105,18 @@ void ops_device_freehost(OPS_instance *instance, void** ptr) { } void ops_device_memcpy_h2d(OPS_instance *instance, void** to, void **from, size_t size) { + if (instance->OPS_uvm_device) { + // For UVM, the host pointer and device pointer are the same, so we can skip the copy + return; + } hipSafeCall(instance->ostream(), hipMemcpy(*to, *from, size, hipMemcpyHostToDevice)); } void ops_device_memcpy_d2h(OPS_instance *instance, void** to, void **from, size_t size) { + if (instance->OPS_uvm_device) { + // For UVM, the host pointer and device pointer are the same, so we can skip the copy + return; + } hipSafeCall(instance->ostream(), hipMemcpy(*to, *from, size, hipMemcpyDeviceToHost)); } diff --git a/ops/c/src/mpi/ops_mpi_partition.cpp b/ops/c/src/mpi/ops_mpi_partition.cpp index 129ebf9af..728e37a38 100644 --- a/ops/c/src/mpi/ops_mpi_partition.cpp +++ b/ops/c/src/mpi/ops_mpi_partition.cpp @@ -459,20 +459,37 @@ void ops_decomp_dats(sub_block *sb) { if (dat->data == NULL){ if (dat->is_hdf5 == 0) { // dat->data = (char *)ops_calloc(prod[sb->ndim-1]*dat->elem_size, 1); - dat->data = (char *)ops_malloc(prod[sb->ndim-1]*dat->elem_size*1); if (dat->block->instance->OPS_hybrid_gpu == 0) // CPU-only target + { + dat->data = (char *)ops_malloc(prod[sb->ndim-1]*dat->elem_size*1); ops_init_zero(dat->data, prod[sb->ndim-1]*dat->elem_size*1); - else { 
- ops_device_malloc(dat->block->instance, (void **)&(dat->data_d), prod[sb->ndim-1]*dat->elem_size*1); - ops_device_memset(dat->block->instance, (void **)&(dat->data_d), 0, prod[sb->ndim-1]*dat->elem_size*1); - init_deviceptr = 0; // When device ptr initialized to zero, no need to call HostToDevice copy - dat->dirty_hd = 2; // device dirty bit set to true to trigger DeviceToHost copy + } else { + if (dat->block->instance->OPS_uvm_device){ + // UVM enabled, so no need to copy to device + ops_device_malloc(dat->block->instance, (void **)&(dat->data), prod[sb->ndim-1]*dat->elem_size*1); + dat->data_d = dat->data; + init_deviceptr = 0; // When device ptr initialized to host ptr, no need to call HostToDevice copy + } else{ + dat->data = (char *)ops_malloc(prod[sb->ndim-1]*dat->elem_size*1); + + ops_device_malloc(dat->block->instance, (void **)&(dat->data_d), prod[sb->ndim-1]*dat->elem_size*1); + ops_device_memset(dat->block->instance, (void **)&(dat->data_d), 0, prod[sb->ndim-1]*dat->elem_size*1); + init_deviceptr = 0; // When device ptr initialized to zero, no need to call HostToDevice copy + dat->dirty_hd = 2; // device dirty bit set to true to trigger DeviceToHost copy + } } dat->hdf5_file = "none"; dat->mem = prod[sb->ndim - 1] * dat->elem_size; // this includes the halo sizes } else { - dat->data = (char *)ops_calloc(prod[sb->ndim - 1] * dat->elem_size, 1); + if (dat->block->instance->OPS_hybrid_gpu == 0) // CPU-only target + { + dat->data = (char *)ops_calloc(prod[sb->ndim - 1] * dat->elem_size, 1); + } + else { + ops_device_mallochost(dat->block->instance, (void **)&(dat->data), prod[sb->ndim - 1] * dat->elem_size); + memset((void*)dat->data, 0, prod[sb->ndim - 1] * dat->elem_size); + } dat->mem = prod[sb->ndim - 1] * dat->elem_size; // this includes the halo sizes if (ops_read_dat_hdf5_dynamic == NULL) { diff --git a/scripts/MI300A_Archer2_OPS_env.sh b/scripts/MI300A_Archer2_OPS_env.sh new file mode 100644 index 000000000..c180e46ff --- /dev/null +++ 
b/scripts/MI300A_Archer2_OPS_env.sh @@ -0,0 +1,41 @@ +export OPS_COMPILER=gnu +export OPS_INSTALL_PATH=~/repos/OPS/ops +export OPS_TRANSLATOR=~/repos/OPS/ops_translator + +export USE_HDF5=1 + +module purge + +export AMD_ARCH=MI300A + +module load rocm/6.4.0 +# module load gcc/base +module load openmpi/5.0.7-ucc1.3.0-ucx1.18.0 + +export MPI_INSTALL_PATH=/opt/rocmplus-6.4.0/openmpi-5.0.7-ucc-1.3.0-ucx-1.18.0 +# export LD_LIBRARY_PATH=$MPI_INSTALL_PATH/lib:$LD_LIBRARY_PATH + +if [ $AMD_ARCH = "MI300A" ]; then + export HSA_XNACK=1 # Enable XNACK for MI300A +fi + +export ROCM_PATH=/opt/rocm-6.4.0 +# export LD_LIBRARY_PATH=$ROCM_PATH/llvm/lib:$LD_LIBRARY_PATH +export HIP_INSTALL_PATH=$ROCM_PATH +export AOMP=$ROCM_PATH/llvm + +export MPICC=mpic++ +export MPICPP=mpic++ +export MPICXX=mpicxx + +# export MPICH_GPU_SUPPORT_ENABLED=1 + +export HDF5_INSTALL_PATH=/opt/hdf5-v1.14.5/HDF_Group/HDF5/1.14.5 +export LD_LIBRARY_PATH=$HDF5_INSTALL_PATH/lib:$LD_LIBRARY_PATH + +export PYTHONPATH=$PYTHONPATH:~/repos/opensbli/ + +source $OPS_INSTALL_PATH/../ops_translator/ops_venv/bin/activate + + + diff --git a/scripts/set_amd_cpu_gpu_devices.sh b/scripts/set_amd_cpu_gpu_devices.sh new file mode 100755 index 000000000..8ef6a5018 --- /dev/null +++ b/scripts/set_amd_cpu_gpu_devices.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +export global_rank=${OMPI_COMM_WORLD_RANK} +export local_rank=${OMPI_COMM_WORLD_LOCAL_RANK} +export ranks_per_node=${OMPI_COMM_WORLD_LOCAL_SIZE} + +if [ -z "${NUM_CPUS}" ]; then + let NUM_CPUS=96 +fi + +if [ -z "${RANK_STRIDE}" ]; then + let RANK_STRIDE=$(( ${NUM_CPUS}/${ranks_per_node} )) +fi + +if [ -z "${OMP_STRIDE}" ]; then + let OMP_STRIDE=1 +fi + +if [ -z "${NUM_GPUS}" ]; then + let NUM_GPUS=4 +fi + +if [ -z "${GPU_START}" ]; then + let GPU_START=0 +fi + +if [ -z "${GPU_STRIDE}" ]; then + let GPU_STRIDE=1 +fi + +cpu_list=($(seq 0 95)) +let cpus_per_gpu=${NUM_CPUS}/${NUM_GPUS} +let cpu_start_index=$(( ($RANK_STRIDE*${local_rank})+${GPU_START}*$cpus_per_gpu )) +let 
cpu_start=${cpu_list[$cpu_start_index]} +let cpu_stop=$(($cpu_start+$OMP_NUM_THREADS*$OMP_STRIDE-1)) + +gpu_list=(0 1 2 3) +let ranks_per_gpu=$(((${ranks_per_node}+${NUM_GPUS}-1)/${NUM_GPUS})) +let my_gpu_index=$(($local_rank*$GPU_STRIDE/$ranks_per_gpu))+${GPU_START} +let my_gpu=${gpu_list[${my_gpu_index}]} + +export GOMP_CPU_AFFINITY=$cpu_start-$cpu_stop:$OMP_STRIDE +export ROCR_VISIBLE_DEVICES=$my_gpu + +"$@" \ No newline at end of file diff --git a/scripts/set_amd_gpu_devices.sh b/scripts/set_amd_gpu_devices.sh new file mode 100755 index 000000000..f9f6f9c66 --- /dev/null +++ b/scripts/set_amd_gpu_devices.sh @@ -0,0 +1,4 @@ +#!/bin/bash +let mygpu=${OMPI_COMM_WORLD_LOCAL_SIZE}-${OMPI_COMM_WORLD_LOCAL_RANK}-1 +export ROCR_VISIBLE_DEVICES=$mygpu +exec "$@" \ No newline at end of file diff --git a/source_files/setup_env_gpu_gnu_ompi_MI300A_COSMA.sh b/source_files/setup_env_gpu_gnu_ompi_MI300A_COSMA.sh new file mode 100644 index 000000000..79c7b083a --- /dev/null +++ b/source_files/setup_env_gpu_gnu_ompi_MI300A_COSMA.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# OPS_COMPILER - gnu +export OPS_COMPILER=gnu + +export OPS_INSTALL_PATH=$HOME/repos/OPS/ops +module purge + +# Compiler +module load gnu_comp/14.1.0 +module load openmpi/5.0.3 +module load parallel_hdf5/1.14.4 + +# MPI setting +export MPI_INSTALL_PATH=/cosma/local/openmpi/gnu_14.1.0/5.0.3 +export LD_LIBRARY_PATH=$MPI_INSTALL_PATH/lib:$LD_LIBRARY_PATH +# export CPLUS_INCLUDE_PATH=$MPI_INSTALL_PATH/include:$CPLUS_INCLUDE_PATH +# export C_INCLUDE_PATH=$MPI_INSTALL_PATH/include:$C_INCLUDE_PATH +# export CPP_INCLUDE_PATH=$MPI_INSTALL_PATH/include:$CPP_INCLUDE_PATH + +export MPICC=mpic++ +export MPICPP=mpic++ +export MPICXX=mpicxx +# HIP +export AMD_ARCH=MI300A + +# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +export ROCM_PATH=/etc/alternatives/rocm +export LD_LIBRARY_PATH=$ROCM_PATH/llvm/lib:$LD_LIBRARY_PATH +export HIP_INSTALL_PATH=$ROCM_PATH +export AOMP=$ROCM_PATH/llvm + +# HDF5 +unset HDF5_INSTALL_PATH +export 
HDF5_INSTALL_PATH=/cosma/local/parallel_hdf5/gnu_14.1.0_ompi_5.0.3/1.14.4/ +export LD_LIBRARY_PATH=$HDF5_INSTALL_PATH/lib:$LD_LIBRARY_PATH + +# Python +module load python/3.9.19 +source $OPS_INSTALL_PATH/../ops_translator/ops_venv/bin/activate diff --git a/source_files/setup_env_gpu_icx_ompi_MI300A_COSMA.sh b/source_files/setup_env_gpu_icx_ompi_MI300A_COSMA.sh new file mode 100644 index 000000000..5cf42d0fd --- /dev/null +++ b/source_files/setup_env_gpu_icx_ompi_MI300A_COSMA.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# OPS_COMPILER - icx +export OPS_COMPILER=icx + +export OPS_INSTALL_PATH=$HOME/repos/OPS/ops +module purge + +# Compiler +module load intel_comp/2024.2.0 +module load compiler-rt tbb compiler openmpi/5.0.3 ucx dpl dpct +module load parallel_hdf5/1.14.4 + +# MPI setting +export MPI_INSTALL_PATH=/cosma/local/openmpi/intel_2024.2.0/5.0.3 +export LD_LIBRARY_PATH=$MPI_INSTALL_PATH/lib:$LD_LIBRARY_PATH + +export MPICC=mpicc +export MPICPP=mpiCC +export MPICXX=mpicxx +export MPIFC=mpiifort +export MPIF90=mpiifort +# HIP +export AMD_ARCH=MI300A + +# export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +export ROCM_PATH=/opt/rocm +export LD_LIBRARY_PATH=$ROCM_PATH/llvm/lib:$LD_LIBRARY_PATH +export HIP_INSTALL_PATH=$ROCM_PATH +export AOMP=$ROCM_PATH/llvm + +# HDF5 +unset HDF5_INSTALL_PATH +export HDF5_INSTALL_PATH=/cosma/local/parallel-hdf5/intel_2024.2.0_intel_mpi_2024.2.0/1.14.4 +export LD_LIBRARY_PATH=$HDF5_INSTALL_PATH/lib:$LD_LIBRARY_PATH + +# Python +module load python/3.9.19 +source $OPS_INSTALL_PATH/../ops_translator/ops_venv/bin/activate