From 45531faa2e1984d89e4593457ad4fcd128a7dd24 Mon Sep 17 00:00:00 2001
From: Olli Lupton
Date: Mon, 13 Dec 2021 15:57:24 +0100
Subject: [PATCH 1/3] Code fixes for XLC and Clang execution. Build system
 changes are not included for now.

---
 CMake/MakefileBuildOptions.cmake          |  1 +
 coreneuron/gpu/nrn_acc_manager.cpp        | 52 -----------------------
 coreneuron/kinderiv.py                    |  6 +++
 coreneuron/mechanism/eion.cpp             |  2 +
 coreneuron/mechanism/mech/dimplic.cpp     |  2 +
 coreneuron/mechanism/register_mech.cpp    |  2 +
 coreneuron/network/cvodestb.cpp           |  2 +
 coreneuron/network/netcvode.cpp           |  2 +-
 coreneuron/sim/scopmath/crout_thread.cpp  |  2 +
 coreneuron/sim/scopmath/newton_thread.cpp |  2 +
 coreneuron/sim/treeset_core.cpp           | 12 +++---
 coreneuron/utils/offload.hpp              | 50 ++++++++++++++++++++++
 extra/nrnivmodl_core_makefile.in          |  4 +-
 13 files changed, 78 insertions(+), 61 deletions(-)

diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake
index fc0b0b551..009dd3215 100644
--- a/CMake/MakefileBuildOptions.cmake
+++ b/CMake/MakefileBuildOptions.cmake
@@ -75,6 +75,7 @@ string(TOUPPER "${CMAKE_BUILD_TYPE}" _BUILD_TYPE)
 set(CORENRN_CXX_FLAGS
     "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${_BUILD_TYPE}} ${CXX14_STD_FLAGS} ${NVHPC_ACC_COMP_FLAGS} ${NVHPC_CXX_INLINE_FLAGS}"
 )
+set(CORENRN_LD_FLAGS "${NVHPC_ACC_LINK_FLAGS}")
 
 # =============================================================================
 # nmodl/mod2c related options : TODO
diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index 4fe0004fd..cc0a2bdee 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -24,13 +24,6 @@
 #include "coreneuron/mpi/nrnmpidec.h"
 #include "coreneuron/utils/utils.hpp"
 
-#ifdef _OPENACC
-#include <openacc.h>
-#endif
-#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
-#include <omp.h>
-#endif
-
 #ifdef CRAYPAT
 #include <pat_api.h>
 #endif
@@ -43,51 +36,6 @@
 void nrn_ion_global_map_delete_from_device();
 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
 void nrn_VecPlay_delete_from_device(NrnThread* nt);
-template <typename T>
-T* cnrn_target_deviceptr(const T* h_ptr) {
-#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
-#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    return static_cast<T*>(omp_get_mapped_ptr(const_cast<T*>(h_ptr), omp_get_default_device()));
-#else
-    throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
-#endif
-}
-
-template <typename T>
-T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) {
-#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return static_cast<T*>(acc_copyin(const_cast<T*>(h_ptr), len * sizeof(T)));
-#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    #pragma omp target enter data map(to:h_ptr[:len])
-    return cnrn_target_deviceptr(const_cast<T*>(h_ptr));
-#else
-    throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build");
-#endif
-}
-
-template <typename T>
-void cnrn_target_delete(T* h_ptr, std::size_t len = 1) {
-#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    acc_delete(h_ptr, len * sizeof(T));
-#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    #pragma omp target exit data map(delete: h_ptr[:len])
-#else
-    throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
-#endif
-}
-
-template <typename T>
-void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) {
-#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    acc_memcpy_to_device(d_ptr, const_cast<T*>(h_ptr), len * sizeof(T));
-#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    omp_target_memcpy(d_ptr, const_cast<T*>(h_ptr), len* sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device());
-#else
-    throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
-#endif
-}
-
 /* note: threads here are corresponding to global nrn_threads array */
 void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
 #ifdef _OPENACC
diff --git a/coreneuron/kinderiv.py b/coreneuron/kinderiv.py
index 35158908c..9b143c0cf 100644
--- a/coreneuron/kinderiv.py
+++ b/coreneuron/kinderiv.py
@@ -59,6 +59,9 @@ def write_out_kinderiv(fout):
     fout.write("\n/* declarations */\n")
     fout.write("\nnamespace coreneuron {\n")
 
+    if deriv or kin or euler:
+        fout.write('nrn_pragma_omp(declare target)\n')
+
     for item in deriv:
         fout.write('#pragma acc routine seq\n')
         fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1]))
@@ -73,6 +76,9 @@ def write_out_kinderiv(fout):
         fout.write('#pragma acc routine seq\n')
         fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1]))
 
+    if deriv or kin or euler:
+        fout.write('nrn_pragma_omp(end declare target)\n')
+
     fout.write("\n/* callback indices */\n")
     derivoffset = 1
     kinoffset = 1
diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp
index 727f30ea6..6cb3cf83d 100644
--- a/coreneuron/mechanism/eion.cpp
+++ b/coreneuron/mechanism/eion.cpp
@@ -177,6 +177,7 @@ double nrn_nernst(double ci, double co, double z, double celsius) {
     }
 }
 
+nrn_pragma_omp(declare target)
 void nrn_wrote_conc(int type,
                     double* p1,
                     int p2,
@@ -193,6 +194,7 @@ void nrn_wrote_conc(int type,
         pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius);
     }
 }
+nrn_pragma_omp(end declare target)
 
 static double efun(double x) {
     if (fabs(x) < 1e-4) {
diff --git a/coreneuron/mechanism/mech/dimplic.cpp b/coreneuron/mechanism/mech/dimplic.cpp
index e3b08207e..de8970560 100644
--- a/coreneuron/mechanism/mech/dimplic.cpp
+++ b/coreneuron/mechanism/mech/dimplic.cpp
@@ -24,6 +24,7 @@
 #include "coreneuron/mechanism/mech/mod2c_core_thread.hpp"
 #include "_kinderiv.h"
 namespace coreneuron {
+nrn_pragma_omp(declare target)
 int derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_) {
     difun(fun);
     return 0;
@@ -48,5 +49,6 @@ int nrn_kinetic_steer(int fun, SparseObj* so, double* rhs, _threadargsproto_) {
     switch (fun) { _NRN_KINETIC_CASES }
     return 0;
 }
+nrn_pragma_omp(end declare target)
 
 }  // namespace coreneuron
diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp
index a8bff7a50..433140b82 100644
--- a/coreneuron/mechanism/register_mech.cpp
+++ b/coreneuron/mechanism/register_mech.cpp
@@ -19,7 +19,9 @@
 namespace coreneuron {
 int secondorder = 0;
+nrn_pragma_omp(declare target)
 double t, dt, celsius, pi;
+nrn_pragma_omp(end declare target)
 int rev_dt;
 
 using Pfrv = void (*)();
diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp
index 31b2fec54..97c70950e 100644
--- a/coreneuron/network/cvodestb.cpp
+++ b/coreneuron/network/cvodestb.cpp
@@ -86,6 +86,7 @@ void fixed_play_continuous(NrnThread* nt) {
 
 // NOTE : this implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc"
 // for the ISPC backend. If changes are required, make sure to change ISPC as well.
+nrn_pragma_omp(declare target)
 int at_time(NrnThread* nt, double te) {
     double x = te - 1e-11;
     if (x <= nt->_t && x > (nt->_t - nt->_dt)) {
@@ -93,5 +94,6 @@ int at_time(NrnThread* nt, double te) {
     }
     return 0;
 }
+nrn_pragma_omp(end declare target)
 
 }  // namespace coreneuron
diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp
index ee2e5cb3e..dd521afde 100644
--- a/coreneuron/network/netcvode.cpp
+++ b/coreneuron/network/netcvode.cpp
@@ -537,7 +537,7 @@ void NetCvode::check_thresh(NrnThread* nt) {  // for default method
     nrn_pragma_acc(parallel loop present(
         nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end])
                        copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd map(tofrom: net_send_buf_count) if(nt->compute_gpu))
+    nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu))
     for (int i = 0; i < nt->ncell; ++i) {
         PreSyn* ps = presyns + i;
         PreSynHelper* psh = presyns_helper + i;
diff --git a/coreneuron/sim/scopmath/crout_thread.cpp b/coreneuron/sim/scopmath/crout_thread.cpp
index b180ea107..72a5c017f 100644
--- a/coreneuron/sim/scopmath/crout_thread.cpp
+++ b/coreneuron/sim/scopmath/crout_thread.cpp
@@ -50,6 +50,7 @@ namespace coreneuron {
 #define ix(arg) ((arg) *_STRIDE)
 
 /* having a differnt permutation per instance may not be a good idea */
+nrn_pragma_omp(declare target)
 int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_) {
     int save_i = 0;
 
@@ -224,4 +225,5 @@ void nrn_scopmath_solve_thread(int n,
         }
     }
 }
+nrn_pragma_omp(end declare target)
 }  // namespace coreneuron
diff --git a/coreneuron/sim/scopmath/newton_thread.cpp b/coreneuron/sim/scopmath/newton_thread.cpp
index 6c0f303ce..dc08ca04b 100644
--- a/coreneuron/sim/scopmath/newton_thread.cpp
+++ b/coreneuron/sim/scopmath/newton_thread.cpp
@@ -59,6 +59,7 @@ namespace coreneuron {
 #define ix(arg) ((arg) *_STRIDE)
 #define s_(arg) _p[s[arg] * _STRIDE]
 
+nrn_pragma_omp(declare target)
 int nrn_newton_thread(NewtonSpace* ns,
                       int n,
                       int* s,
@@ -136,6 +137,7 @@ int nrn_newton_thread(NewtonSpace* ns,
 
     return (error);
 }
+nrn_pragma_omp(end declare target)
 
 /*------------------------------------------------------------*/
 /*                                                            */
diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp
index bb92d2ab1..208058fe1 100644
--- a/coreneuron/sim/treeset_core.cpp
+++ b/coreneuron/sim/treeset_core.cpp
@@ -34,7 +34,7 @@ static void nrn_rhs(NrnThread* _nt) {
 
     nrn_pragma_acc(parallel loop present(vec_rhs [0:i3],
                                          vec_d [0:i3]) if (_nt->compute_gpu) async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
     for (int i = i1; i < i3; ++i) {
         vec_rhs[i] = 0.;
         vec_d[i] = 0.;
@@ -46,7 +46,7 @@ static void nrn_rhs(NrnThread* _nt) {
         nrn_pragma_acc(
             parallel loop present(fast_imem_d [i1:i3],
                                   fast_imem_rhs [i1:i3]) if (_nt->compute_gpu) async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             fast_imem_d[i] = 0.;
             fast_imem_rhs[i] = 0.;
@@ -76,7 +76,7 @@ static void nrn_rhs(NrnThread* _nt) {
         double* p = _nt->nrn_fast_imem->nrn_sav_rhs;
         nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu)
                            async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             p[i] -= vec_rhs[i];
         }
@@ -93,7 +93,7 @@ static void nrn_rhs(NrnThread* _nt) {
                                          vec_v [0:i3],
                                          parent_index [0:i3]) if (_nt->compute_gpu)
                        async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
     for (int i = i2; i < i3; ++i) {
         double dv = vec_v[parent_index[i]] - vec_v[i];
         /* our connection coefficients are negative so */
@@ -153,7 +153,7 @@ static void nrn_lhs(NrnThread* _nt) {
          */
         double* p = _nt->nrn_fast_imem->nrn_sav_d;
         nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             p[i] += vec_d[i];
         }
@@ -163,7 +163,7 @@ static void nrn_lhs(NrnThread* _nt) {
     nrn_pragma_acc(parallel loop present(
         vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu)
                        async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
     for (int i = i2; i < i3; ++i) {
         nrn_pragma_acc(atomic update)
         nrn_pragma_omp(atomic update)
diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
index d90cc10fd..0a5806ec9 100644
--- a/coreneuron/utils/offload.hpp
+++ b/coreneuron/utils/offload.hpp
@@ -10,11 +10,61 @@
 #if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
 #define nrn_pragma_acc(x)
 #define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
+#include <omp.h>
 #elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENACC)
 #define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
 #define nrn_pragma_omp(x)
+#include <openacc.h>
 #else
 #define nrn_pragma_acc(x)
 #define nrn_pragma_omp(x)
 #endif
+
+namespace coreneuron {
+template <typename T>
+T* cnrn_target_deviceptr(const T* h_ptr) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    return static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    return static_cast<T*>(omp_get_mapped_ptr(const_cast<T*>(h_ptr), omp_get_default_device()));
+#else
+    throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
+template <typename T>
+T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    return static_cast<T*>(acc_copyin(const_cast<T*>(h_ptr), len * sizeof(T)));
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    #pragma omp target enter data map(to:h_ptr[:len])
+    return cnrn_target_deviceptr(const_cast<T*>(h_ptr));
+#else
+    throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
+template <typename T>
+void cnrn_target_delete(T* h_ptr, std::size_t len = 1) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    acc_delete(h_ptr, len * sizeof(T));
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    #pragma omp target exit data map(delete: h_ptr[:len])
+#else
+    throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
+template <typename T>
+void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    acc_memcpy_to_device(d_ptr, const_cast<T*>(h_ptr), len * sizeof(T));
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    omp_target_memcpy(d_ptr, const_cast<T*>(h_ptr), len* sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device());
+#else
+    throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
+}
diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
index 5bd424865..f51571ae8 100644
--- a/extra/nrnivmodl_core_makefile.in
+++ b/extra/nrnivmodl_core_makefile.in
@@ -73,8 +73,8 @@ endif
 CXXFLAGS = @CORENRN_CXX_FLAGS@
 
 CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ @CORENRN_COMMON_COMPILE_DEFS@ $(INCLUDES)
-CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_EXE_LINKER_FLAGS@
-CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@
+CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CORENRN_LD_FLAGS@ @CMAKE_EXE_LINKER_FLAGS@
+CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CORENRN_LD_FLAGS@ @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@
 
 # ISPC compilation and link commands
 ISPC = @CMAKE_ISPC_COMPILER@

From 39b3523558fdfd32542128aa519b742878cbaff9 Mon Sep 17 00:00:00 2001
From: Olli Lupton
Date: Mon, 13 Dec 2021 16:02:16 +0100
Subject: [PATCH 2/3] Add missing #includes.

---
 coreneuron/utils/offload.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
index 0a5806ec9..7ec41f4f4 100644
--- a/coreneuron/utils/offload.hpp
+++ b/coreneuron/utils/offload.hpp
@@ -19,8 +19,11 @@
 #else
 #define nrn_pragma_acc(x)
 #define nrn_pragma_omp(x)
+#include <stdexcept>
 #endif
 
+#include <cstddef>
+
 namespace coreneuron {
 template <typename T>
 T* cnrn_target_deviceptr(const T* h_ptr) {

From e291683db9ff656badd9929d5fe86f0fbc33ad8b Mon Sep 17 00:00:00 2001
From: Olli Lupton
Date: Mon, 13 Dec 2021 22:27:48 +0100
Subject: [PATCH 3/3] Fixup for NVHPC + OpenMP builds.

---
 coreneuron/gpu/nrn_acc_manager.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index cc0a2bdee..6676448eb 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -27,6 +27,11 @@
 #ifdef CRAYPAT
 #include <pat_api.h>
 #endif
+
+#ifdef _OPENACC
+#include <openacc.h>
+#endif
+
 namespace coreneuron {
 extern InterleaveInfo* interleave_info;
 void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div);
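
Usage note (illustrative only, not part of the patches above): the cnrn_target_* helpers that patch 1 moves into coreneuron/utils/offload.hpp are called from host code in roughly the pattern sketched below. This is a minimal sketch assuming a GPU-enabled build in which one of the OpenACC/OpenMP branches is active; the struct Example and the functions copy_example_to_device / delete_example_from_device are hypothetical names introduced here and do not exist in CoreNEURON.

// Hypothetical sketch of how the offload.hpp helpers are used; the names
// Example, copy_example_to_device and delete_example_from_device are invented.
#include "coreneuron/utils/offload.hpp"

namespace coreneuron {

struct Example {
    double* data;    // host pointer; the device copy must be patched separately
    std::size_t n;
};

Example* copy_example_to_device(Example& h) {
    // Map the struct itself, then its payload, onto the device.
    Example* d_ex = cnrn_target_copyin(&h);
    double* d_data = cnrn_target_copyin(h.data, h.n);
    // Overwrite the device-side 'data' member so it points at device memory;
    // &d_ex->data is only used to form an address, it is not dereferenced on the host.
    cnrn_target_memcpy_to_device(&d_ex->data, &d_data);
    return d_ex;
}

void delete_example_from_device(Example& h) {
    // Release mappings in reverse order.
    cnrn_target_delete(h.data, h.n);
    cnrn_target_delete(&h);
}

}  // namespace coreneuron

The copyin-then-pointer-patch sequence mirrors what setup_nrnthreads_on_device() in nrn_acc_manager.cpp already does with the OpenACC/OpenMP runtime calls that these templates wrap.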