Fixes and improvements from LLVM/XLC work. (#716)

olupton · web-flow · commit 781d34f615c2 · 2021-12-14T10:31:57.000+01:00
Code fixes for XLC and Clang execution without build system changes.
This mainly adds missing OpenMP pragmas and makes cnrn_target_
wrappers visible to NMODL.
diff --git a/CMake/MakefileBuildOptions.cmake b/CMake/MakefileBuildOptions.cmake
@@ -75,6 +75,7 @@ string(TOUPPER "${CMAKE_BUILD_TYPE}" _BUILD_TYPE)
 set(CORENRN_CXX_FLAGS
     "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${_BUILD_TYPE}} ${CXX14_STD_FLAGS} ${NVHPC_ACC_COMP_FLAGS} ${NVHPC_CXX_INLINE_FLAGS}"
 )
+set(CORENRN_LD_FLAGS "${NVHPC_ACC_LINK_FLAGS}")
 
 # =============================================================================
 # nmodl/mod2c related options : TODO
diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -24,16 +24,14 @@
 #include "coreneuron/mpi/nrnmpidec.h"
 #include "coreneuron/utils/utils.hpp"
 
+#ifdef CRAYPAT
+#include <pat_api.h>
+#endif
+
 #ifdef _OPENACC
 #include <openacc.h>
 #endif
-#ifdef CORENEURON_PREFER_OPENMP_OFFLOAD
-#include <omp.h>
-#endif
 
-#ifdef CRAYPAT
-#include <pat_api.h>
-#endif
 namespace coreneuron {
 extern InterleaveInfo* interleave_info;
 void copy_ivoc_vect_to_device(const IvocVect& iv, IvocVect& div);
@@ -43,51 +41,6 @@ void nrn_ion_global_map_delete_from_device();
 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay);
 void nrn_VecPlay_delete_from_device(NrnThread* nt);
 
-template <typename T>
-T* cnrn_target_deviceptr(const T* h_ptr) {
-#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
-#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    return static_cast<T*>(omp_get_mapped_ptr(const_cast<T*>(h_ptr), omp_get_default_device()));
-#else
-    throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
-#endif
-}
-
-template <typename T>
-T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) {
-#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    return static_cast<T*>(acc_copyin(const_cast<T*>(h_ptr), len * sizeof(T)));
-#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    #pragma omp target enter data map(to:h_ptr[:len])
-    return cnrn_target_deviceptr(const_cast<T*>(h_ptr));
-#else
-    throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build");
-#endif
-}
-
-template <typename T>
-void cnrn_target_delete(T* h_ptr, std::size_t len = 1) {
-#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    acc_delete(h_ptr, len * sizeof(T));
-#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    #pragma omp target exit data map(delete: h_ptr[:len])
-#else
-    throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
-#endif
-}
-
-template <typename T>
-void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) {
-#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
-    acc_memcpy_to_device(d_ptr, const_cast<T*>(h_ptr), len * sizeof(T));
-#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
-    omp_target_memcpy(d_ptr, const_cast<T*>(h_ptr), len* sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device());
-#else
-    throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
-#endif
-}
-
 /* note: threads here are corresponding to global nrn_threads array */
 void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
 #ifdef _OPENACC
diff --git a/coreneuron/kinderiv.py b/coreneuron/kinderiv.py
@@ -59,6 +59,9 @@ def write_out_kinderiv(fout):
     fout.write("\n/* declarations */\n")
     fout.write("\nnamespace coreneuron {\n")
 
+    if deriv or kin or euler:
+        fout.write('nrn_pragma_omp(declare target)\n')
+
     for item in deriv:
         fout.write('#pragma acc routine seq\n')
         fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1]))
@@ -73,6 +76,9 @@ def write_out_kinderiv(fout):
         fout.write('#pragma acc routine seq\n')
         fout.write('extern int %s%s(_threadargsproto_);\n' % (item[0], item[1]))
 
+    if deriv or kin or euler:
+        fout.write('nrn_pragma_omp(end declare target)\n')
+
     fout.write("\n/* callback indices */\n")
     derivoffset = 1
     kinoffset = 1
diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp
@@ -177,6 +177,7 @@ double nrn_nernst(double ci, double co, double z, double celsius) {
     }
 }
 
+nrn_pragma_omp(declare target)
 void nrn_wrote_conc(int type,
                     double* p1,
                     int p2,
@@ -193,6 +194,7 @@ void nrn_wrote_conc(int type,
         pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius);
     }
 }
+nrn_pragma_omp(end declare target)
 
 static double efun(double x) {
     if (fabs(x) < 1e-4) {
diff --git a/coreneuron/mechanism/mech/dimplic.cpp b/coreneuron/mechanism/mech/dimplic.cpp
@@ -24,6 +24,7 @@
 #include "coreneuron/mechanism/mech/mod2c_core_thread.hpp"
 #include "_kinderiv.h"
 namespace coreneuron {
+nrn_pragma_omp(declare target)
 int derivimplicit_thread(int n, int* slist, int* dlist, DIFUN fun, _threadargsproto_) {
     difun(fun);
     return 0;
@@ -48,5 +49,6 @@ int nrn_kinetic_steer(int fun, SparseObj* so, double* rhs, _threadargsproto_) {
     switch (fun) { _NRN_KINETIC_CASES }
     return 0;
 }
+nrn_pragma_omp(end declare target)
 
 }  // namespace coreneuron
diff --git a/coreneuron/mechanism/register_mech.cpp b/coreneuron/mechanism/register_mech.cpp
@@ -19,7 +19,9 @@
 
 namespace coreneuron {
 int secondorder = 0;
+nrn_pragma_omp(declare target)
 double t, dt, celsius, pi;
+nrn_pragma_omp(end declare target)
 int rev_dt;
 
 using Pfrv = void (*)();
diff --git a/coreneuron/network/cvodestb.cpp b/coreneuron/network/cvodestb.cpp
@@ -86,12 +86,14 @@ void fixed_play_continuous(NrnThread* nt) {
 
 // NOTE : this implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc"
 // for the ISPC backend. If changes are required, make sure to change ISPC as well.
+nrn_pragma_omp(declare target)
 int at_time(NrnThread* nt, double te) {
     double x = te - 1e-11;
     if (x <= nt->_t && x > (nt->_t - nt->_dt)) {
         return 1;
     }
     return 0;
 }
+nrn_pragma_omp(end declare target)
 
 }  // namespace coreneuron
diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp
@@ -537,7 +537,7 @@ void NetCvode::check_thresh(NrnThread* nt) {  // for default method
     nrn_pragma_acc(parallel loop present(
         nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end])
                        copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd map(tofrom: net_send_buf_count) if(nt->compute_gpu))
+    nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu))
     for (int i = 0; i < nt->ncell; ++i) {
         PreSyn* ps = presyns + i;
         PreSynHelper* psh = presyns_helper + i;
diff --git a/coreneuron/sim/scopmath/crout_thread.cpp b/coreneuron/sim/scopmath/crout_thread.cpp
@@ -50,6 +50,7 @@ namespace coreneuron {
 #define ix(arg) ((arg) *_STRIDE)
 
 /* having a differnt permutation per instance may not be a good idea */
+nrn_pragma_omp(declare target)
 int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_) {
     int save_i = 0;
 
@@ -224,4 +225,5 @@ void nrn_scopmath_solve_thread(int n,
         }
     }
 }
+nrn_pragma_omp(end declare target)
 }  // namespace coreneuron
diff --git a/coreneuron/sim/scopmath/newton_thread.cpp b/coreneuron/sim/scopmath/newton_thread.cpp
@@ -59,6 +59,7 @@ namespace coreneuron {
 #define ix(arg) ((arg) *_STRIDE)
 #define s_(arg) _p[s[arg] * _STRIDE]
 
+nrn_pragma_omp(declare target)
 int nrn_newton_thread(NewtonSpace* ns,
                       int n,
                       int* s,
@@ -136,6 +137,7 @@ int nrn_newton_thread(NewtonSpace* ns,
 
     return (error);
 }
+nrn_pragma_omp(end declare target)
 
 /*------------------------------------------------------------*/
 /*                                                            */
diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp
@@ -34,7 +34,7 @@ static void nrn_rhs(NrnThread* _nt) {
 
     nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu)
                        async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
     for (int i = i1; i < i3; ++i) {
         vec_rhs[i] = 0.;
         vec_d[i] = 0.;
@@ -46,7 +46,7 @@ static void nrn_rhs(NrnThread* _nt) {
         nrn_pragma_acc(
             parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu)
                 async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             fast_imem_d[i] = 0.;
             fast_imem_rhs[i] = 0.;
@@ -76,7 +76,7 @@ static void nrn_rhs(NrnThread* _nt) {
         double* p = _nt->nrn_fast_imem->nrn_sav_rhs;
         nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu)
                            async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             p[i] -= vec_rhs[i];
         }
@@ -93,7 +93,7 @@ static void nrn_rhs(NrnThread* _nt) {
                                          vec_v [0:i3],
                                          parent_index [0:i3]) if (_nt->compute_gpu)
                        async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
     for (int i = i2; i < i3; ++i) {
         double dv = vec_v[parent_index[i]] - vec_v[i];
         /* our connection coefficients are negative so */
@@ -153,7 +153,7 @@ static void nrn_lhs(NrnThread* _nt) {
         */
         double* p = _nt->nrn_fast_imem->nrn_sav_d;
         nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
         for (int i = i1; i < i3; ++i) {
             p[i] += vec_d[i];
         }
@@ -163,7 +163,7 @@ static void nrn_lhs(NrnThread* _nt) {
     nrn_pragma_acc(parallel loop present(
         vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu)
                        async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
     for (int i = i2; i < i3; ++i) {
         nrn_pragma_acc(atomic update)
         nrn_pragma_omp(atomic update)
diff --git a/coreneuron/utils/offload.hpp b/coreneuron/utils/offload.hpp
@@ -10,11 +10,64 @@
 #if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
 #define nrn_pragma_acc(x)
 #define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
+#include <omp.h>
 #elif defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENACC)
 #define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
 #define nrn_pragma_omp(x)
+#include <openacc.h>
 #else
 #define nrn_pragma_acc(x)
 #define nrn_pragma_omp(x)
+#include <stdexcept>
 #endif
+
+#include <cstddef>
+
+namespace coreneuron {
+template <typename T>
+T* cnrn_target_deviceptr(const T* h_ptr) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    return static_cast<T*>(acc_deviceptr(const_cast<T*>(h_ptr)));
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    return static_cast<T*>(omp_get_mapped_ptr(const_cast<T*>(h_ptr), omp_get_default_device()));
+#else
+    throw std::runtime_error("cnrn_target_deviceptr() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
+template <typename T>
+T* cnrn_target_copyin(const T* h_ptr, std::size_t len = 1) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    return static_cast<T*>(acc_copyin(const_cast<T*>(h_ptr), len * sizeof(T)));
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    #pragma omp target enter data map(to:h_ptr[:len])
+    return cnrn_target_deviceptr(const_cast<T*>(h_ptr));
+#else
+    throw std::runtime_error("cnrn_target_copyin() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
+template <typename T>
+void cnrn_target_delete(T* h_ptr, std::size_t len = 1) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    acc_delete(h_ptr, len * sizeof(T));
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    #pragma omp target exit data map(delete: h_ptr[:len])
+#else
+    throw std::runtime_error("cnrn_target_delete() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
+template <typename T>
+void cnrn_target_memcpy_to_device(T* d_ptr, const T* h_ptr, std::size_t len = 1) {
+#if defined(CORENEURON_ENABLE_GPU) && !defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENACC)
+    acc_memcpy_to_device(d_ptr, const_cast<T*>(h_ptr), len * sizeof(T));
+#elif defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && defined(_OPENMP)
+    omp_target_memcpy(d_ptr, const_cast<T*>(h_ptr), len* sizeof(T), 0, 0, omp_get_default_device(), omp_get_initial_device());
+#else
+    throw std::runtime_error("cnrn_target_memcpy_to_device() not implemented without OpenACC/OpenMP and gpu build");
+#endif
+}
+
+}
diff --git a/extra/nrnivmodl_core_makefile.in b/extra/nrnivmodl_core_makefile.in
@@ -73,8 +73,8 @@ endif
 
 CXXFLAGS = @CORENRN_CXX_FLAGS@
 CXX_COMPILE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_CXX_COMPILE_OPTIONS_PIC@ @CORENRN_COMMON_COMPILE_DEFS@ $(INCLUDES)
-CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CMAKE_EXE_LINKER_FLAGS@
-CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@
+CXX_LINK_EXE_CMD = $(CXX) $(CXXFLAGS) @CORENRN_LD_FLAGS@ @CMAKE_EXE_LINKER_FLAGS@
+CXX_SHARED_LIB_CMD = $(CXX) $(CXXFLAGS) @CORENRN_LD_FLAGS@ @CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS@ @CMAKE_SHARED_LIBRARY_CXX_FLAGS@ @CMAKE_SHARED_LINKER_FLAGS@
 
 # ISPC compilation and link commands
 ISPC = @CMAKE_ISPC_COMPILER@

Original file line number	Diff line number	Diff line change
`@@ -75,6 +75,7 @@ string(TOUPPER "${CMAKE_BUILD_TYPE}" _BUILD_TYPE)`
`75`	`75`	`set(CORENRN_CXX_FLAGS`
`76`	`76`	`"${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${_BUILD_TYPE}} ${CXX14_STD_FLAGS} ${NVHPC_ACC_COMP_FLAGS} ${NVHPC_CXX_INLINE_FLAGS}"`
`77`	`77`	`)`
	`78`	`+set(CORENRN_LD_FLAGS "${NVHPC_ACC_LINK_FLAGS}")`
`78`	`79`
`79`	`80`	`# =============================================================================`
`80`	`81`	`# nmodl/mod2c related options : TODO`
Original file line number	Diff line number	Diff line change
`@@ -177,6 +177,7 @@ double nrn_nernst(double ci, double co, double z, double celsius) {`
`177`	`177`	`}`
`178`	`178`	`}`
`179`	`179`
	`180`	`+nrn_pragma_omp(declare target)`
`180`	`181`	`void nrn_wrote_conc(int type,`
`181`	`182`	`double* p1,`
`182`	`183`	`int p2,`
`@@ -193,6 +194,7 @@ void nrn_wrote_conc(int type,`
`193`	`194`	`pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius);`
`194`	`195`	`}`
`195`	`196`	`}`
	`197`	`+nrn_pragma_omp(end declare target)`
`196`	`198`
`197`	`199`	`static double efun(double x) {`
`198`	`200`	`if (fabs(x) < 1e-4) {`
Original file line number	Diff line number	Diff line change
`@@ -86,12 +86,14 @@ void fixed_play_continuous(NrnThread* nt) {`
`86`	`86`
`87`	`87`	`// NOTE : this implementation is duplicated in "coreneuron/mechanism/nrnoc_ml.ispc"`
`88`	`88`	`// for the ISPC backend. If changes are required, make sure to change ISPC as well.`
	`89`	`+nrn_pragma_omp(declare target)`
`89`	`90`	`int at_time(NrnThread* nt, double te) {`
`90`	`91`	`double x = te - 1e-11;`
`91`	`92`	`if (x <= nt->_t && x > (nt->_t - nt->_dt)) {`
`92`	`93`	`return 1;`
`93`	`94`	`}`
`94`	`95`	`return 0;`
`95`	`96`	`}`
	`97`	`+nrn_pragma_omp(end declare target)`
`96`	`98`
`97`	`99`	`} // namespace coreneuron`
Original file line number	Diff line number	Diff line change
`@@ -50,6 +50,7 @@ namespace coreneuron {`
`50`	`50`	`#define ix(arg) ((arg) *_STRIDE)`
`51`	`51`
`52`	`52`	`/* having a differnt permutation per instance may not be a good idea */`
	`53`	`+nrn_pragma_omp(declare target)`
`53`	`54`	`int nrn_crout_thread(NewtonSpace* ns, int n, double** a, int* perm, _threadargsproto_) {`
`54`	`55`	`int save_i = 0;`
`55`	`56`
`@@ -224,4 +225,5 @@ void nrn_scopmath_solve_thread(int n,`
`224`	`225`	`}`
`225`	`226`	`}`
`226`	`227`	`}`
	`228`	`+nrn_pragma_omp(end declare target)`
`227`	`229`	`} // namespace coreneuron`