From f1a5f619b1a553816b940d3e452d7fae105514c7 Mon Sep 17 00:00:00 2001 From: Olli Lupton Date: Wed, 6 Jul 2022 17:18:05 +0200 Subject: [PATCH 1/3] Compilation fixes for NVHPC 22.5. --- coreneuron/CMakeLists.txt | 11 ++----- coreneuron/apps/main1.cpp | 9 +++--- coreneuron/io/global_vars.cpp | 2 +- coreneuron/mechanism/eion.cpp | 48 +++++++----------------------- coreneuron/mechanism/membfunc.hpp | 36 ++++++++++++++++------ coreneuron/nrnconf.h | 10 ++----- coreneuron/nrnoc/nrnunits_modern.h | 36 ---------------------- coreneuron/utils/nrnoc_aux.hpp | 5 ---- coreneuron/utils/units.hpp | 36 ++++++++++++++++++++++ 9 files changed, 82 insertions(+), 111 deletions(-) delete mode 100644 coreneuron/nrnoc/nrnunits_modern.h create mode 100644 coreneuron/utils/units.hpp diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt index 0dc648628..1fc423f06 100644 --- a/coreneuron/CMakeLists.txt +++ b/coreneuron/CMakeLists.txt @@ -120,16 +120,9 @@ if(CORENRN_ENABLE_GPU) endif() # ============================================================================= -# eion.cpp depends on CORENRN_USE_LEGACY_UNITS +# CORENEURON_USE_LEGACY_UNITS is used in membfunc.hpp so define it everywhere # ============================================================================= -set(LegacyFR_FILES - ${CMAKE_CURRENT_SOURCE_DIR}/mechanism/eion.cpp ${CMAKE_CURRENT_SOURCE_DIR}/apps/main1.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/io/global_vars.cpp) - -set_property( - SOURCE ${LegacyFR_FILES} - APPEND - PROPERTY COMPILE_DEFINITIONS "CORENRN_USE_LEGACY_UNITS=${CORENRN_USE_LEGACY_UNITS}") +add_compile_definitions(CORENEURON_USE_LEGACY_UNITS=${CORENRN_USE_LEGACY_UNITS}) # ============================================================================= # create libraries diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp index fb74df7d0..3f314d208 100644 --- a/coreneuron/apps/main1.cpp +++ b/coreneuron/apps/main1.cpp @@ -1,6 +1,6 @@ /* # ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL # # See top-level LICENSE file for details. # =============================================================================. @@ -51,9 +51,9 @@ const char* corenrn_version() { return coreneuron::bbcore_write_version; } -// the CORENRN_USE_LEGACY_UNITS determined by CORENRN_ENABLE_LEGACY_UNITS +// the CORENEURON_USE_LEGACY_UNITS determined by CORENRN_ENABLE_LEGACY_UNITS bool corenrn_units_use_legacy() { - return CORENRN_USE_LEGACY_UNITS; + return CORENEURON_USE_LEGACY_UNITS; } void (*nrn2core_part2_clean_)(); @@ -562,8 +562,7 @@ extern "C" int run_solve_core(int argc, char** argv) { } #endif bool compute_gpu = corenrn_param.gpu; - - nrn_pragma_acc(update device(celsius, secondorder, pi) if (compute_gpu)) + nrn_pragma_acc(data copyin(celsius, secondorder, pi) if (compute_gpu)) nrn_pragma_omp(target update to(celsius, secondorder, pi) if (compute_gpu)) { double v = corenrn_param.voltage; diff --git a/coreneuron/io/global_vars.cpp b/coreneuron/io/global_vars.cpp index 128a1cdb9..815423ea9 100644 --- a/coreneuron/io/global_vars.cpp +++ b/coreneuron/io/global_vars.cpp @@ -142,7 +142,7 @@ void set_globals(const char* path, bool cli_global_seed, int cli_global_seed_val } else if (strcmp(name, "Random123_globalindex") == 0) { nrnran123_set_globalindex((uint32_t) n); } else if (strcmp(name, "_nrnunit_use_legacy_") == 0) { - if (n != CORENRN_USE_LEGACY_UNITS) { + if (n != CORENEURON_USE_LEGACY_UNITS) { hoc_execerror( "CORENRN_ENABLE_LEGACY_UNITS not" " consistent with NEURON value of" diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp index 4bc077880..a2e76a537 100644 --- a/coreneuron/mechanism/eion.cpp +++ b/coreneuron/mechanism/eion.cpp @@ -1,6 +1,6 @@ /* # ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL # # See top-level LICENSE file for details. # =============================================================================. @@ -154,23 +154,9 @@ the USEION statement of any model using this ion\n", } } -#ifndef CORENRN_USE_LEGACY_UNITS -#define CORENRN_USE_LEGACY_UNITS 0 -#endif - -#if CORENRN_USE_LEGACY_UNITS == 1 -#define FARADAY 96485.309 -#define gasconstant 8.3134 -#else -#include "coreneuron/nrnoc/nrnunits_modern.h" -#define FARADAY _faraday_codata2018 -#define gasconstant _gasconstant_codata2018 -#endif - -#define ktf (1000. * gasconstant * (celsius + 273.15) / FARADAY) - -double nrn_nernst(double ci, double co, double z, double celsius) { - /*printf("nrn_nernst %g %g %g\n", ci, co, z);*/ +// std::log isn't constexpr, but there are argument values for which nrn_nernst +// is a constant expression +constexpr double nrn_nernst(double ci, double co, double z, double celsius) { if (z == 0) { return 0.; } @@ -179,7 +165,7 @@ double nrn_nernst(double ci, double co, double z, double celsius) { } else if (co <= 0.) { return -1e6; } else { - return ktf / z * log(co / ci); + return ktf(celsius) / z * std::log(co / ci); } } @@ -200,24 +186,8 @@ void nrn_wrote_conc(int type, pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius); } } - -static double efun(double x) { - if (fabs(x) < 1e-4) { - return 1. - x / 2.; - } else { - return x / (exp(x) - 1); - } -} - nrn_pragma_omp(end declare target) -double nrn_ghk(double v, double ci, double co, double z) { - double temp = z * v / ktf; - double eco = co * efun(temp); - double eci = ci * efun(-temp); - return (.001) * z * FARADAY * (eci - eco); -} - #if VECTORIZE #define erev pd[0 * _STRIDE] /* From Eion */ #define conci pd[1 * _STRIDE] @@ -257,7 +227,7 @@ ion_style("name_ion", [c_style, e_style, einit, eadvance, cinit]) double nrn_nernst_coef(int type) { /* for computing jacobian element dconc'/dconc */ - return ktf / charge; + return ktf(celsius) / charge; } /* Must be called prior to any channels which update the currents */ @@ -271,7 +241,8 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) { pd = ml->data; ppd = ml->pdata; // clang-format off - nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5], + nrn_pragma_acc(parallel loop present(celsius, + pd[0:_cntml_padded * 5], ppd[0:_cntml_actual], nrn_ion_global_map[0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) @@ -311,7 +282,8 @@ void nrn_init_ion(NrnThread* nt, Memb_list* ml, int type) { // verify if this can be made asynchronous or if there is a strong reason it // needs to be like this. // clang-format off - nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5], + nrn_pragma_acc(parallel loop present(celsius, + pd[0:_cntml_padded * 5], ppd[0:_cntml_actual], nrn_ion_global_map[0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp index 2556f0f87..6862d9245 100644 --- a/coreneuron/mechanism/membfunc.hpp +++ b/coreneuron/mechanism/membfunc.hpp @@ -1,17 +1,19 @@ /* # ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL +# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL # # See top-level LICENSE file for details. # =============================================================================. */ - #pragma once -#include - #include "coreneuron/mechanism/mechanism.hpp" #include "coreneuron/utils/offload.hpp" +#include "coreneuron/utils/units.hpp" + +#include +#include + namespace coreneuron { using Pfrpdat = Datum* (*) (void); @@ -112,12 +114,28 @@ extern void nrn_jacob_capacitance(NrnThread*, Memb_list*, int); extern void nrn_writes_conc(int, int); nrn_pragma_omp(declare target) nrn_pragma_acc(routine seq) -extern void nrn_wrote_conc(int, double*, int, int, double**, double, int); -nrn_pragma_acc(routine seq) -double nrn_nernst(double ci, double co, double z, double celsius); -nrn_pragma_acc(routine seq) -extern double nrn_ghk(double v, double ci, double co, double z); +void nrn_wrote_conc(int, double*, int, int, double**, double, int); nrn_pragma_omp(end declare target) +constexpr double ktf(double celsius) { + return 1000. * units::gasconstant * (celsius + 273.15) / units::faraday; +} +inline double nrn_ghk(double v, double ci, double co, double z, double celsius) { + auto const efun = [](double x) { + if (std::abs(x) < 1e-4) { + return 1. - x / 2.; + } else { + return x / (std::exp(x) - 1.); + } + }; + double const temp{z * v / ktf(celsius)}; + double const eco{co * efun(+temp)}; + double const eci{ci * efun(-temp)}; + return .001 * z * units::faraday * (eci - eco); +} +// Overload using the global celsius variable +inline double nrn_ghk(double v, double ci, double co, double z) { + return nrn_ghk(v, ci, co, z, celsius); +} extern void hoc_register_prop_size(int, int, int); extern void hoc_register_dparam_semantics(int type, int, const char* name); extern void hoc_reg_ba(int, mod_f_t, int); diff --git a/coreneuron/nrnconf.h b/coreneuron/nrnconf.h index b25a2764a..2b1a894ad 100644 --- a/coreneuron/nrnconf.h +++ b/coreneuron/nrnconf.h @@ -32,16 +32,10 @@ using Symbol = char; #define VEC_AREA(i) (_nt->_actual_area[(i)]) #define VECTORIZE 1 -// extern variables require acc declare +// Defined in register_mech.cpp nrn_pragma_omp(declare target) -extern double celsius; -nrn_pragma_acc(declare create(celsius)) - -extern double pi; -nrn_pragma_acc(declare create(pi)) - +extern double celsius, pi; extern int secondorder; -nrn_pragma_acc(declare create(secondorder)) nrn_pragma_omp(end declare target) extern double t, dt; diff --git a/coreneuron/nrnoc/nrnunits_modern.h b/coreneuron/nrnoc/nrnunits_modern.h deleted file mode 100644 index d93638841..000000000 --- a/coreneuron/nrnoc/nrnunits_modern.h +++ /dev/null @@ -1,36 +0,0 @@ -/* -# ============================================================================= -# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL -# -# See top-level LICENSE file for details. -# ============================================================================= -*/ - -#pragma once - -/** - NMODL translated MOD files get unit constants typically from - share/lib/nrnunits.lib.in. But there were other source files that - hardcode some of the constants. Here we gather a few modern units into - a single place (but, unfortunately, also in nrnunits.lib.in). Legacy units - cannot be gathered here because they can differ slightly from place to place. - - These come from https://physics.nist.gov/cuu/Constants/index.html. - Termed the "2018 CODATA recommended values", they became available - on 20 May 2019 and replace the 2014 CODATA set. - - See oc/hoc_init.c, nrnoc/eion.c, nrniv/kschan.h -**/ - - -#define _electron_charge_codata2018 1.602176634e-19 /* coulomb exact*/ -#define _avogadro_number_codata2018 6.02214076e+23 /* exact */ -#define _boltzmann_codata2018 1.380649e-23 /* joule/K exact */ -#define _faraday_codata2018 \ - (_electron_charge_codata2018 * _avogadro_number_codata2018) /* 96485.33212... coulomb/mol */ -#define _gasconstant_codata2018 \ - (_boltzmann_codata2018 * _avogadro_number_codata2018) /* 8.314462618... joule/mol-K */ - -/* e/k in K/millivolt */ -#define _e_over_k_codata2018 \ - (.001 * _electron_charge_codata2018 / _boltzmann_codata2018) /* 11.604518... K/mV */ diff --git a/coreneuron/utils/nrnoc_aux.hpp b/coreneuron/utils/nrnoc_aux.hpp index 3c2f23326..10b5880ea 100644 --- a/coreneuron/utils/nrnoc_aux.hpp +++ b/coreneuron/utils/nrnoc_aux.hpp @@ -34,9 +34,4 @@ extern void hoc_execerror(const char*, const char*); /* print and abort */ extern void hoc_warning(const char*, const char*); extern double hoc_Exp(double x); - -// defined in eion.cpp and this file included in translated mod files. -extern double nrn_nernst(double ci, double co, double z, double celsius); -extern double nrn_ghk(double v, double ci, double co, double z); - } // namespace coreneuron diff --git a/coreneuron/utils/units.hpp b/coreneuron/utils/units.hpp new file mode 100644 index 000000000..aebbc40e0 --- /dev/null +++ b/coreneuron/utils/units.hpp @@ -0,0 +1,36 @@ +/* +# ============================================================================= +# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL +# +# See top-level LICENSE file for details. +# ============================================================================= +*/ +#pragma once +namespace coreneuron::units { +#if CORENEURON_USE_LEGACY_UNITS == 1 +constexpr double faraday{96485.309}; +constexpr double gasconstant{8.3134}; +#else +/* NMODL translated MOD files get unit constants typically from + * share/lib/nrnunits.lib.in. But there were other source files that hardcode + * some of the constants. Here we gather a few modern units into a single place + * (but, unfortunately, also in nrnunits.lib.in). Legacy units cannot be + * gathered here because they can differ slightly from place to place. + * + * These come from https://physics.nist.gov/cuu/Constants/index.html. + * Termed the "2018 CODATA recommended values", they became available + * on 20 May 2019 and replace the 2014 CODATA set. + * + * See oc/hoc_init.c, nrnoc/eion.c, nrniv/kschan.h + */ +namespace detail { +constexpr double electron_charge{1.602176634e-19}; // coulomb exact +constexpr double avogadro_number{6.02214076e+23}; // exact +constexpr double boltzmann{1.380649e-23}; // joule/K exact +} // namespace detail +constexpr double faraday{detail::electron_charge * detail::avogadro_number}; // 96485.33212... + // coulomb/mol +constexpr double gasconstant{detail::boltzmann * detail::avogadro_number}; // 8.314462618... + // joule/mol-K +#endif +} // namespace coreneuron::units From f6e3237958f47e7c0e8a6376d8e4f7551248dc8c Mon Sep 17 00:00:00 2001 From: Christos Kotsalos Date: Thu, 28 Jul 2022 15:09:48 +0200 Subject: [PATCH 2/3] fixing race condition in cell permute 2 --- coreneuron/permute/cellorder.cpp | 48 +++++++++++++++++--------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index 14feb31de..27f820ee6 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -482,9 +482,9 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid int icycle = ncycle - 1; int istride = stride[icycle]; int i = lastnode - istride + icore; -#ifndef CORENEURON_ENABLE_GPU +//#ifndef CORENEURON_ENABLE_GPU int ii = i; -#endif +//#endif // execute until all tree depths are executed bool has_subtrees_to_compute = true; @@ -492,11 +492,12 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid // clang-format off nrn_pragma_acc(loop seq) for (; has_subtrees_to_compute; ) { // ncycle loop -#ifndef CORENEURON_ENABLE_GPU +//#ifndef CORENEURON_ENABLE_GPU // serial test, gpu does this in parallel + nrn_pragma_acc(loop) for (int icore = 0; icore < warpsize; ++icore) { int i = ii + icore; -#endif +//#endif if (icore < istride) { // most efficient if istride equal warpsize // what is the index int ip = GPU_PARENT(i); @@ -508,9 +509,9 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid nrn_pragma_omp(atomic update) GPU_RHS(ip) -= p * GPU_RHS(i); } -#ifndef CORENEURON_ENABLE_GPU +//#ifndef CORENEURON_ENABLE_GPU } -#endif +//#endif // if finished with all tree depths then ready to break // (note that break is not allowed in OpenACC) if (icycle == 0) { @@ -520,9 +521,9 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid --icycle; istride = stride[icycle]; i -= istride; -#ifndef CORENEURON_ENABLE_GPU +//#ifndef CORENEURON_ENABLE_GPU ii -= istride; -#endif +//#endif } // clang-format on } @@ -535,36 +536,37 @@ static void bksub_interleaved2(NrnThread* nt, int ncycle, int* stride, int firstnode) { -#ifndef CORENEURON_ENABLE_GPU +//#ifndef CORENEURON_ENABLE_GPU for (int i = root; i < lastroot; i += 1) { -#else - nrn_pragma_acc(loop seq) - for (int i = root; i < lastroot; i += warpsize) { -#endif +//#else +// nrn_pragma_acc(loop seq) +// for (int i = root; i < lastroot; i += warpsize) { +//#endif GPU_RHS(i) /= GPU_D(i); // the root } int i = firstnode + icore; -#ifndef CORENEURON_ENABLE_GPU +//#ifndef CORENEURON_ENABLE_GPU int ii = i; -#endif +//#endif for (int icycle = 0; icycle < ncycle; ++icycle) { int istride = stride[icycle]; -#ifndef CORENEURON_ENABLE_GPU +//#ifndef CORENEURON_ENABLE_GPU + nrn_pragma_acc(loop) // serial test, gpu does this in parallel for (int icore = 0; icore < warpsize; ++icore) { int i = ii + icore; -#endif +//#endif if (icore < istride) { int ip = GPU_PARENT(i); GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip); GPU_RHS(i) /= GPU_D(i); } i += istride; -#ifndef CORENEURON_ENABLE_GPU +//#ifndef CORENEURON_ENABLE_GPU } ii += istride; -#endif +//#endif } } @@ -617,14 +619,14 @@ void solve_interleaved2(int ith) { int lastroot = rootbegin[iwarp + 1]; int firstnode = nodebegin[iwarp]; int lastnode = nodebegin[iwarp + 1]; -#ifndef CORENEURON_ENABLE_GPU +//#ifndef CORENEURON_ENABLE_GPU if (ic == 0) { // serial test mode. triang and bksub do all cores in warp -#endif +//#endif triang_interleaved2(nt, ic, ncycle, stride, lastnode); bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode); -#ifndef CORENEURON_ENABLE_GPU +//#ifndef CORENEURON_ENABLE_GPU } // serial test mode -#endif +//#endif } nrn_pragma_acc(wait(nt->stream_id)) #ifdef _OPENACC From 8107f27b428444fefff049d35a1456f28b3aa902 Mon Sep 17 00:00:00 2001 From: Christos Kotsalos Date: Tue, 9 Aug 2022 09:45:23 +0200 Subject: [PATCH 3/3] fixing race condition in cell permute 2 : performance optimization --- coreneuron/permute/cellorder.cpp | 45 +++++++++----------------------- 1 file changed, 12 insertions(+), 33 deletions(-) diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp index 27f820ee6..ad173b196 100644 --- a/coreneuron/permute/cellorder.cpp +++ b/coreneuron/permute/cellorder.cpp @@ -478,13 +478,12 @@ static void bksub_interleaved(NrnThread* nt, } // icore ranges [0:warpsize) ; stride[ncycle] +nrn_pragma_acc(routine vector) static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* stride, int lastnode) { int icycle = ncycle - 1; int istride = stride[icycle]; int i = lastnode - istride + icore; -//#ifndef CORENEURON_ENABLE_GPU int ii = i; -//#endif // execute until all tree depths are executed bool has_subtrees_to_compute = true; @@ -492,12 +491,10 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid // clang-format off nrn_pragma_acc(loop seq) for (; has_subtrees_to_compute; ) { // ncycle loop -//#ifndef CORENEURON_ENABLE_GPU // serial test, gpu does this in parallel - nrn_pragma_acc(loop) + nrn_pragma_acc(loop vector) for (int icore = 0; icore < warpsize; ++icore) { int i = ii + icore; -//#endif if (icore < istride) { // most efficient if istride equal warpsize // what is the index int ip = GPU_PARENT(i); @@ -509,9 +506,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid nrn_pragma_omp(atomic update) GPU_RHS(ip) -= p * GPU_RHS(i); } -//#ifndef CORENEURON_ENABLE_GPU } -//#endif // if finished with all tree depths then ready to break // (note that break is not allowed in OpenACC) if (icycle == 0) { @@ -521,14 +516,12 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid --icycle; istride = stride[icycle]; i -= istride; -//#ifndef CORENEURON_ENABLE_GPU ii -= istride; -//#endif } - // clang-format on } // icore ranges [0:warpsize) ; stride[ncycle] +nrn_pragma_acc(routine vector) static void bksub_interleaved2(NrnThread* nt, int root, int lastroot, @@ -536,37 +529,28 @@ static void bksub_interleaved2(NrnThread* nt, int ncycle, int* stride, int firstnode) { -//#ifndef CORENEURON_ENABLE_GPU + nrn_pragma_acc(loop seq) for (int i = root; i < lastroot; i += 1) { -//#else -// nrn_pragma_acc(loop seq) -// for (int i = root; i < lastroot; i += warpsize) { -//#endif GPU_RHS(i) /= GPU_D(i); // the root } int i = firstnode + icore; -//#ifndef CORENEURON_ENABLE_GPU int ii = i; -//#endif + nrn_pragma_acc(loop seq) for (int icycle = 0; icycle < ncycle; ++icycle) { int istride = stride[icycle]; -//#ifndef CORENEURON_ENABLE_GPU - nrn_pragma_acc(loop) // serial test, gpu does this in parallel + nrn_pragma_acc(loop vector) for (int icore = 0; icore < warpsize; ++icore) { int i = ii + icore; -//#endif if (icore < istride) { int ip = GPU_PARENT(i); GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip); GPU_RHS(i) /= GPU_D(i); } i += istride; -//#ifndef CORENEURON_ENABLE_GPU } ii += istride; -//#endif } } @@ -602,15 +586,14 @@ void solve_interleaved2(int ith) { defined(_OPENACC) int nstride = stridedispl[nwarp]; #endif - nrn_pragma_acc(parallel loop gang vector vector_length( - warpsize) present(nt [0:1], + nrn_pragma_acc(parallel loop gang present(nt [0:1], strides [0:nstride], ncycles [0:nwarp], stridedispl [0:nwarp + 1], rootbegin [0:nwarp + 1], nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id)) nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)) - for (int icore = 0; icore < ncore; ++icore) { + for (int icore = 0; icore < ncore; icore += warpsize) { int iwarp = icore / warpsize; // figure out the >> value int ic = icore & (warpsize - 1); // figure out the & mask int ncycle = ncycles[iwarp]; @@ -619,14 +602,10 @@ void solve_interleaved2(int ith) { int lastroot = rootbegin[iwarp + 1]; int firstnode = nodebegin[iwarp]; int lastnode = nodebegin[iwarp + 1]; -//#ifndef CORENEURON_ENABLE_GPU - if (ic == 0) { // serial test mode. triang and bksub do all cores in warp -//#endif - triang_interleaved2(nt, ic, ncycle, stride, lastnode); - bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode); -//#ifndef CORENEURON_ENABLE_GPU - } // serial test mode -//#endif + + // triang and bksub do all cores in warp + triang_interleaved2(nt, ic, ncycle, stride, lastnode); + bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode); } nrn_pragma_acc(wait(nt->stream_id)) #ifdef _OPENACC