From f1a5f619b1a553816b940d3e452d7fae105514c7 Mon Sep 17 00:00:00 2001
From: Olli Lupton <oliver.lupton@epfl.ch>
Date: Wed, 6 Jul 2022 17:18:05 +0200
Subject: [PATCH 1/3] Compilation fixes for NVHPC 22.5.

---
 coreneuron/CMakeLists.txt          | 11 ++-----
 coreneuron/apps/main1.cpp          |  9 +++---
 coreneuron/io/global_vars.cpp      |  2 +-
 coreneuron/mechanism/eion.cpp      | 48 +++++++-----------------------
 coreneuron/mechanism/membfunc.hpp  | 36 ++++++++++++++++------
 coreneuron/nrnconf.h               | 10 ++-----
 coreneuron/nrnoc/nrnunits_modern.h | 36 ----------------------
 coreneuron/utils/nrnoc_aux.hpp     |  5 ----
 coreneuron/utils/units.hpp         | 36 ++++++++++++++++++++++
 9 files changed, 82 insertions(+), 111 deletions(-)
 delete mode 100644 coreneuron/nrnoc/nrnunits_modern.h
 create mode 100644 coreneuron/utils/units.hpp

diff --git a/coreneuron/CMakeLists.txt b/coreneuron/CMakeLists.txt
index 0dc648628..1fc423f06 100644
--- a/coreneuron/CMakeLists.txt
+++ b/coreneuron/CMakeLists.txt
@@ -120,16 +120,9 @@ if(CORENRN_ENABLE_GPU)
 endif()
 
 # =============================================================================
-# eion.cpp depends on CORENRN_USE_LEGACY_UNITS
+# units.hpp (included by membfunc.hpp) tests CORENEURON_USE_LEGACY_UNITS, so define it everywhere
 # =============================================================================
-set(LegacyFR_FILES
-    ${CMAKE_CURRENT_SOURCE_DIR}/mechanism/eion.cpp ${CMAKE_CURRENT_SOURCE_DIR}/apps/main1.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/io/global_vars.cpp)
-
-set_property(
-  SOURCE ${LegacyFR_FILES}
-  APPEND
-  PROPERTY COMPILE_DEFINITIONS "CORENRN_USE_LEGACY_UNITS=${CORENRN_USE_LEGACY_UNITS}")
+add_compile_definitions(CORENEURON_USE_LEGACY_UNITS=${CORENRN_USE_LEGACY_UNITS})
 
 # =============================================================================
 # create libraries
diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp
index fb74df7d0..3f314d208 100644
--- a/coreneuron/apps/main1.cpp
+++ b/coreneuron/apps/main1.cpp
@@ -1,6 +1,6 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================.
@@ -51,9 +51,9 @@ const char* corenrn_version() {
     return coreneuron::bbcore_write_version;
 }
 
-// the CORENRN_USE_LEGACY_UNITS determined by CORENRN_ENABLE_LEGACY_UNITS
+// CORENEURON_USE_LEGACY_UNITS is determined by CORENRN_ENABLE_LEGACY_UNITS
 bool corenrn_units_use_legacy() {
-    return CORENRN_USE_LEGACY_UNITS;
+    return CORENEURON_USE_LEGACY_UNITS;
 }
 
 void (*nrn2core_part2_clean_)();
@@ -562,8 +562,7 @@ extern "C" int run_solve_core(int argc, char** argv) {
     }
 #endif
     bool compute_gpu = corenrn_param.gpu;
-
-    nrn_pragma_acc(update device(celsius, secondorder, pi) if (compute_gpu))
+    nrn_pragma_acc(data copyin(celsius, secondorder, pi) if (compute_gpu))
     nrn_pragma_omp(target update to(celsius, secondorder, pi) if (compute_gpu))
     {
         double v = corenrn_param.voltage;
diff --git a/coreneuron/io/global_vars.cpp b/coreneuron/io/global_vars.cpp
index 128a1cdb9..815423ea9 100644
--- a/coreneuron/io/global_vars.cpp
+++ b/coreneuron/io/global_vars.cpp
@@ -142,7 +142,7 @@ void set_globals(const char* path, bool cli_global_seed, int cli_global_seed_val
                 } else if (strcmp(name, "Random123_globalindex") == 0) {
                     nrnran123_set_globalindex((uint32_t) n);
                 } else if (strcmp(name, "_nrnunit_use_legacy_") == 0) {
-                    if (n != CORENRN_USE_LEGACY_UNITS) {
+                    if (n != CORENEURON_USE_LEGACY_UNITS) {
                         hoc_execerror(
                             "CORENRN_ENABLE_LEGACY_UNITS not"
                             " consistent with NEURON value of"
diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp
index 4bc077880..a2e76a537 100644
--- a/coreneuron/mechanism/eion.cpp
+++ b/coreneuron/mechanism/eion.cpp
@@ -1,6 +1,6 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================.
@@ -154,23 +154,9 @@ the USEION statement of any model using this ion\n",
     }
 }
 
-#ifndef CORENRN_USE_LEGACY_UNITS
-#define CORENRN_USE_LEGACY_UNITS 0
-#endif
-
-#if CORENRN_USE_LEGACY_UNITS == 1
-#define FARADAY     96485.309
-#define gasconstant 8.3134
-#else
-#include "coreneuron/nrnoc/nrnunits_modern.h"
-#define FARADAY     _faraday_codata2018
-#define gasconstant _gasconstant_codata2018
-#endif
-
-#define ktf (1000. * gasconstant * (celsius + 273.15) / FARADAY)
-
-double nrn_nernst(double ci, double co, double z, double celsius) {
-    /*printf("nrn_nernst %g %g %g\n", ci, co, z);*/
+// std::log isn't constexpr, but there are argument values for which nrn_nernst
+// is a constant expression
+constexpr double nrn_nernst(double ci, double co, double z, double celsius) {
     if (z == 0) {
         return 0.;
     }
@@ -179,7 +165,7 @@ double nrn_nernst(double ci, double co, double z, double celsius) {
     } else if (co <= 0.) {
         return -1e6;
     } else {
-        return ktf / z * log(co / ci);
+        return ktf(celsius) / z * std::log(co / ci);
     }
 }
 
@@ -200,24 +186,8 @@ void nrn_wrote_conc(int type,
         pe[0] = nrn_nernst(pe[1 * _STRIDE], pe[2 * _STRIDE], gimap[type][2], celsius);
     }
 }
-
-static double efun(double x) {
-    if (fabs(x) < 1e-4) {
-        return 1. - x / 2.;
-    } else {
-        return x / (exp(x) - 1);
-    }
-}
-
 nrn_pragma_omp(end declare target)
 
-double nrn_ghk(double v, double ci, double co, double z) {
-    double temp = z * v / ktf;
-    double eco = co * efun(temp);
-    double eci = ci * efun(-temp);
-    return (.001) * z * FARADAY * (eci - eco);
-}
-
 #if VECTORIZE
 #define erev   pd[0 * _STRIDE] /* From Eion */
 #define conci  pd[1 * _STRIDE]
@@ -257,7 +227,7 @@ ion_style("name_ion", [c_style, e_style, einit, eadvance, cinit])
 
 double nrn_nernst_coef(int type) {
     /* for computing jacobian element dconc'/dconc */
-    return ktf / charge;
+    return ktf(celsius) / charge;
 }
 
 /* Must be called prior to any channels which update the currents */
@@ -271,7 +241,8 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) {
     pd = ml->data;
     ppd = ml->pdata;
     // clang-format off
-    nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5],
+    nrn_pragma_acc(parallel loop present(celsius,
+                                         pd[0:_cntml_padded * 5],
                                          ppd[0:_cntml_actual],
                                          nrn_ion_global_map[0:nrn_ion_global_map_size]
                                                            [0:ion_global_map_member_size])
@@ -311,7 +282,8 @@ void nrn_init_ion(NrnThread* nt, Memb_list* ml, int type) {
     // verify if this can be made asynchronous or if there is a strong reason it
     // needs to be like this.
     // clang-format off
-    nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5],
+    nrn_pragma_acc(parallel loop present(celsius,
+                                         pd[0:_cntml_padded * 5],
                                          ppd[0:_cntml_actual],
                                          nrn_ion_global_map[0:nrn_ion_global_map_size]
                                                            [0:ion_global_map_member_size])
diff --git a/coreneuron/mechanism/membfunc.hpp b/coreneuron/mechanism/membfunc.hpp
index 2556f0f87..6862d9245 100644
--- a/coreneuron/mechanism/membfunc.hpp
+++ b/coreneuron/mechanism/membfunc.hpp
@@ -1,17 +1,19 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================.
 */
-
 #pragma once
 
-#include <vector>
-
 #include "coreneuron/mechanism/mechanism.hpp"
 #include "coreneuron/utils/offload.hpp"
+#include "coreneuron/utils/units.hpp"
+
+#include <cmath>
+#include <vector>
+
 namespace coreneuron {
 
 using Pfrpdat = Datum* (*) (void);
@@ -112,12 +114,28 @@ extern void nrn_jacob_capacitance(NrnThread*, Memb_list*, int);
 extern void nrn_writes_conc(int, int);
 nrn_pragma_omp(declare target)
 nrn_pragma_acc(routine seq)
-extern void nrn_wrote_conc(int, double*, int, int, double**, double, int);
-nrn_pragma_acc(routine seq)
-double nrn_nernst(double ci, double co, double z, double celsius);
-nrn_pragma_acc(routine seq)
-extern double nrn_ghk(double v, double ci, double co, double z);
+void nrn_wrote_conc(int, double*, int, int, double**, double, int);
 nrn_pragma_omp(end declare target)
+constexpr double ktf(double celsius) {
+    return 1000. * units::gasconstant * (celsius + 273.15) / units::faraday;
+}
+inline double nrn_ghk(double v, double ci, double co, double z, double celsius) {
+    auto const efun = [](double x) {
+        if (std::abs(x) < 1e-4) {
+            return 1. - x / 2.;
+        } else {
+            return x / (std::exp(x) - 1.);
+        }
+    };
+    double const temp{z * v / ktf(celsius)};
+    double const eco{co * efun(+temp)};
+    double const eci{ci * efun(-temp)};
+    return .001 * z * units::faraday * (eci - eco);
+}
+// Overload using the global celsius variable
+inline double nrn_ghk(double v, double ci, double co, double z) {
+    return nrn_ghk(v, ci, co, z, celsius);
+}
 extern void hoc_register_prop_size(int, int, int);
 extern void hoc_register_dparam_semantics(int type, int, const char* name);
 extern void hoc_reg_ba(int, mod_f_t, int);
diff --git a/coreneuron/nrnconf.h b/coreneuron/nrnconf.h
index b25a2764a..2b1a894ad 100644
--- a/coreneuron/nrnconf.h
+++ b/coreneuron/nrnconf.h
@@ -32,16 +32,10 @@ using Symbol = char;
 #define VEC_AREA(i) (_nt->_actual_area[(i)])
 #define VECTORIZE   1
 
-// extern variables require acc declare
+// Defined in register_mech.cpp
 nrn_pragma_omp(declare target)
-extern double celsius;
-nrn_pragma_acc(declare create(celsius))
-
-extern double pi;
-nrn_pragma_acc(declare create(pi))
-
+extern double celsius, pi;
 extern int secondorder;
-nrn_pragma_acc(declare create(secondorder))
 nrn_pragma_omp(end declare target)
 
 extern double t, dt;
diff --git a/coreneuron/nrnoc/nrnunits_modern.h b/coreneuron/nrnoc/nrnunits_modern.h
deleted file mode 100644
index d93638841..000000000
--- a/coreneuron/nrnoc/nrnunits_modern.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
-# =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
-#
-# See top-level LICENSE file for details.
-# =============================================================================
-*/
-
-#pragma once
-
-/**
- NMODL translated MOD files get unit constants typically from
- share/lib/nrnunits.lib.in. But there were other source files that
- hardcode some of the constants. Here we gather a few modern units into
- a single place (but, unfortunately, also in nrnunits.lib.in). Legacy units
- cannot be gathered here because they can differ slightly from place to place.
-
- These come from https://physics.nist.gov/cuu/Constants/index.html.
- Termed the "2018 CODATA recommended values", they became available
- on 20 May 2019 and replace the 2014 CODATA set.
-
- See oc/hoc_init.c, nrnoc/eion.c, nrniv/kschan.h
-**/
-
-
-#define _electron_charge_codata2018 1.602176634e-19 /* coulomb exact*/
-#define _avogadro_number_codata2018 6.02214076e+23  /* exact */
-#define _boltzmann_codata2018       1.380649e-23    /* joule/K exact */
-#define _faraday_codata2018 \
-    (_electron_charge_codata2018 * _avogadro_number_codata2018) /* 96485.33212... coulomb/mol */
-#define _gasconstant_codata2018 \
-    (_boltzmann_codata2018 * _avogadro_number_codata2018) /* 8.314462618... joule/mol-K */
-
-/* e/k in K/millivolt */
-#define _e_over_k_codata2018 \
-    (.001 * _electron_charge_codata2018 / _boltzmann_codata2018) /* 11.604518... K/mV */
diff --git a/coreneuron/utils/nrnoc_aux.hpp b/coreneuron/utils/nrnoc_aux.hpp
index 3c2f23326..10b5880ea 100644
--- a/coreneuron/utils/nrnoc_aux.hpp
+++ b/coreneuron/utils/nrnoc_aux.hpp
@@ -34,9 +34,4 @@ extern void hoc_execerror(const char*, const char*); /* print and abort */
 extern void hoc_warning(const char*, const char*);
 
 extern double hoc_Exp(double x);
-
-// defined in eion.cpp and this file included in translated mod files.
-extern double nrn_nernst(double ci, double co, double z, double celsius);
-extern double nrn_ghk(double v, double ci, double co, double z);
-
 }  // namespace coreneuron
diff --git a/coreneuron/utils/units.hpp b/coreneuron/utils/units.hpp
new file mode 100644
index 000000000..aebbc40e0
--- /dev/null
+++ b/coreneuron/utils/units.hpp
@@ -0,0 +1,36 @@
+/*
+# =============================================================================
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
+#
+# See top-level LICENSE file for details.
+# =============================================================================
+*/
+#pragma once
+namespace coreneuron::units {
+#if CORENEURON_USE_LEGACY_UNITS == 1
+constexpr double faraday{96485.309};
+constexpr double gasconstant{8.3134};
+#else
+/* NMODL translated MOD files get unit constants typically from
+ * share/lib/nrnunits.lib.in. But there were other source files that hardcode
+ * some of the constants. Here we gather a few modern units into a single place
+ * (but, unfortunately, also in nrnunits.lib.in). Legacy values can differ
+ * slightly from place to place; those above were hardcoded in eion.cpp.
+ *
+ * The modern values come from https://physics.nist.gov/cuu/Constants/index.html.
+ * Termed the "2018 CODATA recommended values", they became available
+ * on 20 May 2019 and replace the 2014 CODATA set.
+ *
+ * See oc/hoc_init.c, nrnoc/eion.c, nrniv/kschan.h
+ */
+namespace detail {
+constexpr double electron_charge{1.602176634e-19};  // coulomb exact
+constexpr double avogadro_number{6.02214076e+23};   // exact
+constexpr double boltzmann{1.380649e-23};           // joule/K exact
+}  // namespace detail
+constexpr double faraday{detail::electron_charge * detail::avogadro_number};  // 96485.33212...
+                                                                              // coulomb/mol
+constexpr double gasconstant{detail::boltzmann * detail::avogadro_number};    // 8.314462618...
+                                                                              // joule/mol-K
+#endif
+}  // namespace coreneuron::units

From f6e3237958f47e7c0e8a6376d8e4f7551248dc8c Mon Sep 17 00:00:00 2001
From: Christos Kotsalos <christos.kotsalos@epfl.ch>
Date: Thu, 28 Jul 2022 15:09:48 +0200
Subject: [PATCH 2/3] fixing race condition in cell permute 2

---
 coreneuron/permute/cellorder.cpp | 48 +++++++++++++++++---------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp
index 14feb31de..27f820ee6 100644
--- a/coreneuron/permute/cellorder.cpp
+++ b/coreneuron/permute/cellorder.cpp
@@ -482,9 +482,9 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
     int icycle = ncycle - 1;
     int istride = stride[icycle];
     int i = lastnode - istride + icore;
-#ifndef CORENEURON_ENABLE_GPU
+//#ifndef CORENEURON_ENABLE_GPU
     int ii = i;
-#endif
+//#endif
 
     // execute until all tree depths are executed
     bool has_subtrees_to_compute = true;
@@ -492,11 +492,12 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
     // clang-format off
     nrn_pragma_acc(loop seq)
     for (; has_subtrees_to_compute; ) {  // ncycle loop
-#ifndef CORENEURON_ENABLE_GPU
+//#ifndef CORENEURON_ENABLE_GPU
         // serial test, gpu does this in parallel
+        nrn_pragma_acc(loop)
         for (int icore = 0; icore < warpsize; ++icore) {
             int i = ii + icore;
-#endif
+//#endif
             if (icore < istride) {  // most efficient if istride equal  warpsize
                 // what is the index
                 int ip = GPU_PARENT(i);
@@ -508,9 +509,9 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
                 nrn_pragma_omp(atomic update)
                 GPU_RHS(ip) -= p * GPU_RHS(i);
             }
-#ifndef CORENEURON_ENABLE_GPU
+//#ifndef CORENEURON_ENABLE_GPU
         }
-#endif
+//#endif
         // if finished with all tree depths then ready to break
         // (note that break is not allowed in OpenACC)
         if (icycle == 0) {
@@ -520,9 +521,9 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
         --icycle;
         istride = stride[icycle];
         i -= istride;
-#ifndef CORENEURON_ENABLE_GPU
+//#ifndef CORENEURON_ENABLE_GPU
         ii -= istride;
-#endif
+//#endif
     }
     // clang-format on
 }
@@ -535,36 +536,37 @@ static void bksub_interleaved2(NrnThread* nt,
                                int ncycle,
                                int* stride,
                                int firstnode) {
-#ifndef CORENEURON_ENABLE_GPU
+//#ifndef CORENEURON_ENABLE_GPU
     for (int i = root; i < lastroot; i += 1) {
-#else
-    nrn_pragma_acc(loop seq)
-    for (int i = root; i < lastroot; i += warpsize) {
-#endif
+//#else
+//    nrn_pragma_acc(loop seq)
+//    for (int i = root; i < lastroot; i += warpsize) {
+//#endif
         GPU_RHS(i) /= GPU_D(i);  // the root
     }
 
     int i = firstnode + icore;
-#ifndef CORENEURON_ENABLE_GPU
+//#ifndef CORENEURON_ENABLE_GPU
     int ii = i;
-#endif
+//#endif
     for (int icycle = 0; icycle < ncycle; ++icycle) {
         int istride = stride[icycle];
-#ifndef CORENEURON_ENABLE_GPU
+//#ifndef CORENEURON_ENABLE_GPU
+        nrn_pragma_acc(loop)
         // serial test, gpu does this in parallel
         for (int icore = 0; icore < warpsize; ++icore) {
             int i = ii + icore;
-#endif
+//#endif
             if (icore < istride) {
                 int ip = GPU_PARENT(i);
                 GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip);
                 GPU_RHS(i) /= GPU_D(i);
             }
             i += istride;
-#ifndef CORENEURON_ENABLE_GPU
+//#ifndef CORENEURON_ENABLE_GPU
         }
         ii += istride;
-#endif
+//#endif
     }
 }
 
@@ -617,14 +619,14 @@ void solve_interleaved2(int ith) {
             int lastroot = rootbegin[iwarp + 1];
             int firstnode = nodebegin[iwarp];
             int lastnode = nodebegin[iwarp + 1];
-#ifndef CORENEURON_ENABLE_GPU
+//#ifndef CORENEURON_ENABLE_GPU
             if (ic == 0) {  // serial test mode. triang and bksub do all cores in warp
-#endif
+//#endif
                 triang_interleaved2(nt, ic, ncycle, stride, lastnode);
                 bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
-#ifndef CORENEURON_ENABLE_GPU
+//#ifndef CORENEURON_ENABLE_GPU
             }  // serial test mode
-#endif
+//#endif
         }
         nrn_pragma_acc(wait(nt->stream_id))
 #ifdef _OPENACC

From 8107f27b428444fefff049d35a1456f28b3aa902 Mon Sep 17 00:00:00 2001
From: Christos Kotsalos <christos.kotsalos@epfl.ch>
Date: Tue, 9 Aug 2022 09:45:23 +0200
Subject: [PATCH 3/3] fixing race condition in cell permute 2 : performance
 optimization

---
 coreneuron/permute/cellorder.cpp | 45 +++++++++-----------------------
 1 file changed, 12 insertions(+), 33 deletions(-)

diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp
index 27f820ee6..ad173b196 100644
--- a/coreneuron/permute/cellorder.cpp
+++ b/coreneuron/permute/cellorder.cpp
@@ -478,13 +478,12 @@ static void bksub_interleaved(NrnThread* nt,
 }
 
 // icore ranges [0:warpsize) ; stride[ncycle]
+nrn_pragma_acc(routine vector)
 static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* stride, int lastnode) {
     int icycle = ncycle - 1;
     int istride = stride[icycle];
     int i = lastnode - istride + icore;
-//#ifndef CORENEURON_ENABLE_GPU
     int ii = i;
-//#endif
 
     // execute until all tree depths are executed
     bool has_subtrees_to_compute = true;
@@ -492,12 +491,10 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
     // clang-format off
     nrn_pragma_acc(loop seq)
     for (; has_subtrees_to_compute; ) {  // ncycle loop
-//#ifndef CORENEURON_ENABLE_GPU
         // serial test, gpu does this in parallel
-        nrn_pragma_acc(loop)
+        nrn_pragma_acc(loop vector)
         for (int icore = 0; icore < warpsize; ++icore) {
             int i = ii + icore;
-//#endif
             if (icore < istride) {  // most efficient if istride equal  warpsize
                 // what is the index
                 int ip = GPU_PARENT(i);
@@ -509,9 +506,7 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
                 nrn_pragma_omp(atomic update)
                 GPU_RHS(ip) -= p * GPU_RHS(i);
             }
-//#ifndef CORENEURON_ENABLE_GPU
         }
-//#endif
         // if finished with all tree depths then ready to break
         // (note that break is not allowed in OpenACC)
         if (icycle == 0) {
@@ -521,14 +516,12 @@ static void triang_interleaved2(NrnThread* nt, int icore, int ncycle, int* strid
         --icycle;
         istride = stride[icycle];
         i -= istride;
-//#ifndef CORENEURON_ENABLE_GPU
         ii -= istride;
-//#endif
     }
-    // clang-format on
 }
 
 // icore ranges [0:warpsize) ; stride[ncycle]
+nrn_pragma_acc(routine vector)
 static void bksub_interleaved2(NrnThread* nt,
                                int root,
                                int lastroot,
@@ -536,37 +529,28 @@ static void bksub_interleaved2(NrnThread* nt,
                                int ncycle,
                                int* stride,
                                int firstnode) {
-//#ifndef CORENEURON_ENABLE_GPU
+    nrn_pragma_acc(loop seq)
     for (int i = root; i < lastroot; i += 1) {
-//#else
-//    nrn_pragma_acc(loop seq)
-//    for (int i = root; i < lastroot; i += warpsize) {
-//#endif
         GPU_RHS(i) /= GPU_D(i);  // the root
     }
 
     int i = firstnode + icore;
-//#ifndef CORENEURON_ENABLE_GPU
     int ii = i;
-//#endif
+    nrn_pragma_acc(loop seq)
     for (int icycle = 0; icycle < ncycle; ++icycle) {
         int istride = stride[icycle];
-//#ifndef CORENEURON_ENABLE_GPU
-        nrn_pragma_acc(loop)
         // serial test, gpu does this in parallel
+        nrn_pragma_acc(loop vector)
         for (int icore = 0; icore < warpsize; ++icore) {
             int i = ii + icore;
-//#endif
             if (icore < istride) {
                 int ip = GPU_PARENT(i);
                 GPU_RHS(i) -= GPU_B(i) * GPU_RHS(ip);
                 GPU_RHS(i) /= GPU_D(i);
             }
             i += istride;
-//#ifndef CORENEURON_ENABLE_GPU
         }
         ii += istride;
-//#endif
     }
 }
 
@@ -602,15 +586,14 @@ void solve_interleaved2(int ith) {
     defined(_OPENACC)
         int nstride = stridedispl[nwarp];
 #endif
-        nrn_pragma_acc(parallel loop gang vector vector_length(
-            warpsize) present(nt [0:1],
+        nrn_pragma_acc(parallel loop gang present(nt [0:1],
                               strides [0:nstride],
                               ncycles [0:nwarp],
                               stridedispl [0:nwarp + 1],
                               rootbegin [0:nwarp + 1],
                               nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id))
         nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
-        for (int icore = 0; icore < ncore; ++icore) {
+        for (int icore = 0; icore < ncore; icore += warpsize) {
             int iwarp = icore / warpsize;     // figure out the >> value
             int ic = icore & (warpsize - 1);  // figure out the & mask
             int ncycle = ncycles[iwarp];
@@ -619,14 +602,10 @@ void solve_interleaved2(int ith) {
             int lastroot = rootbegin[iwarp + 1];
             int firstnode = nodebegin[iwarp];
             int lastnode = nodebegin[iwarp + 1];
-//#ifndef CORENEURON_ENABLE_GPU
-            if (ic == 0) {  // serial test mode. triang and bksub do all cores in warp
-//#endif
-                triang_interleaved2(nt, ic, ncycle, stride, lastnode);
-                bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
-//#ifndef CORENEURON_ENABLE_GPU
-            }  // serial test mode
-//#endif
+
+            // triang and bksub do all cores in warp
+            triang_interleaved2(nt, ic, ncycle, stride, lastnode);
+            bksub_interleaved2(nt, root + ic, lastroot, ic, ncycle, stride, firstnode);
         }
         nrn_pragma_acc(wait(nt->stream_id))
 #ifdef _OPENACC