diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
index d5e723527..c4f74b559 100644
--- a/coreneuron/gpu/nrn_acc_manager.cpp
+++ b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -326,8 +326,8 @@ void setup_nrnthreads_on_device(NrnThread* threads, int nthreads) {
         /* Here is the example of using OpenACC data enter/exit
          * Remember that we are not allowed to use nt->_data but we have to use:
          * double *dtmp = nt->_data;  // now use dtmp!
-         #pragma acc enter data copyin(dtmp[0:nt->_ndata]) async(nt->stream_id)
-         #pragma acc wait(nt->stream_id)
+         #pragma acc enter data copyin(dtmp[0:nt->_ndata]) async(nt->streams[nt->stream_id])
+         #pragma acc wait(nt->streams[nt->stream_id])
         */
 
         /*update d_nt._data to point to device copy */
@@ -736,7 +736,7 @@ void update_net_receive_buffer(NrnThread* nt) {
                                         nrb->_nrb_flag[:nrb->_cnt],
                                         nrb->_displ[:nrb->_displ_cnt + 1],
                                         nrb->_nrb_index[:nrb->_cnt])
-                           async(nt->stream_id))
+                           async(nt->streams[nt->stream_id]))
         nrn_pragma_omp(target update to(nrb->_cnt,
                                         nrb->_displ_cnt,
                                         nrb->_pnt_index[:nrb->_cnt],
@@ -749,7 +749,7 @@ void update_net_receive_buffer(NrnThread* nt) {
             }
         }
     }
-    nrn_pragma_acc(wait(nt->stream_id))
+    nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
 }
 
 void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb) {
diff --git a/coreneuron/io/phase2.cpp b/coreneuron/io/phase2.cpp
index e54e1db81..157b69d61 100644
--- a/coreneuron/io/phase2.cpp
+++ b/coreneuron/io/phase2.cpp
@@ -875,7 +875,9 @@ void Phase2::populate(NrnThread& nt, const UserParams& userParams) {
      * greater than number of omp threads.
      */
 #if defined(_OPENMP)
-    nt.stream_id = omp_get_thread_num();
+    const auto omp_thread_num = omp_get_thread_num();
+    nt.stream_id = omp_thread_num;
+    nt.streams[nt.stream_id] = omp_thread_num;
 #endif
 
     int shadow_rhs_cnt = 0;
diff --git a/coreneuron/mechanism/capac.cpp b/coreneuron/mechanism/capac.cpp
index 42c65cb18..6a03e1f53 100644
--- a/coreneuron/mechanism/capac.cpp
+++ b/coreneuron/mechanism/capac.cpp
@@ -70,8 +70,8 @@ void nrn_jacob_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
     nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
                                          ni [0:_cntml_actual],
                                          _vec_d [0:_nt->end]) if (_nt->compute_gpu)
-                       async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+                       async(_nt->streams[_nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
     for (_iml = 0; _iml < _cntml_actual; _iml++) {
         _vec_d[ni[_iml]] += cfac * cm;
     }
@@ -116,8 +116,8 @@ void nrn_cur_capacitance(NrnThread* _nt, Memb_list* ml, int /* type */) {
     nrn_pragma_acc(parallel loop present(vdata [0:_cntml_padded * nparm],
                                          ni [0:_cntml_actual],
                                          _vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
-                       async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+                       async(_nt->streams[_nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
     for (int _iml = 0; _iml < _cntml_actual; _iml++) {
         i_cap = cfac * cm * _vec_rhs[ni[_iml]];
     }
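Every offloaded loop in this patch carries two spellings of the same directive: nrn_pragma_acc(...) for the OpenACC build and nrn_pragma_omp(...) for the OpenMP offload build; whichever backend is not active expands to nothing. On the OpenACC side the patch only changes which integer reaches async(...); on the OpenMP side, depend(inout: nt->streams[nt->stream_id]) nowait turns each target region into a deferred task, and tasks naming the same array element as a dependency execute in submission order, which is what makes that element behave like a stream. A minimal sketch of such a macro pair follows; the real definitions live in a CoreNEURON offload header, and their exact names and guards may differ:

    // Sketch only: dual-backend pragma macros along the lines CoreNEURON uses.
    // The actual definitions and guards in the repository may differ in detail.
    #define nrn_pragma_stringify(x) #x
    #if defined(_OPENACC)
    #define nrn_pragma_acc(x) _Pragma(nrn_pragma_stringify(acc x))
    #define nrn_pragma_omp(x)
    #elif defined(_OPENMP)
    #define nrn_pragma_acc(x)
    #define nrn_pragma_omp(x) _Pragma(nrn_pragma_stringify(omp x))
    #else
    #define nrn_pragma_acc(x)
    #define nrn_pragma_omp(x)
    #endif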
diff --git a/coreneuron/mechanism/eion.cpp b/coreneuron/mechanism/eion.cpp
index 8b58e858d..de3a92611 100644
--- a/coreneuron/mechanism/eion.cpp
+++ b/coreneuron/mechanism/eion.cpp
@@ -263,14 +263,12 @@ void nrn_cur_ion(NrnThread* nt, Memb_list* ml, int type) {
     int _cntml_padded = ml->_nodecount_padded;
     pd = ml->data;
     ppd = ml->pdata;
-    // clang-format off
-    nrn_pragma_acc(parallel loop present(pd[0:_cntml_padded * 5],
-                                         nrn_ion_global_map[0:nrn_ion_global_map_size]
-                                                           [0:ion_global_map_member_size])
-                       if (nt->compute_gpu)
-                       async(nt->stream_id))
-    // clang-format on
-    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
+    nrn_pragma_acc(parallel loop present(
+        pd [0:_cntml_padded * 5],
+        nrn_ion_global_map
+        [0:nrn_ion_global_map_size] [0:ion_global_map_member_size]) if (nt->compute_gpu)
+                       async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
     for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
         dcurdv = 0.;
         cur = 0.;
@@ -342,8 +340,8 @@ void second_order_cur(NrnThread* _nt, int secondorder) {
     nrn_pragma_acc(parallel loop present(pd [0:_cntml_padded * 5],
                                          ni [0:_cntml_actual],
                                          _vec_rhs [0:_nt->end]) if (_nt->compute_gpu)
-                       async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+                       async(_nt->streams[_nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
     for (int _iml = 0; _iml < _cntml_actual; ++_iml) {
         cur += dcurdv * (_vec_rhs[ni[_iml]]);
     }
diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp
index 4fb1d165f..60192e8af 100644
--- a/coreneuron/network/netcvode.cpp
+++ b/coreneuron/network/netcvode.cpp
@@ -533,8 +533,10 @@ void NetCvode::check_thresh(NrnThread* nt) {  // for default method
     nrn_pragma_acc(parallel loop present(
         nt [0:1], presyns_helper [0:nt->n_presyn], presyns [0:nt->n_presyn], actual_v [0:nt->end])
-                       copy(net_send_buf_count) if (nt->compute_gpu) async(nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count) if(nt->compute_gpu))
+                       copy(net_send_buf_count) if (nt->compute_gpu)
+                       async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for map(tofrom: net_send_buf_count)
+                       if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
     for (int i = 0; i < nt->ncell; ++i) {
         PreSyn* ps = presyns + i;
         PreSynHelper* psh = presyns_helper + i;
@@ -561,16 +563,22 @@ void NetCvode::check_thresh(NrnThread* nt) {  // for default method
             nt->_net_send_buffer[idx] = i;
         }
     }
-    nrn_pragma_acc(wait(nt->stream_id))
+    nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(taskwait)
     nt->_net_send_buffer_cnt = net_send_buf_count;
 
     if (nt->compute_gpu && nt->_net_send_buffer_cnt) {
 #ifdef CORENEURON_ENABLE_GPU
         int* nsbuffer = nt->_net_send_buffer;
 #endif
-        nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt]) async(nt->stream_id))
-        nrn_pragma_acc(wait(nt->stream_id))
-        nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt]))
+        nrn_pragma_acc(update host(nsbuffer [0:nt->_net_send_buffer_cnt])
+                           async(nt->streams[nt->stream_id]))
+        nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
+        // clang-format off
+        nrn_pragma_omp(target update from(nsbuffer [0:nt->_net_send_buffer_cnt])
+                           depend(inout: nt->streams[nt->stream_id]) nowait)
+        // clang-format on
+        nrn_pragma_omp(taskwait)
     }
 
     // on CPU...
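The check_thresh hunks show the host side of the pattern: the kernel and the device-to-host target update are both queued nowait against the same dependency object, and a taskwait fences them before _net_send_buffer is consumed on the host. Below is a self-contained sketch of that ordering with a toy buffer and hypothetical names, not CoreNEURON's types:

    // Standalone sketch (hypothetical names): kernel -> D2H copy -> host read,
    // ordered by one dependency object, with a single taskwait as the fence.
    // Build with an OpenMP 4.5+ offload compiler, e.g. nvc++ -mp=gpu.
    #include <cstdio>
    #include <vector>

    int main() {
        constexpr int n = 1024;
        std::vector<int> send_buffer(n, 0);
        int* buf = send_buffer.data();
        int stream_obj = 0;  // plays the role of nt->streams[nt->stream_id]
    #pragma omp target enter data map(to: buf[0:n])
        // deferred kernel, ordered via stream_obj
    #pragma omp target teams distribute parallel for depend(inout: stream_obj) nowait
        for (int i = 0; i < n; ++i)
            buf[i] = i;
        // deferred copy back; starts only after the kernel (same dependency object)
    #pragma omp target update from(buf[0:n]) depend(inout: stream_obj) nowait
        // fence: the host must not read send_buffer before both tasks finish
    #pragma omp taskwait
        std::printf("%d\n", buf[n - 1]);  // expect 1023
    #pragma omp target exit data map(delete: buf[0:n])
    }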
diff --git a/coreneuron/network/partrans.cpp b/coreneuron/network/partrans.cpp
index 4c517e999..a39458f25 100644
--- a/coreneuron/network/partrans.cpp
+++ b/coreneuron/network/partrans.cpp
@@ -56,14 +56,17 @@ void nrnmpi_v_transfer() {
         nrn_pragma_acc(parallel loop present(src_indices [0:n_src_gather],
                                              src_data [0:nt->_ndata],
                                              src_gather [0:n_src_gather]) if (nt->compute_gpu)
-                           async(nt->stream_id))
+                           async(nt->streams[nt->stream_id]))
         nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
         for (int i = 0; i < n_src_gather; ++i) {
             src_gather[i] = src_data[src_indices[i]];
         }
         nrn_pragma_acc(update host(src_gather [0:n_src_gather]) if (nt->compute_gpu)
-                           async(nt->stream_id))
-        nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu))
+                           async(nt->streams[nt->stream_id]))
+        // clang-format off
+        nrn_pragma_omp(target update from(src_gather [0:n_src_gather]) if (nt->compute_gpu)
+                           depend(inout: nt->streams[nt->stream_id]) nowait)
+        // clang-format on
     }
 
     // copy gathered source values to outsrc_buf_
@@ -71,7 +74,7 @@ void nrnmpi_v_transfer() {
     for (int tid = 0; tid < nrn_nthread; ++tid) {
         if (nrn_threads[tid].compute_gpu) {
             compute_gpu = true;
-            nrn_pragma_acc(wait(nrn_threads[tid].stream_id))
+            nrn_pragma_acc(wait async(nrn_threads[tid].streams[nrn_threads[tid].stream_id]))
         }
         TransferThreadData& ttd = transfer_thread_data_[tid];
         size_t n_outsrc_indices = ttd.outsrc_indices.size();
@@ -122,8 +125,8 @@ void nrnthread_v_transfer(NrnThread* _nt) {
     nrn_pragma_acc(parallel loop present(insrc_indices [0:ntar],
                                          tar_data [0:ndata],
                                          insrc_buf_ [0:n_insrc_buf]) if (_nt->compute_gpu)
-                       async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu))
+                       async(_nt->streams[_nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd map(to: tar_indices[0:ntar]) if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
     for (size_t i = 0; i < ntar; ++i) {
         tar_data[tar_indices[i]] = insrc_buf_[insrc_indices[i]];
     }
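nrnmpi_v_transfer gathers on each thread's own queue and only afterwards loops over all threads issuing waits, so gathers from different NrnThreads may overlap before the MPI exchange. The sketch below (hypothetical names and sizes) has the same shape: tasks pinned to different dependency objects are free to run concurrently, and a single taskwait drains them all:

    // Standalone sketch (hypothetical names/sizes): one dependency object per
    // queue; tasks on different objects are unordered, drained once before MPI.
    #include <vector>

    int main() {
        constexpr int nthread = 2, n = 1 << 16;
        std::vector<double> src_gather(nthread * n, 1.0);
        std::vector<int> streams(nthread);
        double* p = src_gather.data();
        int* s = streams.data();
    #pragma omp target enter data map(to: p[0:nthread * n])
        for (int t = 0; t < nthread; ++t) {
            double* pt = p + t * n;
            // tasks naming different elements of s[] may overlap on the device
    #pragma omp target teams distribute parallel for depend(inout: s[t]) nowait
            for (int i = 0; i < n; ++i)
                pt[i] *= 2.0;
    #pragma omp target update from(pt[0:n]) depend(inout: s[t]) nowait
        }
    #pragma omp taskwait  // drain every queue before the host (e.g. MPI) reads
    #pragma omp target exit data map(delete: p[0:nthread * n])
        return src_gather[0] == 2.0 ? 0 : 1;
    }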
diff --git a/coreneuron/permute/cellorder.cpp b/coreneuron/permute/cellorder.cpp
index 14feb31de..c9a93bcd9 100644
--- a/coreneuron/permute/cellorder.cpp
+++ b/coreneuron/permute/cellorder.cpp
@@ -600,14 +600,18 @@ void solve_interleaved2(int ith) {
     defined(_OPENACC)
     int nstride = stridedispl[nwarp];
 #endif
-    nrn_pragma_acc(parallel loop gang vector vector_length(
-        warpsize) present(nt [0:1],
-                          strides [0:nstride],
-                          ncycles [0:nwarp],
-                          stridedispl [0:nwarp + 1],
-                          rootbegin [0:nwarp + 1],
-                          nodebegin [0:nwarp + 1]) if (nt->compute_gpu) async(nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
+    nrn_pragma_acc(parallel loop gang vector vector_length(warpsize)
+                       present(nt [0:1],
+                               strides [0:nstride],
+                               ncycles [0:nwarp],
+                               stridedispl [0:nwarp + 1],
+                               rootbegin [0:nwarp + 1],
+                               nodebegin [0:nwarp + 1]) if (nt->compute_gpu)
+                       async(nt->streams[nt->stream_id]))
+    // clang-format off
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu)
+                       depend(inout: nt->streams[nt->stream_id]) nowait)
+    // clang-format on
     for (int icore = 0; icore < ncore; ++icore) {
         int iwarp = icore / warpsize;     // figure out the >> value
         int ic = icore & (warpsize - 1);  // figure out the & mask
@@ -626,7 +630,8 @@ void solve_interleaved2(int ith) {
         }  // serial test mode
 #endif
     }
-    nrn_pragma_acc(wait(nt->stream_id))
+    nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(taskwait)
 #ifdef _OPENACC
 }
 #endif
@@ -660,14 +665,15 @@ void solve_interleaved1(int ith) {
                                          firstnode [0:ncell],
                                          lastnode [0:ncell],
                                          cellsize [0:ncell]) if (nt->compute_gpu)
-                       async(nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
+                       async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
     for (int icell = 0; icell < ncell; ++icell) {
         int icellsize = cellsize[icell];
         triang_interleaved(nt, icell, icellsize, nstride, stride, lastnode);
         bksub_interleaved(nt, icell, icellsize, nstride, stride, firstnode);
     }
-    nrn_pragma_acc(wait(nt->stream_id))
+    nrn_pragma_acc(wait async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(taskwait)
 }
 
 void solve_interleaved(int ith) {
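A subtlety in these hunks: wait(q) and wait async(q) are not synonyms. As I read the OpenACC spec, acc wait(q) blocks the host thread until queue q drains, while acc wait async(q) returns immediately and merely enqueues the join on q, which is presumably why an explicit taskwait now accompanies it on the OpenMP path. A two-function sketch of the distinction, assuming an OpenACC compiler such as nvc++ with -acc (verify against the targeted OpenACC version before relying on it):

    // Sketch: contrast of blocking vs enqueued OpenACC waits.
    void sync_blocking(int q) {
    #pragma acc wait(q)  // host blocks until queue q has drained
    }
    void sync_enqueued(int q) {
    #pragma acc wait async(q)  // host returns at once; the join is queued on q
    }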
diff --git a/coreneuron/sim/fadvance_core.cpp b/coreneuron/sim/fadvance_core.cpp
index ab6fc4bfb..ad71f4c7a 100644
--- a/coreneuron/sim/fadvance_core.cpp
+++ b/coreneuron/sim/fadvance_core.cpp
@@ -79,7 +79,7 @@ void dt2thread(double adt) { /* copied from nrnoc/fadvance.c */
             nt->cj = 1.0 / dt;
         }
         nrn_pragma_acc(update device(nt->_t, nt->_dt, nt->cj)
-                           async(nt->stream_id) if (nt->compute_gpu))
+                           async(nt->streams[nt->stream_id]) if (nt->compute_gpu))
         // clang-format off
         nrn_pragma_omp(target update to(nt->_t, nt->_dt, nt->cj)
                            if(nt->compute_gpu))
@@ -206,15 +206,15 @@ void update(NrnThread* _nt) {
     /* do not need to worry about linmod or extracellular*/
     if (secondorder) {
         nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu)
-                           async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+                           async(_nt->streams[_nt->stream_id]))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
         for (int i = 0; i < i2; ++i) {
             vec_v[i] += 2. * vec_rhs[i];
         }
     } else {
         nrn_pragma_acc(parallel loop present(vec_v [0:i2], vec_rhs [0:i2]) if (_nt->compute_gpu)
-                           async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu))
+                           async(_nt->streams[_nt->stream_id]))
+        nrn_pragma_omp(target teams distribute parallel for simd if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
         for (int i = 0; i < i2; ++i) {
             vec_v[i] += vec_rhs[i];
         }
@@ -295,8 +295,8 @@ void nrncore2nrn_send_values(NrnThread* nth) {
         assert(vs < tr->bsize);
 
         nrn_pragma_acc(parallel loop present(tr [0:1]) if (nth->compute_gpu)
-                           async(nth->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu))
+                           async(nth->streams[nth->stream_id]))
+        nrn_pragma_omp(target teams distribute parallel for simd if(nth->compute_gpu) depend(inout: nth->streams[nth->stream_id]) nowait)
         for (int i = 0; i < tr->n_trajec; ++i) {
             tr->varrays[i][vs] = *tr->gather[i];
         }
@@ -316,10 +316,13 @@ void nrncore2nrn_send_values(NrnThread* nth) {
         for (int i = 0; i < tr->n_trajec; ++i) {
             double* gather_i = tr->gather[i];
             nrn_pragma_acc(update self(gather_i [0:1]) if (nth->compute_gpu)
-                               async(nth->stream_id))
-            nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu))
+                               async(nth->streams[nth->stream_id]))
+            // clang-format off
+            nrn_pragma_omp(target update from(gather_i [0:1]) if (nth->compute_gpu)
+                               depend(inout: nth->streams[nth->stream_id]) nowait)
+            // clang-format on
         }
-        nrn_pragma_acc(wait(nth->stream_id))
+        nrn_pragma_acc(wait async(nth->streams[nth->stream_id]))
         for (int i = 0; i < tr->n_trajec; ++i) {
             *(tr->scatter[i]) = *(tr->gather[i]);
         }
@@ -341,8 +344,9 @@ static void* nrn_fixed_step_thread(NrnThread* nth) {
     if (nth->ncell) {
         /*@todo: do we need to update nth->_t on GPU: Yes (Michael, but can launch kernel) */
-        nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id))
-        nrn_pragma_acc(wait(nth->stream_id))
+        nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu)
+                           async(nth->streams[nth->stream_id]))
+        nrn_pragma_acc(wait async(nth->streams[nth->stream_id]))
         nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu))
 
         fixed_play_continuous(nth);
@@ -377,8 +381,9 @@ void* nrn_fixed_step_lastpart(NrnThread* nth) {
     if (nth->ncell) {
         /*@todo: do we need to update nth->_t on GPU */
-        nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu) async(nth->stream_id))
-        nrn_pragma_acc(wait(nth->stream_id))
+        nrn_pragma_acc(update device(nth->_t) if (nth->compute_gpu)
+                           async(nth->streams[nth->stream_id]))
+        nrn_pragma_acc(wait async(nth->streams[nth->stream_id]))
         nrn_pragma_omp(target update to(nth->_t) if (nth->compute_gpu))
         fixed_play_continuous(nth);
         nonvint(nth);
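In nrncore2nrn_send_values every trajectory pulls back a single double. Queuing all of those one-element updates nowait on one dependency object and paying for a single synchronization at the end is the point of the change. A toy version of that shape, with hypothetical names:

    // Standalone sketch (hypothetical names): many 1-element D2H copies queued
    // on one dependency object, a single taskwait amortizing the sync cost.
    #include <vector>

    int main() {
        constexpr int n_trajec = 8;
        std::vector<double> gather(n_trajec, 0.0);
        double* g = gather.data();
        int stream_obj = 0;
    #pragma omp target enter data map(to: g[0:n_trajec])
    #pragma omp target teams distribute parallel for depend(inout: stream_obj) nowait
        for (int i = 0; i < n_trajec; ++i)
            g[i] = 42.0 + i;
        for (int i = 0; i < n_trajec; ++i) {
            double* gather_i = g + i;
            // each tiny copy is queued, not synchronized individually
    #pragma omp target update from(gather_i[0:1]) depend(inout: stream_obj) nowait
        }
    #pragma omp taskwait  // one fence for the kernel plus all n_trajec copies
    #pragma omp target exit data map(delete: g[0:n_trajec])
        return gather[3] == 45.0 ? 0 : 1;
    }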
diff --git a/coreneuron/sim/fast_imem.cpp b/coreneuron/sim/fast_imem.cpp
index 1218b7967..52b118b53 100644
--- a/coreneuron/sim/fast_imem.cpp
+++ b/coreneuron/sim/fast_imem.cpp
@@ -52,8 +52,8 @@ void nrn_calc_fast_imem(NrnThread* nt) {
     double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
     nrn_pragma_acc(
         parallel loop present(vec_rhs, vec_area, fast_imem_d, fast_imem_rhs) if (nt->compute_gpu)
-            async(nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
+            async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
     for (int i = i1; i < i3; ++i) {
         fast_imem_rhs[i] = (fast_imem_d[i] * vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;
     }
@@ -69,8 +69,8 @@
 void nrn_calc_fast_imem_init(NrnThread* nt) {
     double* fast_imem_rhs = nt->nrn_fast_imem->nrn_sav_rhs;
     nrn_pragma_acc(parallel loop present(vec_rhs, vec_area, fast_imem_rhs) if (nt->compute_gpu)
-                       async(nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu))
+                       async(nt->streams[nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for simd if(nt->compute_gpu) depend(inout: nt->streams[nt->stream_id]) nowait)
     for (int i = i1; i < i3; ++i) {
         fast_imem_rhs[i] = (vec_rhs[i] + fast_imem_rhs[i]) * vec_area[i] * 0.01;
     }
diff --git a/coreneuron/sim/multicore.cpp b/coreneuron/sim/multicore.cpp
index d5368a29c..aeee91352 100644
--- a/coreneuron/sim/multicore.cpp
+++ b/coreneuron/sim/multicore.cpp
@@ -15,6 +15,10 @@
 #include "coreneuron/coreneuron.hpp"
 #include "coreneuron/utils/nrnoc_aux.hpp"
 
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
 /*
 Now that threads have taken over the actual_v, v_node, etc, it might be
 a good time to regularize the method of freeing, allocating, and
@@ -106,6 +110,9 @@ void nrn_threads_create(int n) {
         for (int i = 0; i < nrn_nthread; ++i) {
             NrnThread& nt = nrn_threads[i];
             nt.id = i;
+#if defined(_OPENMP)
+            nt.streams.resize(omp_get_max_threads());
+#endif
             for (int j = 0; j < BEFORE_AFTER_SIZE; ++j) {
                 nt.tbl[j] = nullptr;
             }
diff --git a/coreneuron/sim/multicore.hpp b/coreneuron/sim/multicore.hpp
index c108e2431..44189191d 100644
--- a/coreneuron/sim/multicore.hpp
+++ b/coreneuron/sim/multicore.hpp
@@ -130,9 +130,11 @@ struct NrnThread: public MemoryManaged {
 
     NrnThreadBAList* tbl[BEFORE_AFTER_SIZE]; /* wasteful since almost all empty */
 
-    int shadow_rhs_cnt = 0; /* added to facilitate the NrnThread transfer to GPU */
-    int compute_gpu = 0;    /* define whether to compute with gpus */
-    int stream_id = 0;      /* define where the kernel will be launched on GPU stream */
+    int shadow_rhs_cnt = 0;   /* added to facilitate the NrnThread transfer to GPU */
+    int compute_gpu = 0;      /* define whether to compute with gpus */
+    int stream_id = 0;        /* define where the kernel will be launched on GPU stream */
+    std::vector<int> streams; /* vector of stream ids needed for async execution of OpenMP in
+                                 multiple streams */
     int _net_send_buffer_size = 0;
     int _net_send_buffer_cnt = 0;
     int* _net_send_buffer = nullptr;
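Two details in the NrnThread changes need care. The member must be spelled std::vector<int> streams, with <vector> included, for multicore.hpp to compile. And the container must be sized, not merely reserved, before Phase2::populate writes nt.streams[nt.stream_id]: reserve() only changes capacity, so operator[] beyond size() is undefined behavior, and omp_get_num_threads() returns 1 outside a parallel region, whereas omp_get_max_threads() gives the intended per-process bound. A minimal sketch of the sized-then-filled pattern:

    // Sketch: size the per-thread stream table up front, then let each OpenMP
    // thread claim its slot (mirrors what phase2.cpp does per NrnThread).
    #include <omp.h>
    #include <vector>

    int main() {
        std::vector<int> streams(omp_get_max_threads());  // sized, not reserved
    #pragma omp parallel
        {
            const int tid = omp_get_thread_num();
            streams[tid] = tid;  // safe: tid < streams.size()
        }
        return 0;
    }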
diff --git a/coreneuron/sim/solve_core.cpp b/coreneuron/sim/solve_core.cpp
index 60ba2b660..2ee416242 100644
--- a/coreneuron/sim/solve_core.cpp
+++ b/coreneuron/sim/solve_core.cpp
@@ -41,7 +41,7 @@ static void triang(NrnThread* _nt) {
     nrn_pragma_acc(parallel loop seq present(
         vec_a [0:i3], vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3])
-                       async(_nt->stream_id) if (_nt->compute_gpu))
+                       async(_nt->streams[_nt->stream_id]) if (_nt->compute_gpu))
     for (int i = i3 - 1; i >= i2; --i) {
         double p = vec_a[i] / vec_d[i];
         vec_d[parent_index[i]] -= p * vec_b[i];
@@ -61,21 +61,21 @@ static void bksub(NrnThread* _nt) {
     int* parent_index = _nt->_v_parent_index;
 
     nrn_pragma_acc(parallel loop seq present(vec_d [0:i2], vec_rhs [0:i2])
-                       async(_nt->stream_id) if (_nt->compute_gpu))
+                       async(_nt->streams[_nt->stream_id]) if (_nt->compute_gpu))
     for (int i = i1; i < i2; ++i) {
         vec_rhs[i] /= vec_d[i];
     }
 
     nrn_pragma_acc(
         parallel loop seq present(vec_b [0:i3], vec_d [0:i3], vec_rhs [0:i3], parent_index [0:i3])
-            async(_nt->stream_id) if (_nt->compute_gpu))
+            async(_nt->streams[_nt->stream_id]) if (_nt->compute_gpu))
     for (int i = i2; i < i3; ++i) {
         vec_rhs[i] -= vec_b[i] * vec_rhs[parent_index[i]];
         vec_rhs[i] /= vec_d[i];
     }
     if (_nt->compute_gpu) {
-        nrn_pragma_acc(wait(_nt->stream_id))
+        nrn_pragma_acc(wait async(_nt->streams[_nt->stream_id]))
     }
 }
 }  // namespace coreneuron
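triang and bksub are inherently ordered traversals (each node updates or reads its parent), so they run as parallel loop seq, a single sequential device kernel, but still on the thread's queue; that keeps them ordered after the preceding assembly kernels without an intervening host synchronization. The back-substitution step has roughly this shape, as a sketch that assumes the arrays are already present on the device:

    // Sketch (OpenACC): an order-sensitive Hines back-substitution kept on the
    // same async queue q; present() assumes a prior mapping of all four arrays.
    void bksub_like(const double* b, const double* d, double* rhs,
                    const int* parent, int i2, int i3, int q) {
    #pragma acc parallel loop seq present(b[0:i3], d[0:i3], rhs[0:i3], parent[0:i3]) async(q)
        for (int i = i2; i < i3; ++i) {
            rhs[i] -= b[i] * rhs[parent[i]];  // needs the parent's updated value
            rhs[i] /= d[i];
        }
    }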
diff --git a/coreneuron/sim/treeset_core.cpp b/coreneuron/sim/treeset_core.cpp
index 208058fe1..42de967d0 100644
--- a/coreneuron/sim/treeset_core.cpp
+++ b/coreneuron/sim/treeset_core.cpp
@@ -33,8 +33,8 @@ static void nrn_rhs(NrnThread* _nt) {
     int* parent_index = _nt->_v_parent_index;
 
     nrn_pragma_acc(parallel loop present(vec_rhs [0:i3], vec_d [0:i3]) if (_nt->compute_gpu)
-                       async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
+                       async(_nt->streams[_nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
     for (int i = i1; i < i3; ++i) {
         vec_rhs[i] = 0.;
         vec_d[i] = 0.;
@@ -45,8 +45,8 @@ static void nrn_rhs(NrnThread* _nt) {
         double* fast_imem_rhs = _nt->nrn_fast_imem->nrn_sav_rhs;
         nrn_pragma_acc(
             parallel loop present(fast_imem_d [i1:i3], fast_imem_rhs [i1:i3]) if (_nt->compute_gpu)
-                async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
+                async(_nt->streams[_nt->stream_id]))
+        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
         for (int i = i1; i < i3; ++i) {
             fast_imem_d[i] = 0.;
             fast_imem_rhs[i] = 0.;
@@ -75,8 +75,8 @@ static void nrn_rhs(NrnThread* _nt) {
         */
         double* p = _nt->nrn_fast_imem->nrn_sav_rhs;
         nrn_pragma_acc(parallel loop present(p, vec_rhs) if (_nt->compute_gpu)
-                           async(_nt->stream_id))
-        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
+                           async(_nt->streams[_nt->stream_id]))
+        nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
         for (int i = i1; i < i3; ++i) {
             p[i] -= vec_rhs[i];
         }
@@ -92,8 +92,8 @@ static void nrn_rhs(NrnThread* _nt) {
                                          vec_b [0:i3],
                                          vec_v [0:i3],
                                          parent_index [0:i3]) if (_nt->compute_gpu)
-                       async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
+                       async(_nt->streams[_nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
     for (int i = i2; i < i3; ++i) {
         double dv = vec_v[parent_index[i]] - vec_v[i];
         /* our connection coefficients are negative so */
@@ -152,8 +152,10 @@ static void nrn_lhs(NrnThread* _nt) {
        so here we transform so it only has membrane current contribution
     */
     double* p = _nt->nrn_fast_imem->nrn_sav_d;
-    nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu) async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
+    nrn_pragma_acc(parallel loop present(p, vec_d) if (_nt->compute_gpu)
+                       async(_nt->streams[_nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu)
+                       depend(inout: _nt->streams[_nt->stream_id]) nowait)
     for (int i = i1; i < i3; ++i) {
         p[i] += vec_d[i];
     }
@@ -162,8 +164,8 @@ static void nrn_lhs(NrnThread* _nt) {
     /* now add the axial currents */
     nrn_pragma_acc(parallel loop present(
         vec_d [0:i3], vec_a [0:i3], vec_b [0:i3], parent_index [0:i3]) if (_nt->compute_gpu)
-                       async(_nt->stream_id))
-    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu))
+                       async(_nt->streams[_nt->stream_id]))
+    nrn_pragma_omp(target teams distribute parallel for if(_nt->compute_gpu) depend(inout: _nt->streams[_nt->stream_id]) nowait)
     for (int i = i2; i < i3; ++i) {
         nrn_pragma_acc(atomic update)
         nrn_pragma_omp(atomic update)
diff --git a/external/mod2c b/external/mod2c
index 9c1ca6ed3..a33e8df74 160000
--- a/external/mod2c
+++ b/external/mod2c
@@ -1 +1 @@
-Subproject commit 9c1ca6ed3fcefad9230b4cdf2f5e7fde6cce6d08
+Subproject commit a33e8df74f4f6bee44f8de5b397c3ff4338202c3
diff --git a/external/nmodl b/external/nmodl
index 46f8baf2b..afd70c5bb 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 46f8baf2bbeaa0d21559d6306ec37b94c601f1ee
+Subproject commit afd70c5bb258835661963efa091c8a55718ee2c0