From cc127b2c052c65bf9713820700a5dc0f0b8c2326 Mon Sep 17 00:00:00 2001
From: Jan Ciesko <jan.ciesko@gmail.com>
Date: Mon, 18 Dec 2023 15:51:24 -0700
Subject: [PATCH 01/13] Use future::is_ready (drops deprecated future::ready)

---
 .../include/upcxx_utils/flat_aggr_store.hpp   |    4 +-
 install/include/upcxx_utils/gather.hpp        |   10 +-
 install/include/upcxx_utils/reduce_prefix.hpp |   16 +-
 install/include/upcxx_utils/thread_pool.hpp   |    2 +-
 .../upcxx_utils/three_tier_aggr_store.hpp     |    4 +-
 .../upcxx_utils/two_tier_aggr_store.hpp       |   10 +-
 src/fastq.cpp                                 |    6 +-
 src/kcount/kcount_gpu.cpp                     |    2 +-
 .../include/upcxx_utils/flat_aggr_store.hpp   |    2 +-
 upcxx-utils/include/upcxx_utils/gather.hpp    |   10 +-
 .../include/upcxx_utils/reduce_prefix.hpp     |   16 +-
 .../include/upcxx_utils/thread_pool.hpp       |    2 +-
 .../upcxx_utils/three_tier_aggr_store.hpp     |    2 +-
 .../upcxx_utils/two_tier_aggr_store.hpp       | 1962 +++++++++++++++++
 upcxx-utils/src/limit_outstanding.cpp         |   14 +-
 upcxx-utils/src/ofstream.cpp                  |    2 +-
 upcxx-utils/src/promise_collectives.cpp       |    4 +-
 upcxx-utils/src/reduce_prefix.cpp             |    8 +-
 upcxx-utils/src/timers.cpp                    |  698 ++++++
 upcxx-utils/test/test_ofstream.cpp            |    2 +-
 upcxx-utils/test/test_promise_collectives.cpp |   44 +-
 21 files changed, 2740 insertions(+), 80 deletions(-)
 create mode 100644 upcxx-utils/include/upcxx_utils/two_tier_aggr_store.hpp
 create mode 100644 upcxx-utils/src/timers.cpp

diff --git a/install/include/upcxx_utils/flat_aggr_store.hpp b/install/include/upcxx_utils/flat_aggr_store.hpp
index 16bb3d7..a6b9f5d 100644
--- a/install/include/upcxx_utils/flat_aggr_store.hpp
+++ b/install/include/upcxx_utils/flat_aggr_store.hpp
@@ -469,7 +469,7 @@ class FlatAggrStore {
       do {
         rpc_counts->progress_timer.progress();  // call progress after firing a rpc
         fut = limit_outstanding_futures(fut);
-      } while (!fut.ready());
+      } while (!fut.is_ready());
     }
 
     CountType max_vals[2], sum_vals[2];
@@ -478,7 +478,7 @@ class FlatAggrStore {
 
     DBG("flush_updates() waiting for counts\n");
     auto fut_done = flush_outstanding_futures_async();
-    while (!fut_done.ready()) {
+    while (!fut_done.is_ready()) {
       rpc_counts->progress_timer.discharge();
     }
 
diff --git a/install/include/upcxx_utils/gather.hpp b/install/include/upcxx_utils/gather.hpp
index 521e99e..812cfc7 100644
--- a/install/include/upcxx_utils/gather.hpp
+++ b/install/include/upcxx_utils/gather.hpp
@@ -225,7 +225,7 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu
     if (is_sending) {
       DBG_VERBOSE("is sending level=", level, "\n");
       assert(!is_receiving && "sending is not also receiving");
-      assert(!workflow.prom_buffer_filled.get_future().ready() && "sending buffer has not been filled before workflow is prepared");
+      assert(!workflow.prom_buffer_filled.get_future().is_ready() && "sending buffer has not been filled before workflow is prepared");
       if (!have_received) {
         assert(send_buf);
         assert(send_count != 0);
@@ -280,11 +280,11 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu
         DBG_VERBOSE("Sent rpc to dest_rrank=", dest_rrank, ", dest_rank=", dest_rank, " rrank=", rrank, " level=", level,
                     " count=", sending_size, " ", get_size_str(sending_size * sizeof(T)), "\n");
         // make buffer available for next level (may never be needed but steps workflow to completion)
-        assert(!workflow.prom_buffer_filled.get_future().ready() && "sending buffer that was just used has not been filled yet");
+        assert(!workflow.prom_buffer_filled.get_future().is_ready() && "sending buffer that was just used has not been filled yet");
         workflow.prom_buffer_filled.fulfill_anonymous(1);
       });
       if (!have_received)
-        assert(fut.ready() &&
+        assert(fut.is_ready() &&
                "first sending level always immediately executes ensuring send_buf is ready for reuse on exit of binomial_gather");
       have_sent = true;
       // not done until I have sent my message
@@ -295,7 +295,7 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu
     if (!is_sending & !is_receiving) {
       // some ranks on some levels have nothing to do but make the buffer available
       DBG_VERBOSE("idle level=", level, "\n");
-      assert(!workflow.prom_buffer_filled.get_future().ready());
+      assert(!workflow.prom_buffer_filled.get_future().is_ready());
       workflow.prom_buffer_filled.fulfill_anonymous(1);
       if (!have_received && !have_sent) workflow.prom_buffer.fulfill_result(ShBuffer());
     }
@@ -309,7 +309,7 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu
                        LevelWorkflow& next = dist_workflows->level(level + 1);
                        DBG_VERBOSE("Setting buffer for next level: ", level + 1, " size=", sh_buf->size(),
                                    " cap=", sh_buf->capacity(), "\n");
-                       if (!next.prom_buffer.get_future().ready()) {
+                       if (!next.prom_buffer.get_future().is_ready()) {
                          // fulfill buffer for next level on this rank for it to send or recv
                          next.prom_buffer.fulfill_result(std::move(sh_buf));
                        } else {
diff --git a/install/include/upcxx_utils/reduce_prefix.hpp b/install/include/upcxx_utils/reduce_prefix.hpp
index f7188be..04a33ae 100644
--- a/install/include/upcxx_utils/reduce_prefix.hpp
+++ b/install/include/upcxx_utils/reduce_prefix.hpp
@@ -425,7 +425,7 @@ upcxx::future<> reduce_prefix_binary_tree_up(ShDistData<T, BinaryOp> sh_dist_dat
           // scratch has partial_to_parent from right (j+1 ... rr) if there is a right
           // if there is a left child, dst already has applied from left (ll ... j)
           // calculate partial_to_parent as (ll ... rr) in scratch.
-          assert(proms.scratch_is_partial_right.get_future().ready());
+          assert(proms.scratch_is_partial_right.get_future().is_ready());
 
           if (my_node.right < my_node.n) {
             assert(!sh_scratch->empty());
@@ -433,7 +433,7 @@ upcxx::future<> reduce_prefix_binary_tree_up(ShDistData<T, BinaryOp> sh_dist_dat
           T *partial_right = sh_scratch->data();
           T *partial_left_right = sh_scratch->data();
 
-          assert(proms.dst_is_partial_left_me.get_future().ready());
+          assert(proms.dst_is_partial_left_me.get_future().is_ready());
           const T *partial_left_me = my_node.left < my_node.me ? dst : src;
           const T *send_to_parent = partial_left_me;
 
@@ -512,11 +512,11 @@ upcxx::future<> reduce_prefix_binary_tree_down(ShDistData<T, BinaryOp> sh_dist_d
   }
 
   // check that upstage is completed
-  assert(proms.ready_for_up.get_future().ready());
-  assert(proms.dst_is_partial_left_me.get_future().ready());
-  assert(proms.scratch_is_partial_right.get_future().ready());
-  assert(proms.scratch_is_partial_to_parent.get_future().ready());
-  assert(proms.sent_partial_to_parent.get_future().ready());
+  assert(proms.ready_for_up.get_future().is_ready());
+  assert(proms.dst_is_partial_left_me.get_future().is_ready());
+  assert(proms.scratch_is_partial_right.get_future().is_ready());
+  assert(proms.scratch_is_partial_to_parent.get_future().is_ready());
+  assert(proms.sent_partial_to_parent.get_future().is_ready());
 
   // step 4 down
   // receive from parent
@@ -555,7 +555,7 @@ upcxx::future<> reduce_prefix_binary_tree_down(ShDistData<T, BinaryOp> sh_dist_d
     rpcs_sent = rpcs_sent.then(
         [sh_dist_data, dst = dst, count = count, sh_scratch = sh_scratch, child, my_node = my_node, &proms, &team]() {
           assert(proms.up_ready());
-          assert(proms.scratch_is_partial_from_parent.get_future().ready());
+          assert(proms.scratch_is_partial_from_parent.get_future().is_ready());
           const T *send_data;
           if (child < my_node.me) {
             // relay just a copy from my parent (0 ... ll-1)
diff --git a/install/include/upcxx_utils/thread_pool.hpp b/install/include/upcxx_utils/thread_pool.hpp
index 8b27992..864c06f 100644
--- a/install/include/upcxx_utils/thread_pool.hpp
+++ b/install/include/upcxx_utils/thread_pool.hpp
@@ -94,7 +94,7 @@ class ThreadPool {
   template <typename Func, class... Args>
   static upcxx::future<> &enqueue_in_single_pool_serially(upcxx::future<> &serial_fut, Func &&func, Args &&... args) {
     assert(upcxx::master_persona().active_with_caller() && "Called from master persona");
-    DBG_VERBOSE("enqueue_in_single_pool_serially: ", &serial_fut, " ", (serial_fut.ready() ? "ready" : "NOT READY"), "\n");
+    DBG_VERBOSE("enqueue_in_single_pool_serially: ", &serial_fut, " ", (serial_fut.is_ready() ? "ready" : "NOT READY"), "\n");
 
     using return_t = typename std::invoke_result<Func, Args...>::type;
     static_assert(std::is_void<return_t>::value, "void is the required return type for enqueue_in_serial_pool");
diff --git a/install/include/upcxx_utils/three_tier_aggr_store.hpp b/install/include/upcxx_utils/three_tier_aggr_store.hpp
index c48c473..232ed78 100644
--- a/install/include/upcxx_utils/three_tier_aggr_store.hpp
+++ b/install/include/upcxx_utils/three_tier_aggr_store.hpp
@@ -1080,11 +1080,11 @@ class ThreeTierAggrStore : public FlatAggrStore<T, Data...> {
         do {
           tt_rpc_counts->progress_timer.progress();  // call progress after firing a rpc
           fut = limit_outstanding_futures(fut);
-        } while (!fut.ready());
+        } while (!fut.is_ready());
       }
     }
     auto fut_done = flush_outstanding_futures_async();
-    while (!fut_done.ready()) {
+    while (!fut_done.is_ready()) {
       tt_rpc_counts->progress_timer.discharge();
     }
 
diff --git a/install/include/upcxx_utils/two_tier_aggr_store.hpp b/install/include/upcxx_utils/two_tier_aggr_store.hpp
index 25be1b8..663a39d 100644
--- a/install/include/upcxx_utils/two_tier_aggr_store.hpp
+++ b/install/include/upcxx_utils/two_tier_aggr_store.hpp
@@ -503,7 +503,7 @@ class FixedMemoryRPC {
       dest_stores.reserve(num_stores);
       for (int i = 0; i < num_stores; i++) {
         auto fut = global_dispatcher.pop();
-        if (!fut.ready()) {
+        if (!fut.is_ready()) {
           DIE("Detected a global block that is not ready! i=", i, " available_size=", global_dispatcher.available_size(), "\n");
         }
         dest_stores.push_back(fut.wait());
@@ -1121,7 +1121,7 @@ class TwoTierAggrStore {
                         assert(store_block.first);
                         assert(store_block.first.where() == rank_me());
                         auto lblock_fut = inter_fixed_mem->pop_global(true);
-                        assert(lblock_fut.ready());
+                        assert(lblock_fut.is_ready());
                         auto lblock = lblock_fut.result();
                         assert(lblock.first);
                         assert(lblock.first.where() == rank_me());
@@ -1320,7 +1320,7 @@ class TwoTierAggrStore {
     } else {
       assert(!gblock.first);
       auto fut = replace_intra_store(gblock, intra_fixed_mem);
-      if (!fut.ready()) DBG(__func__, " will wait\n");
+      if (!fut.is_ready()) DBG(__func__, " will wait\n");
       fut.wait();
     }
     assert(gblock.first);
@@ -1403,7 +1403,7 @@ class TwoTierAggrStore {
     assert(gblock.second > 0);
     auto fut = update_remote_inter_nb(target_rank, gblock);
     DBG(__func__, " my_progress\n");
-    if (!fut.ready()) {
+    if (!fut.is_ready()) {
       DBG(__func__, " still waiting on inter dest store\n");
     }
     fut.wait();
@@ -1420,7 +1420,7 @@ class TwoTierAggrStore {
     gblock = {};  // invalidate it
     auto fut = replace_inter_store(gblock, inter_fixed_memory_store);
     send_inter_rpc(split_rank::get_rank_from_node(node), sendBlock);  // send to dedicated rank on remote node
-    if (!fut.ready()) DBG("intra dest store is not immediately ready\n");
+    if (!fut.is_ready()) DBG("intra dest store is not immediately ready\n");
     return fut;
   }
 
diff --git a/src/fastq.cpp b/src/fastq.cpp
index 3697abc..231b21a 100644
--- a/src/fastq.cpp
+++ b/src/fastq.cpp
@@ -484,7 +484,7 @@ void FastqReader::seek() {
 }
 
 FastqReader::~FastqReader() {
-  if (!open_fut.ready()) {
+  if (!open_fut.is_ready()) {
     WARN("Destructor called before opening completed\n");
     open_fut.wait();
   }
@@ -502,7 +502,7 @@ string FastqReader::get_fname() { return fname; }
 size_t FastqReader::my_file_size() { return end_read - start_read + (fqr2 ? fqr2->my_file_size() : 0); }
 
 size_t FastqReader::get_next_fq_record(string &id, string &seq, string &quals, bool wait_open) {
-  if (wait_open && !open_fut.ready()) {
+  if (wait_open && !open_fut.is_ready()) {
     WARN("Attempt to read ", fname, " before it is ready. wait on open_fut first to avoid this warning!\n");
     open_fut.wait();
   }
@@ -554,7 +554,7 @@ int FastqReader::get_max_read_len() { return std::max(max_read_len, fqr2 ? fqr2-
 
 
 void FastqReader::reset() {
-  if (!open_fut.ready()) {
+  if (!open_fut.is_ready()) {
     open_fut.wait();
   }
   if (!f) {
diff --git a/src/kcount/kcount_gpu.cpp b/src/kcount/kcount_gpu.cpp
index d603938..c1e5da0 100644
--- a/src/kcount/kcount_gpu.cpp
+++ b/src/kcount/kcount_gpu.cpp
@@ -101,7 +101,7 @@ static void process_block(SeqBlockInserter<MAX_K> *seq_block_inserter, dist_obje
   state->num_block_calls++;
   future<bool> fut = execute_in_thread_pool(
       [&state, &num_valid_kmers] { return state->pnp_gpu_driver->process_seq_block(state->seq_block, num_valid_kmers); });
-  while (!fut.ready()) {
+  while (!fut.is_ready()) {
     state->num_pnp_gpu_waits++;
     progress();
   }
diff --git a/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp b/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp
index dc11e3a..b96e2ad 100644
--- a/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp
+++ b/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp
@@ -466,7 +466,7 @@ class FlatAggrStore {
       do {
        
         fut = limit_outstanding_futures(fut);
-      } while (!fut.ready());
+      } while (!fut.is_ready());
     }
 
     CountType max_vals[2], sum_vals[2];
diff --git a/upcxx-utils/include/upcxx_utils/gather.hpp b/upcxx-utils/include/upcxx_utils/gather.hpp
index 521e99e..812cfc7 100644
--- a/upcxx-utils/include/upcxx_utils/gather.hpp
+++ b/upcxx-utils/include/upcxx_utils/gather.hpp
@@ -225,7 +225,7 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu
     if (is_sending) {
       DBG_VERBOSE("is sending level=", level, "\n");
       assert(!is_receiving && "sending is not also receiving");
-      assert(!workflow.prom_buffer_filled.get_future().ready() && "sending buffer has not been filled before workflow is prepared");
+      assert(!workflow.prom_buffer_filled.get_future().is_ready() && "sending buffer has not been filled before workflow is prepared");
       if (!have_received) {
         assert(send_buf);
         assert(send_count != 0);
@@ -280,11 +280,11 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu
         DBG_VERBOSE("Sent rpc to dest_rrank=", dest_rrank, ", dest_rank=", dest_rank, " rrank=", rrank, " level=", level,
                     " count=", sending_size, " ", get_size_str(sending_size * sizeof(T)), "\n");
         // make buffer available for next level (may never be needed but steps workflow to completion)
-        assert(!workflow.prom_buffer_filled.get_future().ready() && "sending buffer that was just used has not been filled yet");
+        assert(!workflow.prom_buffer_filled.get_future().is_ready() && "sending buffer that was just used has not been filled yet");
         workflow.prom_buffer_filled.fulfill_anonymous(1);
       });
       if (!have_received)
-        assert(fut.ready() &&
+        assert(fut.is_ready() &&
                "first sending level always immediately executes ensuring send_buf is ready for reuse on exit of binomial_gather");
       have_sent = true;
       // not done until I have sent my message
@@ -295,7 +295,7 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu
     if (!is_sending & !is_receiving) {
       // some ranks on some levels have nothing to do but make the buffer available
       DBG_VERBOSE("idle level=", level, "\n");
-      assert(!workflow.prom_buffer_filled.get_future().ready());
+      assert(!workflow.prom_buffer_filled.get_future().is_ready());
       workflow.prom_buffer_filled.fulfill_anonymous(1);
       if (!have_received && !have_sent) workflow.prom_buffer.fulfill_result(ShBuffer());
     }
@@ -309,7 +309,7 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu
                        LevelWorkflow& next = dist_workflows->level(level + 1);
                        DBG_VERBOSE("Setting buffer for next level: ", level + 1, " size=", sh_buf->size(),
                                    " cap=", sh_buf->capacity(), "\n");
-                       if (!next.prom_buffer.get_future().ready()) {
+                       if (!next.prom_buffer.get_future().is_ready()) {
                          // fulfill buffer for next level on this rank for it to send or recv
                          next.prom_buffer.fulfill_result(std::move(sh_buf));
                        } else {
diff --git a/upcxx-utils/include/upcxx_utils/reduce_prefix.hpp b/upcxx-utils/include/upcxx_utils/reduce_prefix.hpp
index f7188be..04a33ae 100644
--- a/upcxx-utils/include/upcxx_utils/reduce_prefix.hpp
+++ b/upcxx-utils/include/upcxx_utils/reduce_prefix.hpp
@@ -425,7 +425,7 @@ upcxx::future<> reduce_prefix_binary_tree_up(ShDistData<T, BinaryOp> sh_dist_dat
           // scratch has partial_to_parent from right (j+1 ... rr) if there is a right
           // if there is a left child, dst already has applied from left (ll ... j)
           // calculate partial_to_parent as (ll ... rr) in scratch.
-          assert(proms.scratch_is_partial_right.get_future().ready());
+          assert(proms.scratch_is_partial_right.get_future().is_ready());
 
           if (my_node.right < my_node.n) {
             assert(!sh_scratch->empty());
@@ -433,7 +433,7 @@ upcxx::future<> reduce_prefix_binary_tree_up(ShDistData<T, BinaryOp> sh_dist_dat
           T *partial_right = sh_scratch->data();
           T *partial_left_right = sh_scratch->data();
 
-          assert(proms.dst_is_partial_left_me.get_future().ready());
+          assert(proms.dst_is_partial_left_me.get_future().is_ready());
           const T *partial_left_me = my_node.left < my_node.me ? dst : src;
           const T *send_to_parent = partial_left_me;
 
@@ -512,11 +512,11 @@ upcxx::future<> reduce_prefix_binary_tree_down(ShDistData<T, BinaryOp> sh_dist_d
   }
 
   // check that upstage is completed
-  assert(proms.ready_for_up.get_future().ready());
-  assert(proms.dst_is_partial_left_me.get_future().ready());
-  assert(proms.scratch_is_partial_right.get_future().ready());
-  assert(proms.scratch_is_partial_to_parent.get_future().ready());
-  assert(proms.sent_partial_to_parent.get_future().ready());
+  assert(proms.ready_for_up.get_future().is_ready());
+  assert(proms.dst_is_partial_left_me.get_future().is_ready());
+  assert(proms.scratch_is_partial_right.get_future().is_ready());
+  assert(proms.scratch_is_partial_to_parent.get_future().is_ready());
+  assert(proms.sent_partial_to_parent.get_future().is_ready());
 
   // step 4 down
   // receive from parent
@@ -555,7 +555,7 @@ upcxx::future<> reduce_prefix_binary_tree_down(ShDistData<T, BinaryOp> sh_dist_d
     rpcs_sent = rpcs_sent.then(
         [sh_dist_data, dst = dst, count = count, sh_scratch = sh_scratch, child, my_node = my_node, &proms, &team]() {
           assert(proms.up_ready());
-          assert(proms.scratch_is_partial_from_parent.get_future().ready());
+          assert(proms.scratch_is_partial_from_parent.get_future().is_ready());
           const T *send_data;
           if (child < my_node.me) {
             // relay just a copy from my parent (0 ... ll-1)
diff --git a/upcxx-utils/include/upcxx_utils/thread_pool.hpp b/upcxx-utils/include/upcxx_utils/thread_pool.hpp
index cc92248..c04ec48 100644
--- a/upcxx-utils/include/upcxx_utils/thread_pool.hpp
+++ b/upcxx-utils/include/upcxx_utils/thread_pool.hpp
@@ -94,7 +94,7 @@ class ThreadPool {
   template <typename Func, class... Args>
   static upcxx::future<> &enqueue_in_single_pool_serially(upcxx::future<> &serial_fut, Func &&func, Args &&... args) {
     assert(upcxx::master_persona().active_with_caller() && "Called from master persona");
-    DBG_VERBOSE("enqueue_in_single_pool_serially: ", &serial_fut, " ", (serial_fut.ready() ? "ready" : "NOT READY"), "\n");
+    DBG_VERBOSE("enqueue_in_single_pool_serially: ", &serial_fut, " ", (serial_fut.is_ready() ? "ready" : "NOT READY"), "\n");
 
     using return_t = typename std::invoke_result<Func, Args...>::type;
     static_assert(std::is_void<return_t>::value, "void is the required return type for enqueue_in_serial_pool");
diff --git a/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp b/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp
index a60f89f..c4f8b43 100644
--- a/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp
+++ b/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp
@@ -1073,7 +1073,7 @@ class ThreeTierAggrStore : public FlatAggrStore<T, Data...> {
         do {
           
           fut = limit_outstanding_futures(fut);
-        } while (!fut.ready());
+        } while (!fut.is_ready());
       }
     }
     auto fut_done = flush_outstanding_futures_async();
diff --git a/upcxx-utils/include/upcxx_utils/two_tier_aggr_store.hpp b/upcxx-utils/include/upcxx_utils/two_tier_aggr_store.hpp
new file mode 100644
index 0000000..663a39d
--- /dev/null
+++ b/upcxx-utils/include/upcxx_utils/two_tier_aggr_store.hpp
@@ -0,0 +1,1962 @@
+#pragma once
+
+#include <algorithm>
+#include <sstream>
+#include <upcxx/upcxx.hpp>
+#include <utility>
+
+#include "log.hpp"
+#include "split_rank.hpp"
+#include "timers.hpp"
+#include "version.h"
+
+using std::list;
+using std::make_shared;
+using std::ostream;
+using std::ostringstream;
+using std::pair;
+using std::shared_ptr;
+using std::string;
+using std::vector;
+
+using upcxx::barrier;
+using upcxx::dist_object;
+using upcxx::global_ptr;
+using upcxx::intrank_t;
+using upcxx::make_future;
+using upcxx::make_view;
+using upcxx::op_fast_add;
+using upcxx::op_fast_max;
+using upcxx::progress;
+using upcxx::rank_me;
+using upcxx::rank_n;
+using upcxx::reduce_all;
+using upcxx::reduce_one;
+using upcxx::rget;
+using upcxx::rpc;
+using upcxx::to_future;
+using upcxx::view;
+using upcxx::when_all;
+
+// this class aggregates updates into local buffers and then periodically does an rpc to dispatch them
+
+#ifdef DEBUG
+#define DEBUG_MINIMAL_STORE
+#endif
+
+#ifndef MAX_RPCS_IN_FLIGHT
+#define MAX_RPCS_IN_FLIGHT 4096
+#endif
+
+#ifndef MIN_INFLIGHT_BYTES
+#define MIN_INFLIGHT_BYTES (1024L * 1024L) /* always use at least 1MB in flight */
+#endif
+
+namespace upcxx_utils {
+
+template <typename T>
+class BlockDispatcher {
+  // does not allocate or deallocate memory, just handles pointer to it
+  // available are backed blocks that are empty and ready to be consumed
+  // promises is a strictly FIFO queue of blocks that will be fulfilled by a backed block
+  //  pushing / popping from this queue is a signal for asynchronous processing at a later time when resources permit
+  // all methods are non-blocking, returning futures
+  // push() will fulfill outstanding promises before returning a block to the available heap
+  // only pop() can grow promises beyond a fixed size
+  // all blocks must be returned to available, with no promises before clear() can be called
+ public:
+  using ptr_t = T;
+  using block_t = pair<ptr_t, size_t>;
+  using future_block_t = upcxx::future<block_t>;
+  using promise_block_t = ActiveInstantiationTimer<upcxx::promise<block_t> >;
+
+  using blocks_t = vector<block_t>;
+  using reservation_t = shared_ptr<blocks_t>;
+  using promise_reservation_t = ActiveInstantiationTimer<upcxx::promise<reservation_t> >;
+
+  using promise_blocks_t = list<promise_block_t>;
+  using promise_reservations_t = list<promise_reservation_t>;
+
+ private:
+  block_t backing;
+  size_t count_per_block, num_blocks, reservation_size;
+  blocks_t available, reservation;
+  promise_blocks_t promised_blocks;  // may grow indefinitely, but elements are small
+  ActiveCountTimer promise_block_count_timer;
+  size_t promised_blocks_count;
+  promise_reservations_t promised_reservations;  // may grow indefinitely, but elements are small
+  ActiveCountTimer promise_reservation_count_timer;
+  size_t promised_reservations_count;
+
+ protected:
+  // drains the current reservation and returns the contents
+  reservation_t claim_reservation(bool require_full = true) {
+    // per the original note, callers already hold any lock that is needed; none is taken here
+    const bool is_full = reservation.size() == reservation_size;
+    if (require_full && !is_full) DIE("claim_reservation is not full but this is required. ", reservation.size(), "\n");
+    // hand the current reservation over to the caller, leaving an empty pre-sized vector behind
+    reservation_t claimed = make_shared<blocks_t>();
+    claimed->reserve(reservation_size);
+    claimed->swap(reservation);
+    assert(reservation.empty());
+    // top the reservation back up from the tail of the available set, as far as that goes
+    for (; reservation.size() < reservation_size && !available.empty(); available.pop_back()) {
+      reservation.push_back(available.back());
+    }
+    assert(claimed);
+    return claimed;
+  }
+
+ public:
+  string description;
+  BlockDispatcher(const string description)  // builds an empty (not yet valid) dispatcher; set() supplies the backing later
+      : backing({})
+      , count_per_block(0)
+      , num_blocks(0)
+      , reservation_size(0)
+      , available()
+      , reservation()
+      , promised_blocks()
+      , promise_block_count_timer(description + "-promised_blocks")  // timer labels are derived from the constructor argument
+      , promised_blocks_count(0)
+      , promised_reservations()
+      , promise_reservation_count_timer(description + "-promised_reservations")
+      , promised_reservations_count(0)
+      , description(description) {}  // member initialized from the same-named parameter
+  BlockDispatcher(const BlockDispatcher &) = delete;  // not copyable
+  BlockDispatcher(BlockDispatcher &&) = default;  // movable (member-wise)
+  virtual ~BlockDispatcher() { clear(); }  // clear() DIEs on outstanding promises or missing blocks, and calls barrier()
+
+  // a valid dispatcher has an allocation and blocks
+  bool valid() const {
+    const bool has_backing = backing.first && backing.second > 0;
+    const bool has_blocks = num_blocks > 0 && count_per_block > 1;  // NOTE(review): needs > 1 element per block - confirm
+    return has_backing && has_blocks && num_blocks * count_per_block <= backing.second;  // blocks must fit in the backing
+  }
+
+  // checks if a block is backed by the allocation
+  bool is_backed(block_t &block) const {
+    if (!valid() || count_per_block == 0 || !block.first) return false;  // unusable dispatcher or null block pointer
+    return (backing.first <= block.first) && (backing.first + backing.second >= block.first + count_per_block);  // fits in backing
+  }
+
+  // accepts a large block that will be divided into regular num blocks of count elements for dispatching
+  // all blocks will be put in the available heap and have a zero count
+  void set(block_t allocation, size_t num, size_t count, size_t thread_offset = 0, size_t reservation_count = 0) {
+    if (valid()) DIE("set called on an already valid dispatcher.  clear() MUST be called first\n");
+    if (!available.empty()) DIE("set called with non-empty available heap\n");
+    if (!promised_blocks.empty()) DIE("set called with non-empty promised_blocks queue\n");
+    if (!promised_reservations.empty()) DIE("set called with non-empty promised_reservations queue\n");
+    if (num != 0 && (!allocation.first || allocation.second <= 0)) DIE("set called with null backing\n");
+    if (num * (count + thread_offset) != allocation.second)  // each block occupies count + thread_offset elements
+      DIE("set called with an incorrectly sized allocated_backing: ", allocation.second, ", blocks ", num, ", count per ", count,
+          " and ", thread_offset, " thread_offset\n");
+    if (reservation_count > num / 2) DIE("reservation_count=", reservation_count, " can not fit within num=", num, "\n");
+    clear();  // empties the containers and zeroes the counters (and calls barrier()) before the new geometry is recorded
+    num_blocks = num;
+    count_per_block = count;
+    reservation_size = reservation_count;
+
+    backing = allocation;
+    available.reserve(num_blocks);
+    reservation.reserve(reservation_size);
+    promised_blocks_count = 0;
+    promised_reservations_count = 0;
+    for (size_t i = 0; i < num_blocks; i++) {
+      block_t tmp(backing.first + i * (count_per_block + thread_offset) + thread_offset, 0);  // skip thread_offset in each slot
+      assert(is_backed(tmp));
+      push(tmp);  // routes the block into the reservation or the available heap
+    }
+    if (available.size() + reservation.size() != num_blocks)
+      DIE("Invalid set() - available size is not num_blocks: ", available.size(), " available + ", reservation.size(),
+          " reserved vs ", num_blocks, "\n");
+    if (!promised_blocks.empty()) DIE("Invalid set() - promised_blocks should be empty: ", promised_blocks.size(), "\n");
+    if (!promised_reservations.empty())  // message now names the queue that is actually being checked
+      DIE("Invalid set() - promised_reservations should be empty: ", promised_reservations.size(), "\n");
+    assert(num_blocks == 0 || valid());
+  }
+
+  // clears this dispatcher. Aborts if promises queue is not empty or available is not full
+  void clear() {  // DIEs unless every block has been returned and no promise is outstanding; finishes with barrier()
+    if (!valid()) {
+      assert(available.empty());  // an invalid dispatcher is expected to be completely empty already
+      assert(reservation.empty());
+      assert(promised_blocks.empty());
+      assert(promised_reservations.empty());
+      assert(num_blocks == 0);
+      assert(promised_blocks_count == 0);
+      assert(promised_reservations_count == 0);
+    }
+
+    if (num_blocks) {  // only report timings when the dispatcher actually held blocks
+      promise_block_count_timer.print_reduce_timings();
+      if (reservation_size) {
+        promise_reservation_count_timer.print_reduce_timings();
+      }
+    }
+
+    if (!promised_blocks.empty()) DIE("clear() called with entries in the promised_blocks queue\n");
+    if (!promised_reservations.empty()) DIE("clear() called with entries in the promised_reservations queue\n");
+    if (reservation.size() + available.size() != num_blocks)
+      DIE("clear() called without all blocks returned to available: ", available.size(), " + reserved ", reservation.size(),
+          " expected ", num_blocks, "\n");
+
+    backing = {};  // forget the allocation; this class never frees it
+    available.resize(0);
+    reservation.resize(0);
+    num_blocks = 0;
+    count_per_block = 0;
+    promised_blocks_count = 0;
+    promised_reservations_count = 0;  // note: reservation_size is left as-is; set() overwrites it
+    assert(!valid());
+    barrier();
+  }
+
+  size_t get_count_per_block() const { return count_per_block; }  // elements per block, as recorded by set()
+  size_t get_num_blocks() const { return num_blocks; }  // number of blocks, as recorded by set()
+
+ public:
+  // returns a full reservation of global block that can be used immediately
+  // non blocking
+  upcxx::future<reservation_t> acquire_reservation() {
+    assert(reservation_size > 0);
+    if (reservation_size == 0) DIE("There is no reservation to acquire as reservation_size == 0\n");
+    // a full reservation is handed out immediately; otherwise the request is queued as a promise (no lock is taken here)
+    if (reservation.size() == reservation_size) {
+      reservation_t reserved = claim_reservation();
+      assert(reserved->front().first);
+      assert(reserved->back().first);
+      DBG("acquire_reservation: got immediately:", reserved.get(), " -- ", to_string(), "\n");
+      return make_future(reserved);
+    } else {
+      // add a promise for a reservation
+      DBG("acquire_reservation: issuing a promise -- ", to_string(), "\n");
+      promise_reservation_t res(promise_reservation_count_timer);
+      auto fut = res.get_future();  // must be taken before res is moved into the queue
+      promised_reservations.push_back(std::move(res));
+      promised_reservations_count++;
+      assert(promised_reservations_count == promised_reservations.size());
+      return fut;  // hand back the future, not the moved-from promise
+    }
+  }
+
+  reservation_t acquire_partial_reservation() {
+    if (reservation_size == 0) DIE("There is no reservation to acquire as reservation_size == 0\n");
+    // take whatever the reservation currently holds, even if it is short of reservation_size
+    reservation_t partial = claim_reservation(/*require_full=*/false);
+    assert(partial);
+    DBG("acquire_partial_reservation got one with ", partial->size(), " -- ", to_string(), "\n");
+    return partial;
+  }
+
+  // Gives every block of a previously claimed reservation back to this dispatcher through push()
+  // and then empties the reservation container.
+  void release_reservation(reservation_t reserved) {
+    assert(reserved);
+    DBG("Release_reservation:", reserved.get(), ", size=", reserved->size(), "\n");
+    for (auto it = reserved->begin(); it != reserved->end(); ++it) {
+      // push() invalidates the block it is handed, so work on a copy of the entry
+      auto returned = *it;
+      assert(returned.first);
+      push(returned);
+      assert(!returned.first);  // invalidated
+    }
+    reserved->clear();
+  }
+
+  // Returns a block to this dispatcher.
+  // The block's count is reset to zero and the block goes to exactly one place, checked in this order:
+  //   1. the reservation, if it is not full
+  //   2. the oldest promised_block (front of the queue), if any promise is waiting
+  //   3. the available heap
+  // On return the caller's block is invalidated (assigned {}) so it can not be reused.
+  // DIEs if the dispatcher is invalid, if the block is not backed by this dispatcher,
+  // or if more blocks end up tracked than num_blocks.
+  void push(block_t &block) {
+    assert(block.first);
+    if (!valid()) DIE("push called on invalid BlockDispatcher!\n");
+    assert(is_backed(block));
+    if (!is_backed(block))
+      DIE("push called on foreign block(", block.first, " ", block.second, "): ", backing.first, " ", backing.second, "\n");
+    assert(promised_blocks_count == promised_blocks.size());
+    block.second = 0;  // reset the count
+
+    // a full reservation may already be waiting for a promise; hand it over first so that
+    // the reservation has room for this block before promised_blocks are considered
+    try_fulfill_promised_reservation();
+
+    if (reservation.size() < reservation_size) {
+      // put into the reservation
+      reservation.push_back(block);
+    } else if (!promised_blocks.empty()) {
+      // deliver this block to the first promised_blocks
+      block_t promised_block = block;  // copy before invalidation below
+      DBG("push fulfilling promised_block: ", promised_block.first, "\n");
+      assert(!promised_blocks.empty());
+      // remove the promise from the queue (and keep the counter in sync) before fulfilling it
+      auto promise_for_block = std::move(promised_blocks.front());
+      promised_blocks.pop_front();
+      promised_blocks_count--;
+      promise_for_block.fulfill_result(promised_block);
+    } else {
+      // put back on available heap
+      available.push_back(block);
+    }
+    block = {};  // invalidate it
+
+    // sanity check: available + reservation can never hold more blocks than were configured
+    if (available.size() + reservation.size() > num_blocks)
+      DIE("push added too many blocks: ", available.size(), " + ", reservation.size(), " vs ", num_blocks, "\n");
+
+    // a reservation may now also be ready
+    try_fulfill_promised_reservation();
+  }
+
+  // Fulfills the oldest promised reservation when one is waiting and the reservation is full.
+  // Does nothing otherwise.
+  void try_fulfill_promised_reservation() {
+    const bool can_fulfill =
+        reservation_size > 0 && !promised_reservations.empty() && reservation.size() == reservation_size;
+    if (!can_fulfill) return;
+
+    // claim the full reservation, then hand it to the promise at the front of the queue
+    auto full_reservation = claim_reservation();
+    assert(full_reservation);
+    assert(full_reservation->front().first);
+    assert(full_reservation->back().first);
+    assert(full_reservation->size() == reservation_size);
+
+    auto waiting_promise = std::move(promised_reservations.front());
+    promised_reservations.pop_front();
+    promised_reservations_count--;
+    assert(promised_reservations_count == promised_reservations.size());
+
+    DBG("fulfilling promised_reservation: ", full_reservation.get(), "\n");
+    waiting_promise.fulfill_result(full_reservation);
+  }
+
+  // Returns a future block, trying these sources in order:
+  //   1. the available heap        -> the future is made directly from the block
+  //   2. the reservation, only when from_reservation is true and it holds at least one block
+  //   3. a new promise appended to promised_blocks -> the future of that promise
+  // DIEs if the dispatcher is invalid.
+  future_block_t pop(bool from_reservation = false) {
+    if (!valid()) DIE("pop_available called on invalid BlockDispatcher!\n");
+
+    if (!available.empty()) {
+      block_t popped = available.back();
+      available.pop_back();
+      assert(is_backed(popped));
+      assert(popped.second == 0);
+      return to_future(popped);
+    }
+
+    if (from_reservation && reservation.size() > 0) {
+      // nothing is available but the caller is allowed to dip into the reservation
+      block_t borrowed = reservation.back();
+      reservation.pop_back();
+      assert(is_backed(borrowed));
+      assert(borrowed.second == 0);
+      return to_future(borrowed);
+    }
+
+    // no block can be handed out now: track a promise and return its future
+    promise_block_t pending(promise_block_count_timer);
+    future_block_t pending_future = pending.get_future();
+    promised_blocks.push_back(std::move(pending));
+    promised_blocks_count++;
+    return pending_future;
+  }
+
+  // const status accessors
+
+  // number of blocks currently held in the reservation
+  size_t reserved_size() const {
+    return reservation.size();
+  }
+
+  // number of blocks currently in the available heap
+  size_t available_size() const {
+    return available.size();
+  }
+
+  // true when the available heap holds no blocks
+  bool available_empty() const {
+    return available.empty();
+  }
+
+  // true when neither promise queue has entries AND every one of the num_blocks blocks
+  // is currently held in either the available heap or the reservation
+  bool empty() const {
+    if (!promised_reservations.empty()) return false;
+    if (!promised_blocks.empty()) return false;
+    return available_size() + reservation.size() == num_blocks;
+  }
+
+  // total number of outstanding promises: promised reservations plus promised blocks
+  size_t promises_size() const {
+    return promised_reservations_count + promised_blocks_count;
+  }
+
+  // true when there is no outstanding promise of either kind
+  bool promises_empty() const {
+    if (!promised_reservations.empty()) return false;
+    return promised_blocks.empty();
+  }
+
+  // one-line summary of this dispatcher's state, for debug output
+  string to_string() const {
+    // the cached counters must always mirror the queue sizes
+    assert(promised_reservations_count == promised_reservations.size());
+    assert(promised_blocks_count == promised_blocks.size());
+
+    ostringstream summary;
+    summary << description << "-"
+            << "BlockDispatcher(backing=" << backing.first
+            << ",count=" << count_per_block << ",num=" << num_blocks
+            << ",avail=" << available.size()
+            << ",promised_reservations=" << promised_reservations_count
+            << ",promised_blocks=" << promised_blocks_count << ")";
+    return summary.str();
+  }
+};
+
+// Keeps the acknowledgment futures of RPCs that have been sent, together with send / return counters
+// and three ActiveCountTimers that callers can attach to the RPCs they issue.
+// Only the interface is declared here; the non-inline members are defined outside this class body.
+// Not copyable; movable.
+class TrackRPCs {
+ public:
+  using future_ack_t = upcxx::future<>;
+  using rpc_acks_t = list<future_ack_t>;
+
+  // description_ is stored as this tracker's description and is also passed to its ProgressTimer
+  TrackRPCs(const string description_)
+      : rpcs_in_flight()
+      , sent_rpcs(0)
+      , returned_rpcs(0)
+      , rpc_timer()
+      , rpc_inner_timer()
+      , rpc_relay_timer()
+      , description(description_)
+      , t_prog(description_) {}
+  TrackRPCs(const TrackRPCs &) = delete;
+  TrackRPCs(TrackRPCs &&) = default;
+  // destruction calls clear(), which per its contract below requires all futures to have completed
+  virtual ~TrackRPCs() { clear(); }
+
+  // NOTE(review): presumably true when no rpc acknowledgments are being tracked -- confirm in the definition
+  bool empty() const;
+
+  // frees memory.  Can only be called when all futures have completed
+  void clear();
+
+  // track an rpc acknowledgment
+  void push(future_ack_t fut);
+
+  // tests all rpcs and returns the remaining count
+  // if ready, wait on it and remove, otherwise count it
+  size_t pop_finished();
+
+  // NOTE(review): presumably the number of tracked rpcs that have not completed yet -- confirm in the definition
+  size_t count_pending();
+
+  // NOTE(review): presumably makes progress until at most max_pending rpcs remain -- confirm in the definition
+  void flush(size_t max_pending = 0);
+
+  // string form of this tracker (defined elsewhere)
+  string to_string() const;
+
+  // mutable access to the timers so that callers can time the rpcs they send, run and relay
+  inline ActiveCountTimer &get_rpc_timer() { return rpc_timer; }
+  inline ActiveCountTimer &get_rpc_inner_timer() { return rpc_inner_timer; }
+  inline ActiveCountTimer &get_rpc_relay_timer() { return rpc_relay_timer; }
+
+ protected:
+  rpc_acks_t rpcs_in_flight;        // acknowledgment futures being tracked
+  size_t sent_rpcs, returned_rpcs;  // counters, both start at 0
+  ActiveCountTimer rpc_timer, rpc_inner_timer, rpc_relay_timer;
+  string description;
+  ProgressTimer t_prog;
+};
+
+// A fixed pool of global-memory blocks of T used for aggregating and transferring data.
+// One upcxx::new_array<T> allocation (global_backing) is given to a BlockDispatcher (global_dispatcher)
+// which tracks the individual blocks.  The blocks are used
+//   - as per-destination aggregation buffers (dest_stores), and
+//   - as the source / destination of rput_block() and rget_block() transfers.
+// set_dest_stores(), set_fixed_mem() and clear() call barrier(), so presumably all ranks must call them together.
+// Not copyable; movable.
+template <typename T>
+class FixedMemoryRPC {
+ public:
+  // for global data on this node
+  using global_block_dispatch_t = BlockDispatcher<global_ptr<T> >;
+  using global_block_t = typename global_block_dispatch_t::block_t;
+  using future_global_block_t = typename global_block_dispatch_t::future_block_t;
+  using global_store_t = typename global_block_dispatch_t::blocks_t;
+  using future_src_dest_block_t = upcxx::future<global_block_t, global_block_t>;
+  using global_reservation_t = typename global_block_dispatch_t::reservation_t;
+  using inst_timer_t = GenericInstantiationTimer;
+
+ private:
+  // backing and dispatchers
+  global_block_t global_backing;  // for dest stores and sourcing rgets
+  global_block_dispatch_t global_dispatcher;
+
+  global_store_t dest_stores;  // a block of data for aggregation to each destination
+  size_t thread_offset;        // # of T elements in front of each block that hold thread numbers (0 if unused)
+  ActiveCountTimer rput_timer, rget_timer, rget_wait_timer;
+  ProgressTimer t_prog;
+  string description;
+
+ public:
+  // creates an empty (not yet valid) instance; call set_fixed_mem() to allocate and configure it
+  FixedMemoryRPC(const string description)
+      : global_backing({})
+      , global_dispatcher(description + string("-global-dispatcher"))
+      , dest_stores()
+      , thread_offset(0)
+      , rput_timer()
+      , rget_timer()
+      , rget_wait_timer()
+      , t_prog(description)
+      , description(description) {}
+  FixedMemoryRPC(const FixedMemoryRPC &) = delete;
+  FixedMemoryRPC(FixedMemoryRPC &&) = default;
+  virtual ~FixedMemoryRPC() { clear(); }
+
+  // true when the dispatcher's count_per_block and the backing allocation agree:
+  //   count_per_block == 1 requires that there is no backing allocation
+  //   count_per_block  > 1 requires a non-null backing allocation with a positive count
+  // every other state (e.g. count_per_block == 0 after the dispatcher was cleared) is invalid
+  bool valid() const {
+    const size_t count_per_block = global_dispatcher.get_count_per_block();
+    if (count_per_block == 1) return !global_backing.first;
+    return count_per_block > 1 && global_backing.first && global_backing.second > 0;
+  }
+
+  inline void my_progress() { t_prog.progress(); }
+
+  // Pops num_stores blocks from the dispatcher into dest_stores (only when count_per_block > 1).
+  // All of the blocks must be immediately available -- this never waits on a promised block.
+  // DIEs if fewer than num_stores blocks are available.
+  // we may have no intra dispatchers if there is 1 thread per node
+  // we may have no inter dispatchers if there is just 1 node
+  void set_dest_stores(size_t num_stores) {
+    DBG("FixedMemoryRPC - ", description, "::set_dest_stores(num_stores=", num_stores,
+        ") global_blocks=", global_dispatcher.get_num_blocks(), " count=", global_dispatcher.get_count_per_block(),
+        " avail=", global_dispatcher.available_size(), "\n");
+    assert(valid());
+    if (global_dispatcher.get_count_per_block() > 1) {
+      // require split_rank::split_local_team().rank_n() - 1 available blocks at this point (no blocking!)
+      if (global_dispatcher.available_size() < num_stores) {
+        DIE("There are an insufficient number of available blocks to populate the dest stores: available_blocks=",
+            global_dispatcher.available_size(), ", num_stores=", num_stores, "\n");
+      }
+      dest_stores.reserve(num_stores);
+      for (size_t i = 0; i < num_stores; i++) {
+        auto fut = global_dispatcher.pop();
+        if (!fut.is_ready()) {
+          DIE("Detected a global block that is not ready! i=", i, " available_size=", global_dispatcher.available_size(), "\n");
+        }
+        dest_stores.push_back(fut.wait());
+        assert(dest_stores.back().first);
+      }
+    }
+    assert(valid());
+    barrier();  // required so that no other global_dispatcher.pop() happens before dest_stores are filled
+  }
+
+  // Pushes every dest store back to the dispatcher and empties dest_stores.
+  // DIEs if any dest store still contains elements.
+  void clear_dest_stores() {
+    DBG("FixedMemoryRPC - ", description, " clear_dest_stores:", dest_stores.size(), "\n");
+    if (global_dispatcher.get_num_blocks() == 0) {
+      assert(dest_stores.empty());
+    } else {
+      assert(valid());
+      // iterate by value: push() invalidates the block it is handed
+      for (auto s : dest_stores) {
+        if (s.second > 0) DIE("Can not clear_dest_stores if they are not empty!\n");
+        global_dispatcher.push(s);
+      }
+      dest_stores.resize(0);
+      assert(valid());
+    }
+  }
+
+  // number of dest stores that currently hold no elements
+  size_t count_empty_dest_stores() const {
+    size_t empty = 0;
+    for (const auto &s : dest_stores) {
+      if (s.second == 0) empty++;
+    }
+    return empty;
+  }
+
+  // only true when the global dispatcher is empty itself and
+  // available + reserved + empty dest stores accounts for every block of the dispatcher
+  bool empty() const {
+    bool is_empty = global_dispatcher.empty() &&
+                    global_dispatcher.available_size() + global_dispatcher.reserved_size() + count_empty_dest_stores() ==
+                        global_dispatcher.get_num_blocks();
+    // DBG("FixedMemoryRPC::empty(): ", (is_empty?"True":"False"), ", global.empty()=", (global_dispatcher.empty()?"True":"False"),
+    // "\n");
+    return is_empty;
+  }
+
+  // (Re)allocates the backing memory, configures the dispatcher and populates the dest stores.
+  //   num_global_blocks      number of blocks to allocate; 0 leaves this instance empty (count_per_block must be 1)
+  //   count_per_block        number of T elements per block
+  //   num_stores             number of dest stores to populate
+  //   includes_thread_offset when true, extra room for count_per_block thread_num_t values precedes each block
+  //   num_reserved_blocks    number of blocks the dispatcher keeps as its reservation
+  // DIEs unless num_global_blocks > num_stores + num_reserved_blocks (when num_global_blocks > 0).
+  void set_fixed_mem(size_t num_global_blocks, size_t count_per_block, size_t num_stores, bool includes_thread_offset = false,
+                     size_t num_reserved_blocks = 0) {
+    // clear() returns the dest stores, clears the dispatcher and frees the old backing -- in that order.
+    // The dispatcher must NOT be cleared before that: with dest stores outstanding it would DIE, and
+    // otherwise it would make valid() false so that clear() returns early and leaks global_backing.
+    clear();
+
+    if (num_global_blocks == 0) {
+      assert(count_per_block == 1);
+      SOUT("Using empty FixedMemoryRPC\n");
+      return;
+    }
+
+    if (num_global_blocks <= num_stores + num_reserved_blocks) {
+      DIE("Invalid set_fixed_mem num_global_blocks=", num_global_blocks, " num_stores=", num_stores,
+          " num_reserved_blocks=", num_reserved_blocks, "\n");
+    }
+
+    size_t global_count = num_global_blocks * count_per_block;
+
+    SOUT("Allocating ", description, " dispatchers: global_count=", global_count, " ", get_size_str(global_count * sizeof(T)),
+         "\n");
+
+    // number of T elements needed to hold count_per_block thread numbers (rounded up)
+    thread_offset = includes_thread_offset ? (sizeof(thread_num_t) * count_per_block + sizeof(T) - 1) / sizeof(T) : 0;
+
+    // allocate thread_offset more elements and start the block that many into the actual allocation
+    global_backing.second = global_count + (thread_offset * num_global_blocks);
+    global_backing.first = upcxx::new_array<T>(global_backing.second);
+    size_t global_num = global_backing.second / (count_per_block + thread_offset);
+    assert(global_num == num_global_blocks);
+    global_dispatcher.set(global_backing, global_num, count_per_block, thread_offset, num_reserved_blocks);
+
+    SOUT("finished allocating ", description, " dispatchers\n");
+
+    set_dest_stores(num_stores);
+
+    size_t total_global_bytes = sizeof(T) * global_backing.second;
+    size_t total_bytes = total_global_bytes;
+    SOUT("Using ", num_global_blocks, " global of ", count_per_block, " elements (of ", get_size_str(sizeof(T)),
+         ") aggregate & send (", get_size_str(total_global_bytes), ") per thread or ",
+         get_size_str(total_bytes * upcxx::local_team().rank_n()), " per node in shared memory for dest and send buffers\n");
+    assert(valid());
+  }
+
+  // frees memory.  Can only be called when all futures have completed
+  // does nothing when this instance is not valid (nothing was allocated)
+  void clear() {
+    if (!valid()) {
+      assert(!global_dispatcher.valid());
+      assert(dest_stores.empty());
+      return;
+    }
+    // the dest stores must go back to the dispatcher before it can be cleared
+    clear_dest_stores();
+    t_prog.print_out();
+    rget_timer.print_reduce_timings(description + "-rget");
+    rget_wait_timer.print_reduce_timings(description + "-rget-wait");
+    rput_timer.print_reduce_timings(description + "-rput");
+
+    // deallocate global_dispatcher
+    global_dispatcher.clear();
+    upcxx::delete_array(global_backing.first);
+    global_backing = {};
+    assert(!valid());
+    barrier();
+  }
+
+  inline bool has_dest_stores() const { return dest_stores.size() > 0; }
+
+  // returns a reference to the dest store at store_idx; DIEs when store_idx is out of range
+  global_block_t &dest_store(size_t store_idx) {
+    if (store_idx >= dest_stores.size()) DBG("getting ", description, " dest_store store_idx=", store_idx, "\n");
+    assert(store_idx < dest_stores.size());
+    if (dest_stores.size() <= store_idx)
+      DIE("There are no dest stores at the moment:", dest_stores.size(), " looking for ", store_idx, "\n");
+    global_block_t &gblock = dest_stores[store_idx];
+    return gblock;
+  }
+
+  // push a global block back to the dispatcher
+  // the block must live on this rank; it is invalidated on return
+  void push_global(global_block_t &gblock) {
+    // DBG("FixedSize::push_global: ", gblock.first, "\n");
+    if (!valid()) DIE("push called on an invalid FixedMemoryRPC!\n");
+    assert(gblock.first);
+    assert(gblock.first.where() == rank_me());
+    global_dispatcher.push(gblock);
+    assert(!gblock.first);  // is invalidated
+  }
+
+  // pops a future global block from the dispatcher
+  future_global_block_t pop_global(bool from_reservation = false) {
+    // DBG("pop_global\n");
+    if (!valid()) DIE("pop_global called on invalid FixedMemoryRPC!\n");
+    return global_dispatcher.pop(from_reservation);
+  }
+
+  inline upcxx::future<global_reservation_t> acquire_reservation() { return global_dispatcher.acquire_reservation(); }
+
+  inline global_reservation_t acquire_partial_reservation() { return global_dispatcher.acquire_partial_reservation(); }
+
+  inline void release_reservation(global_reservation_t reserved) { global_dispatcher.release_reservation(reserved); }
+
+  inline bool has_promises() const { return !global_dispatcher.promises_empty(); }
+
+  inline size_t global_available_size() const { return global_dispatcher.available_size(); }
+
+  inline size_t global_reserved_size() const { return global_dispatcher.reserved_size(); }
+
+  // Validates a transfer of src into dest and gives dest the element count of src.
+  // Returns the number of T elements in front of the block pointer that must also be transferred
+  // because they hold the thread numbers of the elements (0 when thread_offset == 0).
+  // DIEs when src is empty or dest is not empty.
+  size_t _prep_xfer(global_block_t &src, global_block_t &dest) {
+    if (src.second == 0) DIE(__func__, " Invalid state - src block is EMPTY\n");
+    if (dest.second > 0) DIE(__func__, " Invalid state - dest is not empty: ", dest.second, "\n");
+    assert(global_dispatcher.get_count_per_block() >= src.second);
+    // dest will have src's size
+    dest.second = src.second;
+    size_t send_offset = 0;
+    if (thread_offset > 0) {
+      // blocks start inside of the allocation so the thread_num descends and the element ascends from the pointer
+      send_offset = (dest.second * sizeof(thread_num_t) + sizeof(T) - 1) / sizeof(T);
+    }
+    assert(send_offset <= thread_offset);
+    return send_offset;
+  }
+
+  // starts an rput of local source to remote dest
+  // creates a future of the same src and dest blocks that is ready once the rput has completed
+  // (the returned src has a zero count to signal that it is drained)
+  // invalidates both inputs: src and dest
+  // non-blocking
+  future_src_dest_block_t rput_block(global_block_t &src, global_block_t &dest) {
+    DBG("rput_block( src ", src.first, ", dest: ", dest.first, ", ", src.second, ")\n");
+    size_t send_offset = _prep_xfer(src, dest);
+    auto rput_t = make_shared<inst_timer_t>(rput_timer);
+
+    // perform the rput, including the send_offset elements in front of the block pointers
+    assert(src.first.is_local());
+    assert(dest.second == src.second);
+    auto rput_fut = rput(src.first.local() - send_offset, dest.first - send_offset, dest.second + send_offset);
+    src.second = 0;  // signal it is drained
+    auto fut_return = when_all(make_future(src, dest), rput_fut);
+
+    // prevent reuse
+    src = {};
+    dest = {};
+
+    // rput_t is captured so that the timer lives until the transfer has completed
+    return fut_return.then([rput_t, send_offset](global_block_t src, global_block_t dest) {
+      size_t count = dest.second + send_offset;
+      DBG("rput completed ", dest.second, " elements with ", send_offset, " extra (", get_size_str(count * sizeof(T)),
+          ") src=", src.first, " dest=", dest.first, " in ", rput_t->get_elapsed_since_start(), " s, ",
+          get_size_str(count * sizeof(T) / rput_t->get_elapsed_since_start()), " / s\n");
+      return make_future(src, dest);
+    });
+  }
+
+  // starts an rget of the global block copied to the local block
+  // creates a future of the same global and local blocks once the rget has completed
+  // invalidates both inputs: src and dest
+  // non-blocking
+  future_src_dest_block_t rget_block(global_block_t &src, global_block_t &dest) {
+    DBG("rget_block( src ", src.first, ", dest: ", dest.first, ", ", src.second, ")\n");
+    size_t send_offset = _prep_xfer(src, dest);
+    auto rget_t = make_shared<inst_timer_t>(rget_timer);
+
+    // perform the rget, including the send_offset elements in front of the block pointers
+    assert(dest.first.is_local());
+    assert(dest.second == src.second);
+    auto rget_fut = rget(src.first - send_offset, dest.first.local() - send_offset, dest.second + send_offset);
+    src.second = 0;  // signal it is drained
+    auto fut_return = when_all(make_future(src, dest), rget_fut);
+
+    // prevent reuse
+    dest = {};
+    src = {};
+
+    // rget_t is captured so that the timer lives until the transfer has completed
+    return fut_return.then([rget_t, send_offset](global_block_t src, global_block_t dest) {
+      size_t count = dest.second + send_offset;
+      DBG("rget completed ", dest.second, " elements with ", send_offset, " extra (", get_size_str(count * sizeof(T)),
+          ") src=", src.first, " dest=", dest.first, " in ", rget_t->get_elapsed_since_start(), " s, ",
+          get_size_str(count * sizeof(T) / rget_t->get_elapsed_since_start()), " / s\n");
+      return make_future(src, dest);
+    });
+  }
+
+  // rget_block overload that pops a new block from the dispatcher (possibly from the reservation) for dest
+  // and starts the rget once that block is ready
+  // the lambda captures this, so this instance must outlive the returned future
+  future_src_dest_block_t rget_block(global_block_t &gblock) {
+    // get a future block_t
+    auto rget_wait_t = make_shared<inst_timer_t>(rget_wait_timer);
+    auto fut_loc = pop_global(true);  // allow extraction from reservation
+    auto fut_both = when_all(to_future(gblock), fut_loc);
+    // rget the block
+    auto fut_blocks = fut_both
+                          .then([rget_wait_t](global_block_t src, global_block_t dest) {
+                            // just stop the timer
+                            return make_future(src, dest);
+                          })
+                          .then([this](global_block_t src, global_block_t dest) { return this->rget_block(src, dest); });
+    return fut_blocks;
+  }
+
+  inline size_t get_count_per_block() const { return global_dispatcher.get_count_per_block(); }
+
+  inline size_t get_thread_offset() const { return thread_offset; }
+
+  // one-line summary of this instance, for debug output
+  string to_string() const {
+    ostringstream os;
+    os << description << "-";
+    os << "FixedMemoryRPC(";
+    os << "thread_offset=" << thread_offset;
+    os << ",global_back=" << global_backing.first << "," << global_backing.second;
+    os << ",global_dispatch=" << global_dispatcher.to_string();
+    os << ")";
+    return os.str();
+  }
+};
+
+template <typename FuncDistObj, typename T>
+class TwoTierAggrStore {
+ private:
+  // T for intra node RPCs
+  using intra_fixed_memory_rpc_t = FixedMemoryRPC<T>;
+  using intra_fixed_memory_t = dist_object<intra_fixed_memory_rpc_t>;
+  using intra_global_ptr_t = global_ptr<T>;
+  using intra_global_block_t = typename intra_fixed_memory_rpc_t::global_block_t;
+  using intra_future_global_block_t = typename intra_fixed_memory_rpc_t::future_global_block_t;
+  using intra_reservation_t = typename intra_fixed_memory_rpc_t::global_reservation_t;
+
+  // For inter-node global,use a more compact array than a pair<Elem, thread_num_t>
+  //  as the pair packs very inefficiently and sends a lot of zeros over the net
+  //  #'s for thread-dest appending descending, E for element appending ascending from the pointer at the first E
+  //  .....4321EEEE.....
+  //  --------->-------- // start of element ptr  == T* (alloc + thread_offset)
+  //  --------<--------- // start of thread_num ptr == ((thread_num_t*) (alloc + thread_offset)) - 1
+  //  only sending the non-zero data in the middle over the wire
+  // thread_offset represents the # of elements from the start of the allocation that the pointer will be at
+
+  using inter_fixed_memory_rpc_t = FixedMemoryRPC<T>;
+  using inter_fixed_memory_t = dist_object<inter_fixed_memory_rpc_t>;
+  using inter_global_ptr_t = global_ptr<T>;
+  using inter_global_block_t = typename inter_fixed_memory_rpc_t::global_block_t;
+  using inter_future_global_block_t = typename inter_fixed_memory_rpc_t::future_global_block_t;
+  using inst_timer_t = GenericInstantiationTimer;
+
+  using track_rpcs_t = dist_object<TrackRPCs>;
+
+  FuncDistObj &func;
+
+  size_t max_store_size;      // the count of T per block (may be 0)
+  size_t max_rpcs_in_flight;  // Limit for the number of rpcs in flight. This limit exists to prevent the dispatch buffers from
+                              // growing indefinitely
+
+  inter_fixed_memory_t inter_fixed_memory_store;
+  intra_fixed_memory_t intra_fixed_memory_store;
+  track_rpcs_t track_inter_rpcs, track_intra_rpcs;
+  ProgressTimer t_prog;
+  // accessor for the function-local static timer that process_local() starts and stops
+  static IntermittentTimer &t_process_local() {
+    static IntermittentTimer process_local_timer(string("process_local()"));
+    return process_local_timer;
+  }
+
+  // private static methods
+
+  // processes a batch of data that is local (must be intra)
+  // applies the functor held by func to each of the count elements starting at elem, in order
+  // the time spent is accumulated in t_process_local()
+  static void process_local(T *elem, size_t count, FuncDistObj &func) {
+    assert(elem);
+    assert(count > 0);
+    t_process_local().start();
+    // bind to the functor held by func instead of copying it for every batch:
+    // any state the functor updates is then kept, exactly as when it is invoked as (*func)(elem)
+    auto &&func_inst = *func;
+    for (size_t i = 0; i < count; i++) {
+      func_inst(elem[i]);
+    }
+    t_process_local().stop();
+  }
+
+  // processes every element of a block that is addressable on this rank
+  // (forwards the block's local pointer and its count to the pointer overload)
+  static void process_local(intra_global_block_t lblock, FuncDistObj &func) {
+    auto &gptr = lblock.first;
+    assert(gptr);
+    assert(gptr.is_local());
+    T *elems = gptr.local();
+    process_local(elems, lblock.second, func);
+  }
+
+  // static version of progress that does NOT call upcxx::progress()
+  // it only pops the finished rpcs from track_rpcs and returns how many remain
+  static size_t my_partial_progress(track_rpcs_t &track_rpcs) {
+    const size_t still_pending = track_rpcs->pop_finished();
+    assert(still_pending == track_rpcs->count_pending());
+    return still_pending;
+  }
+
+  bool my_progress_is_required;
+  // mutable access to the flag that calc_my_progress_required() maintains
+  bool &my_progress_required() {
+    return my_progress_is_required;
+  }
+  // Recomputes my_progress_is_required and returns it.  Progress is required when any of these hold:
+  //   - either fixed memory store has outstanding promises
+  //   - fewer inter inner rpcs (receiving) than inter rpcs (sending) have been counted (total + active)
+  //   - more than 2 * num_nodes inter inner rpcs are active
+  //   - more than 2 * num_threads intra inner rpcs are active
+  // The conditions are evaluated in this order and evaluation stops at the first one that holds.
+  bool calc_my_progress_required() {
+    // some promises exist
+    bool required = inter_fixed_memory_store->has_promises() || intra_fixed_memory_store->has_promises();
+    if (!required) {
+      // inter inner rpcs (receiving) is less than rpcs (sending)
+      required = track_inter_rpcs->get_rpc_inner_timer().get_total_count() +
+                     track_inter_rpcs->get_rpc_inner_timer().get_active_count() <
+                 track_inter_rpcs->get_rpc_timer().get_total_count() + track_inter_rpcs->get_rpc_timer().get_active_count();
+    }
+    if (!required) {
+      // there are more active inter rpcs requiring my progress than there are nodes.  Get them completed
+      required = track_inter_rpcs->get_rpc_inner_timer().get_active_count() > 2 * split_rank::num_nodes();
+    }
+    if (!required) {
+      // there are more active intra rpcs requiring my progress than there are threads in a node.  Get them completed
+      required = track_intra_rpcs->get_rpc_inner_timer().get_active_count() > 2 * split_rank::num_threads();
+    }
+    my_progress_is_required = required;
+    // DBG(__func__, ": ", my_progress_is_required, "\n");
+    return my_progress_is_required;
+  }
+
+  // makes progress through t_prog, refreshes my_progress_is_required and pops the finished rpcs
+  // returns the number of rpcs (inter + intra) that are still pending
+  size_t my_progress() {
+    // DBG(__func__, " my_progress_is_required=", my_progress_is_required, " -- ", to_string(), "\n");
+    t_prog.progress();
+    calc_my_progress_required();
+    const size_t pending_inter = my_partial_progress(track_inter_rpcs);
+    const size_t pending_intra = my_partial_progress(track_intra_rpcs);
+    return pending_inter + pending_intra;
+  }
+
+  // keeps making progress until fewer than max_rpcs_in_flight rpcs are pending
+  // the StallTimer is checked after every round that is still at or over the limit
+  void wait_max_rpcs() {
+    StallTimer is_stalled(description + string("-wait_max_rpcs"));
+    for (;;) {
+      if (my_progress() < max_rpcs_in_flight) break;
+      is_stalled.check();
+    }
+  }
+
+  // sends one element on its own via rpc, bypassing all blocks
+  // the target applies its functor to the element; the acknowledgment is tracked with the inter rpcs
+  void send_rpc1(intrank_t target_rank, const T &elem) {
+    track_inter_rpcs->push(rpc(
+        target_rank, [](T sent_elem, FuncDistObj &target_func) { (*target_func)(sent_elem); }, elem, func));
+  }
+
+  // returns a reference to the thread number of entry idx of a block that has a thread_offset
+  // thread numbers are stored in front of the block pointer, descending: entry 0 is the
+  // thread_num_t directly before block, entry 1 the one before that, ...
+  static inline thread_num_t &get_thread_from_block(T *block, int idx) {
+    assert(idx >= 0);
+    thread_num_t *first_thread = ((thread_num_t *)block) - 1;
+    return *(first_thread - idx);
+  }
+
+  // Partitions the entries low..high of block by thread number, using the thread of the
+  // last entry (high) as the pivot:
+  //   entries with a thread smaller than the pivot end up to the left of the pivot,
+  //   all other entries end up to its right.
+  // Elements and their thread numbers are always swapped together.
+  // Returns the final index of the pivot entry.
+  static int block_quicksort_partition(T *block, int low, int high) {
+    const thread_num_t pivot_thread = get_thread_from_block(block, high);
+    int next_smaller = low;  // where the next entry with a thread smaller than the pivot goes
+
+    for (int scan = low; scan < high; scan++) {
+      if (!(get_thread_from_block(block, scan) < pivot_thread)) continue;
+      const int slot = next_smaller++;
+      assert(slot >= low);
+      assert(scan >= low);
+      assert(slot < high);
+      assert(scan < high);
+      if (slot != scan) {
+        assert(slot < scan);
+        std::swap(block[slot], block[scan]);
+        std::swap(get_thread_from_block(block, slot), get_thread_from_block(block, scan));
+      }
+      assert(get_thread_from_block(block, slot) < pivot_thread);
+    }
+
+    // move the pivot entry between the two partitions
+    assert(next_smaller >= low);
+    if (next_smaller != high) {
+      assert(next_smaller < high);
+      std::swap(block[next_smaller], block[high]);
+      std::swap(get_thread_from_block(block, next_smaller), get_thread_from_block(block, high));
+    }
+    return next_smaller;
+  }
+
+  // Sorts the entries low..high (both inclusive) of block by thread number using quicksort.
+  // low --> Starting index,
+  // high --> Ending index
+  // Only the smaller partition is sorted recursively; the larger one is handled by the loop.
+  // That bounds the recursion depth logarithmically, even when many entries share the same
+  // thread number and the partitions therefore become very uneven.
+  static void block_quicksort(T *block, int low, int high) {
+    while (low < high) {
+      // pi is the partitioning index, the entry at pi is now at its final place
+      int pi = block_quicksort_partition(block, low, high);
+
+      if (pi - low < high - pi) {
+        // left partition is the smaller one
+        block_quicksort(block, low, pi - 1);
+        low = pi + 1;
+      } else {
+        // right partition is the smaller one (or both are the same size)
+        block_quicksort(block, pi + 1, high);
+        high = pi - 1;
+      }
+    }
+  }
+
+  // returns "virtual" intra blocks based on the underlying gblock for each local thread
+  // some may be empty but there will be one entry for every local thread
+  //
+  // The entries start..gblock.second-1 of gblock are sorted in place by their thread number, so that
+  // all entries of a thread are contiguous.  The returned vector is indexed by thread number:
+  //   .first  points into gblock at the first entry of that thread (null when the thread has no entries)
+  //   .second is the number of entries of that thread
+  // Entries before start are neither sorted nor included.  gblock itself (pointer and count) is not modified.
+  // DIEs if the entries are found to be out of order after sorting.
+  static vector<intra_global_block_t> inter_to_sorted_intra_blocks(inter_global_block_t &gblock, size_t start = 0) {
+    assert(gblock.first);
+    assert(gblock.first.is_local());
+    assert(gblock.second > 0);
+    assert(start <= gblock.second);
+    DBG("Sorting ", gblock.second, " inter entries into intra blocks\n");
+
+    T *block = gblock.first.local();
+    if (gblock.second - start > 1) {  // no need to sort a single entry, right?
+      // block_quicksort takes inclusive int bounds
+      block_quicksort(block, start, gblock.second - 1);
+#ifdef DEBUG
+      /* validate it was indeed sorted */
+      int last_thread = -1;
+      for (size_t idx = start; idx < gblock.second; idx++) {
+        thread_num_t t = get_thread_from_block(gblock.first.local(), idx);
+        assert(last_thread <= t);
+        assert(t >= 0);
+        assert(t < split_rank::num_threads());
+        last_thread = t;
+      }
+#endif
+    }
+    // one (initially empty) virtual block per local thread
+    vector<intra_global_block_t> intra_blocks;
+    intra_blocks.resize(split_rank::num_threads(), {});
+    int last_thread = -1;
+    // find the partitions by thread in the sorted array
+    // TODO there should be a faster way to do this in a long list
+    for (size_t idx = start; idx < gblock.second; idx++) {
+      thread_num_t &thread = get_thread_from_block(block, idx);
+      assert(thread >= 0);
+      assert(thread < split_rank::num_threads());
+      if (last_thread > thread)
+        DIE("inter_to_sorted_intra_blocks did not sort properly. idx=", idx, " gblock.second=", gblock.second,
+            " last_thread=", (int)last_thread, " thread=", (int)thread, "\n");
+      intra_global_block_t &gb = intra_blocks[thread];
+      if (!gb.first) {
+        // first entry seen for this thread: its virtual block starts here
+        assert(gb.second == 0);
+        gb.first = gblock.first + idx;
+      }
+      gb.second++;
+      last_thread = thread;
+    }
+    assert(gblock.first);             // did not modify
+    assert(gblock.first.is_local());  // not modify
+    assert(gblock.second > 0);
+    return intra_blocks;
+  }
+
+  // Relays a local inter block to its intra-node targets; the returned future yields the block with second == 0 (drained).
+  static upcxx::future<inter_global_block_t> inter_intra_inner_rpc_relay(inter_global_block_t lblock, track_rpcs_t &track_inter_rpcs,
+                                                                  track_rpcs_t &track_intra_rpcs,
+                                                                  intra_fixed_memory_t &intra_fixed_mem,
+                                                                  inter_fixed_memory_t &inter_fixed_mem, FuncDistObj &func) {
+    DBG("inter_relay processing inter RPC:", lblock.first, " ", lblock.second, "\n");
+    assert(lblock.first);
+    assert(lblock.first.is_local());
+    assert(lblock.second > 0);
+    inter_global_block_t lblock_consumed = lblock;
+    lblock_consumed.second = 0;
+    upcxx::future<inter_global_block_t> all_rpcs = make_future(lblock_consumed);  // inter type, same as the return value
+    intra_reservation_t reservation = intra_fixed_mem->acquire_partial_reservation();  // may be empty
+    T *elem_ptr = lblock.first.local();
+    thread_num_t *thread_ptr = ((thread_num_t *)elem_ptr) - 1;
+    size_t res_sent = 0;
+    size_t idx = 0;
+    for (; idx < lblock.second; idx++) {
+      if (reservation->empty()) break;             // must resort to plan B
+      T &elem = elem_ptr[idx];                     // element increase up the stack
+      thread_num_t &thread = *(thread_ptr - idx);  // threads increase down the stack; subtract to avoid an unsigned wrap
+      split_rank split = split_rank::from_thread(thread);
+      bool sent_rpc = add_to_dest_store_intranode_nb(split, elem, track_intra_rpcs, intra_fixed_mem, reservation, func);
+      if (sent_rpc) res_sent++;
+    }
+    if (res_sent) DBG("inter_relay res_sent=", res_sent, "\n");
+    if (idx < lblock.second) {
+      // plan B
+      // sort the remaining entries by thread and send intra_rpcs directly using this set of virtual blocks
+      auto inter_intra_rpc_timer = make_shared<inst_timer_t>(track_inter_rpcs->get_rpc_relay_timer());
+      DBG("inter_relay sorting remaining ", lblock.second - idx, " entries for direct intra rpc\n");
+      size_t direct_sent = 0;
+      vector<intra_global_block_t> intra_blocks = inter_to_sorted_intra_blocks(lblock, idx);
+      thread_num_t thread_idx = 0;
+      for (intra_global_block_t intra_block : intra_blocks) {  // by value: the send below resets its argument
+        assert(thread_idx < split_rank::num_threads());
+        if (intra_block.first && intra_block.second > 0) {
+          auto fut_rpc = just_send_intra_rpc_nb(split_rank::get_rank_from_thread(thread_idx), intra_block, intra_fixed_mem,
+                                                track_intra_rpcs, func)
+                             .then([](intra_global_block_t ignored) {});
+          all_rpcs = when_all(all_rpcs, fut_rpc);  // the drained block becomes ready only after this rpc returns
+          direct_sent++;
+        }
+        thread_idx++;
+      }
+      all_rpcs = all_rpcs.then([inter_intra_rpc_timer, direct_sent](inter_global_block_t lblock) {
+        DBG("inter_relay All direct_sent=", direct_sent, " intra rpcs relayed from inter rpc took ",
+            inter_intra_rpc_timer->get_elapsed_since_start(), " s count=", inter_intra_rpc_timer->get_total_count(),
+            " active=", inter_intra_rpc_timer->get_active_count(), "\n");
+        assert(lblock.first);
+        assert(lblock.first.is_local());
+        assert(lblock.second == 0);  // it is drained
+        // stop timer, return the block
+        return lblock;
+      });
+    }
+    intra_fixed_mem->release_reservation(reservation);
+    assert(reservation->empty());  // reservation was drained after release
+    return all_rpcs;
+  }
+
+  // sends an rpc to the corresponding target rank on a different node with the same split_rank::split_local_team().rank_me()
+  // consumes the gblock (the caller's gblock is reset to {} right after the rpc is issued)
+  // returns nothing: the acknowledged gblock is pushed back to inter_fixed_memory_store and the chain is tracked in track_inter_rpcs
+  void send_inter_rpc(intrank_t target_rank, inter_global_block_t &gblock) {
+    assert(!gblock.first.is_null());
+    assert(gblock.first.where() == rank_me());
+    assert(gblock.second > 0);
+
+    DBG("send_inter_rpc(", target_rank, ", gblock=", gblock.first, " size=", gblock.second, "\n");
+    assert(inter_fixed_memory_store->valid());
+    assert(intra_fixed_memory_store->valid());
+
+    // time the round trip (t_rpc is kept alive by the first continuation on the returned future)
+    auto t_rpc = make_shared<inst_timer_t>(track_inter_rpcs->get_rpc_timer());
+
+    auto fut = rpc(
+        target_rank,
+        [](inter_global_block_t gblock, track_rpcs_t &track_inter_rpcs, track_rpcs_t &track_intra_rpcs,
+           intra_fixed_memory_t &intra_fixed_mem, inter_fixed_memory_t &inter_fixed_mem, FuncDistObj &func) {
+          assert(gblock.first.where() != rank_me());  // no local data should be transmitted via RPC
+          assert(split_rank::get_my_node() !=
+                 split_rank::get_node_from_rank(gblock.first.where()));  // no inter-node transfer on the same node
+          DBG("Executing process rpc inter node ", gblock.first, " ", gblock.second, "\n");
+          auto t_inner_rpc = make_shared<inst_timer_t>(track_inter_rpcs->get_rpc_inner_timer());
+
+          // future for both blocks (remote gblock, local lblock copy) after the rget completes
+          auto fut_blocks = inter_fixed_mem->rget_block(gblock);
+
+          // future for lblock after it is consumed: relay it intra-node, then push the drained lblock back
+          auto fut_relay =
+              fut_blocks
+                  .then([&track_inter_rpcs, &track_intra_rpcs, &inter_fixed_mem, &intra_fixed_mem, &func](
+                            inter_global_block_t gblock_IGNORED, inter_global_block_t lblock) {
+                    assert(gblock_IGNORED.first);
+                    assert(gblock_IGNORED.first.where() != rank_me());
+                    assert(gblock_IGNORED.second == 0);  // is drained
+                    return inter_intra_inner_rpc_relay(lblock, track_inter_rpcs, track_intra_rpcs, intra_fixed_mem, inter_fixed_mem,
+                                                       func)
+                        .then([&inter_fixed_mem](inter_global_block_t lblock) {
+                          assert(lblock.first);
+                          assert(lblock.first.is_local());
+                          assert(lblock.second == 0);  // consumed
+                          inter_fixed_mem->push_global(lblock);
+                        });
+                  })
+                  .then([t_inner_rpc]() {
+                    DBG("Completed inter rpc relay in ", t_inner_rpc->get_elapsed_since_start(),
+                        " s count=", t_inner_rpc->get_total_count(), " active=", t_inner_rpc->get_active_count(), "\n");
+                    // stop the timer
+                  });
+          // no need to wait on fut_relay -- cleanup of the timer and push back to inter_fixed_mem will eventually happen and be
+          // verified
+
+          // future for gblock after it is consumed -- possibly with a return package
+          auto fut_return =
+              fut_blocks
+                  .then([&inter_fixed_mem](inter_global_block_t gblock,
+                                           inter_global_block_t lblock_ignored) -> upcxx::future<inter_global_block_t> {
+                    // optionally send this rank dest store back to the sending process
+                    DBG("Returning inter rpc gblock=", gblock.first, "\n");
+                    assert(gblock.first);
+                    assert(gblock.second == 0);  // it is drained
+                    assert(gblock.first.where() != rank_me());
+                    assert(lblock_ignored.first);
+                    assert(lblock_ignored.first.where() == rank_me());
+                    node_num_t node = split_rank(gblock.first.where()).get_node();
+                    DBG("to node=", node, "\n");
+                    // check for another available lblock and enough in dest store to have efficient transfer speed
+                    if (inter_fixed_mem->has_dest_stores() && inter_fixed_mem->global_reserved_size() > 0) {
+                      inter_global_block_t &store_block = inter_fixed_mem->dest_store(node);
+                      size_t store_size = store_block.second;
+                      if (store_block.first && store_size * 2 > inter_fixed_mem->get_count_per_block() &&
+                          store_size * sizeof(T) > 4 * ONE_KB) {
+                        // send this store back to sender with this gblock (store is over half full and larger than 4 KB)
+                        DBG("Sending store_size=", store_size, " back to ", gblock.first.where(), " store=", store_block.first,
+                            " (swapped)\n");
+                        assert(store_block.first);
+                        assert(store_block.first.where() == rank_me());
+                        auto lblock_fut = inter_fixed_mem->pop_global(true);
+                        assert(lblock_fut.is_ready());
+                        auto lblock = lblock_fut.result();
+                        assert(lblock.first);
+                        assert(lblock.first.where() == rank_me());
+                        assert(lblock.second == 0);  // empty
+                        // swap the store and free block: the dest store gets the empty block, lblock now holds the data
+                        std::swap(store_block, lblock);
+                        assert(lblock.second > 0);
+                        auto fut_rput_blocks = inter_fixed_mem->rput_block(lblock, gblock);
+                        return fut_rput_blocks.then([&inter_fixed_mem](inter_global_block_t lblock, inter_global_block_t gblock) {
+                          DBG("rput finished lblock=", lblock.first, " ", lblock.second, " gblock=", gblock.first, " ",
+                              gblock.second, "\n");
+                          assert(lblock.first);
+                          assert(lblock.first.where() == rank_me());
+                          assert(lblock.second == 0);  // is drained
+                          assert(gblock.first);
+                          assert(gblock.first.where() != rank_me());
+                          assert(gblock.second > 0);
+                          // push lblock after the rput completes
+                          inter_fixed_mem->push_global(lblock);
+                          return gblock;
+                        });
+                      }
+                    }
+                    // did not end up sending, just return the empty gblock
+                    DBG("Just returning gblock=", gblock.first, "\n");
+                    assert(gblock.first);
+                    assert(gblock.first.where() != rank_me());
+                    assert(gblock.second == 0);  // it is drained
+                    return make_future(gblock);
+                  })
+                  .then([t_inner_rpc](inter_global_block_t gblock) {
+                    DBG("Returning inter rpc gblock=", gblock.first, " ", gblock.second, " in ",
+                        t_inner_rpc->get_elapsed_since_start(), "\n");
+                    return gblock;
+                  });
+
+          return fut_return;  // just the gblock
+        },
+        gblock, track_inter_rpcs, track_intra_rpcs, intra_fixed_memory_store, inter_fixed_memory_store, func);
+
+    gblock = {};  // do not allow reuse of this global pointer until the return is ready and pushed back
+
+    // handle returned global_block (the references below alias the members and are captured by the continuations)
+    inter_fixed_memory_t &inter_fixed_mem = inter_fixed_memory_store;
+    intra_fixed_memory_t &intra_fixed_mem = intra_fixed_memory_store;
+    FuncDistObj &_func = func;
+    track_rpcs_t &_track_inter_rpcs = track_inter_rpcs;
+    track_rpcs_t &_track_intra_rpcs = track_intra_rpcs;
+    auto fut_returned =
+        fut.then([t_rpc, target_rank](inter_global_block_t gblock) {
+             DBG("Got inter rpc ack from ", (int)target_rank, ": ", gblock.first, " in ", t_rpc->get_elapsed_since_start(), " s\n");
+             // just stop the timer
+             assert(gblock.first);
+             assert(gblock.first.where() == rank_me());
+             return make_future(gblock);
+           })
+            .then([&inter_fixed_mem, &_track_inter_rpcs, &_track_intra_rpcs, &intra_fixed_mem,
+                   &_func](inter_global_block_t gblock) -> upcxx::future<inter_global_block_t> {
+              assert(gblock.first);
+              assert(gblock.first.where() == rank_me());
+              auto fut_gblock = make_future(gblock);  // NOTE(review): not referenced afterwards
+              if (gblock.second > 0) {
+                // there is data to be processed in this return ack, so relay it
+                DBG("Processing inter rpc ack gblock=", gblock.first, " ", gblock.second, "\n");
+                return inter_intra_inner_rpc_relay(gblock, _track_inter_rpcs, _track_intra_rpcs, intra_fixed_mem, inter_fixed_mem,
+                                                   _func);
+              } else {
+                return to_future(gblock);
+              }
+            })
+            .then([&inter_fixed_mem](inter_global_block_t gblock) {
+              assert(gblock.first);
+              assert(gblock.second == 0);  // it is drained
+              inter_fixed_mem->push_global(gblock);
+              assert(!gblock.first);  // is invalidated
+            });
+
+    // remember this rpc to wait on later (may not be necessary)
+    track_inter_rpcs->push(fut_returned);
+  }
+
+  // Sends gblock to target_rank, which lives on this node (intra), and arranges for its reuse.
+  // The caller's gblock is consumed by the send.
+  // Nothing is returned: the acknowledged block is pushed back to intra_fixed_mem and the chain is tracked in track_intra_rpcs.
+  static void send_intra_rpc(intrank_t target_rank, intra_global_block_t &gblock, track_rpcs_t &track_intra_rpcs,
+                             intra_fixed_memory_t &intra_fixed_mem, FuncDistObj &func) {
+    assert(!gblock.first.is_null());
+    assert(gblock.first.where() == rank_me());
+    assert(gblock.second > 0);
+
+    DBG("send_intra_rpc(", target_rank, ", gblock=", gblock.first, " size=", gblock.second, "\n");
+    assert(intra_fixed_mem->valid());
+    my_partial_progress(track_intra_rpcs);
+
+    // once the target acknowledges the block, recycle it into the fixed memory store
+    auto recycle_acked = [&intra_fixed_mem](intra_global_block_t acked) {
+      DBG("Returned acknowledged global ", acked.first, "\n");
+      assert(acked.first);
+      assert(acked.first.where() == rank_me());
+      intra_fixed_mem->push_global(acked);
+      assert(!acked.first);  // is invalidated
+    };
+    auto fut_acked = just_send_intra_rpc_nb(target_rank, gblock, intra_fixed_mem, track_intra_rpcs, func);
+    auto fut_recycled = fut_acked.then(recycle_acked);
+
+    // remember this rpc to wait on later
+    track_intra_rpcs->push(fut_recycled);
+  }
+
+  static upcxx::future<intra_global_block_t> just_send_intra_rpc_nb(intrank_t target_rank, intra_global_block_t &gblock,
+                                                             intra_fixed_memory_t &intra_fixed_mem, track_rpcs_t &track_intra_rpcs,
+                                                             FuncDistObj &func) {
+    // time the round-trip (t_rpc is kept alive by the capture in the final continuation)
+    auto t_rpc = make_shared<inst_timer_t>(track_intra_rpcs->get_rpc_timer());
+
+    // This RPC just starts consuming the global block and makes progress on the remote rank
+    // It returns a future global block when the remote rank has finished consuming it
+    auto fut = rpc(
+        target_rank,
+        [](intra_global_block_t gblock, FuncDistObj &func, intra_fixed_memory_t &intra_fixed_mem, track_rpcs_t &track_intra_rpcs) {
+          DBG("Executing process rpc intra node ", gblock.first, " ", gblock.second, ", intra_fixed_mem: ", &(*intra_fixed_mem),
+              "\n");
+          auto t_inner_rpc = make_shared<inst_timer_t>(track_intra_rpcs->get_rpc_inner_timer());
+          upcxx::future<> finished;  // becomes ready when the data has been processed on this rank
+
+          upcxx::future<intra_global_block_t> fut_gblock;  // the sender's block, handed back for reuse
+          if (gblock.first.is_local()) {
+            // processes the data immediately
+            process_local(gblock, func);
+            fut_gblock = to_future(gblock);
+            finished = make_future();
+          } else {
+            // copy the global data, then process it eventually
+            DBG("initiating rget of non-local intra gblock:", gblock.first, "\n");
+            auto fut_blocks = intra_fixed_mem->rget_block(gblock);
+            fut_gblock = fut_blocks.then([](intra_global_block_t gblock, intra_global_block_t ignored) { return gblock; });
+            finished = fut_blocks.then([&func, &intra_fixed_mem](intra_global_block_t ignored, intra_global_block_t lblock) {
+              process_local(lblock, func);
+              intra_fixed_mem->push_global(lblock);
+            });
+          }
+          finished.then([t_inner_rpc]() {
+            DBG("intra rpc finished in ", t_inner_rpc->get_elapsed_since_start(), " s\n");
+            // stop the timer
+          });
+          return fut_gblock;  // return the global_block to sender for reuse
+        },
+        gblock, func, intra_fixed_mem, track_intra_rpcs);
+    gblock = {};  // do not allow reuse of this global pointer until the return is ready and pushed back
+
+    return fut.then([t_rpc](intra_global_block_t gblock) {
+      DBG("intra rpc returned in ", t_rpc->get_elapsed_since_start(), " s\n");
+      // stop the timer
+      return gblock;
+    });
+  }
+
+  inline void send_intra_rpc(intrank_t target_rank, intra_global_block_t &gblock) {  // overload using this store's members
+    send_intra_rpc(target_rank, gblock, track_intra_rpcs, intra_fixed_memory_store, func);
+  }
+
+  // operation on 1 element (i.e. no dest_store); only valid when max_store_size <= 1
+  // will block until sufficient global available blocks are available
+  // and subject to the maximum rpcs in flight
+  void update_remote1(intrank_t target_rank, const T &elem) {
+    assert(max_store_size <= 1);
+    // limit pending RPCs still, then send through the non-blocking variant
+    wait_max_rpcs();
+    update_remote1_nb(target_rank, elem);
+  }
+
+  // non blocking version (for use in future chains): forwards to send_rpc1 without calling wait_max_rpcs()
+  inline void update_remote1_nb(intrank_t target_rank, const T &elem) { send_rpc1(target_rank, elem); }
+
+  // operate on a vector of elements in the dest_stores
+  // will block until sufficient global available blocks are available (passes a default-constructed reservation)
+  inline static void update_remote_intra(intrank_t target_rank, intra_global_block_t &gblock, track_rpcs_t &track_intra_rpcs,
+                                         intra_fixed_memory_t &intra_fixed_mem, FuncDistObj &func) {
+    intra_reservation_t empty_res;  // presumably tests false, selecting the waiting path in update_remote_intra_nb -- confirm
+    update_remote_intra_nb(target_rank, gblock, track_intra_rpcs, intra_fixed_mem, empty_res, func);
+  }
+
+  // sends gblock to target_rank and refills gblock: from the reservation if it has blocks, otherwise by waiting on replace_intra_store
+  static void update_remote_intra_nb(intrank_t target_rank, intra_global_block_t &gblock, track_rpcs_t &track_intra_rpcs,
+                                     intra_fixed_memory_t &intra_fixed_mem, intra_reservation_t &reservation, FuncDistObj &func) {
+    DBG("update_remote_intra(target_rank=", target_rank, ", gblock=", gblock.first, ", size=", gblock.second, "\n");
+    assert(gblock.first);
+    assert(gblock.first.where() == rank_me());
+    if (gblock.second == 0) DIE("Invalid call to update_remote on an empty global block\n");
+    intra_global_block_t send_gblock = gblock;  // make a copy
+    gblock = {};                                // invalidate it first
+
+    // now get another gblock
+    if (reservation && !reservation->empty()) {
+      replace_intra_store_nb(gblock, intra_fixed_mem, reservation);
+    } else {
+      assert(!gblock.first);
+      auto fut = replace_intra_store(gblock, intra_fixed_mem);
+      if (!fut.is_ready()) DBG(__func__, " will wait\n");
+      fut.wait();  // blocking path: used when there is no usable reservation
+    }
+    assert(gblock.first);
+    assert(gblock.first.where() == rank_me());
+
+    // now send the copy after gblock is available again
+    send_intra_rpc(target_rank, send_gblock, track_intra_rpcs, intra_fixed_mem, func);
+    assert(!send_gblock.first);  // is invalidated
+  }
+
+  inline void update_remote_intra(intrank_t target_rank, intra_global_block_t &gblock) {  // overload using this store's members
+    intra_reservation_t empty_res;  // default-constructed: no reserved blocks are supplied
+    update_remote_intra_nb(target_rank, gblock, track_intra_rpcs, intra_fixed_memory_store, empty_res, func);
+  }
+
+  inline void update_remote_intra_nb(intrank_t target_rank, intra_global_block_t &gblock, intra_reservation_t &reservation) {
+    update_remote_intra_nb(target_rank, gblock, track_intra_rpcs, intra_fixed_memory_store, reservation, func);  // static overload
+  }
+
+  static upcxx::future<> replace_inter_store(inter_global_block_t &gblock, inter_fixed_memory_t &inter_fixed_memory_store) {
+    assert(!gblock.first);
+    assert(gblock.second == 0);
+    upcxx::future<inter_global_block_t> newblock = inter_fixed_memory_store->pop_global();
+    return when_all(make_future(std::ref(gblock)), newblock)  // gblock travels by reference so the continuation can assign it
+        .then([&inter_fixed_memory_store](inter_global_block_t &gblock, inter_global_block_t newblock) {
+          assert(newblock.first);
+          assert(newblock.first.where() == rank_me());
+          assert(newblock.second == 0);
+          if (!gblock.first) {
+            // won the race: gblock is still empty, so install the new block
+            gblock = newblock;
+          } else {
+            // lost the race: gblock was filled in the meantime, so put the newblock back
+            inter_fixed_memory_store->push_global(newblock);
+          }
+        });
+  }
+
+  static upcxx::future<> replace_intra_store(intra_global_block_t &gblock, intra_fixed_memory_t &intra_fixed_memory_store) {
+    upcxx::future<intra_global_block_t> newblock = intra_fixed_memory_store->pop_global();
+    return when_all(make_future(std::ref(gblock)), newblock)  // gblock travels by reference so the continuation can assign it
+        .then([&intra_fixed_memory_store](intra_global_block_t &gblock, intra_global_block_t newblock) {
+          assert(newblock.first);
+          assert(newblock.first.where() == rank_me());
+          assert(newblock.second == 0);
+          if (!gblock.first) {
+            // won the race: gblock is still empty, so install the new block
+            gblock = newblock;
+            DBG("update_remote_intra: got an used new gblock from dispatcher:", gblock.first, "\n");
+          } else {
+            // lost the race: gblock was filled in the meantime, so put the newblock back
+            intra_fixed_memory_store->push_global(newblock);
+          }
+        });
+  }
+
+  static void replace_intra_store_nb(intra_global_block_t &gblock, intra_fixed_memory_t &intra_fixed_mem,
+                                     intra_reservation_t &reservation) {
+    assert(reservation);
+    if (!reservation) DIE("invalid call without reservation!\n");
+    assert(!reservation->empty());
+    if (reservation->empty()) DIE("Unexpected - the reservation is fully drained!\n");
+
+    // replace the (empty) gblock with a reserved block taken from the back of the reservation
+    assert(!gblock.first);
+    gblock = reservation->back();
+    reservation->pop_back();
+
+    assert(gblock.first);
+    assert(gblock.first.where() == rank_me());
+    assert(gblock.second == 0);
+    DBG("update_remote_intra: got new gblock from reservation:", gblock.first, "\n");
+  }
+
+  // operate on a vector of elements in the dest_stores: sends gblock and waits for its replacement
+  // will block until sufficient global available blocks are available
+  // and subject to the maximum rpcs in flight
+  void update_remote_inter(intrank_t target_rank, inter_global_block_t &gblock) {
+    assert(gblock.first);
+    assert(gblock.second > 0);
+    auto fut = update_remote_inter_nb(target_rank, gblock);
+    DBG(__func__, " my_progress\n");
+    if (!fut.is_ready()) {
+      DBG(__func__, " still waiting on inter dest store\n");
+    }
+    fut.wait();  // blocks until the future returned by update_remote_inter_nb completes
+    assert(gblock.first);  // is valid again
+  }
+
+  upcxx::future<> update_remote_inter_nb(intrank_t target_rank, inter_global_block_t &gblock) {  // non-blocking send + refill
+    if (gblock.second == 0) DIE("Invalid call to update_remote_inter on an empty global block\n");
+    assert(gblock.first);
+    assert(gblock.first.where() == rank_me());
+    inter_global_block_t sendBlock = gblock;  // copy
+    size_t node = split_rank(target_rank).get_node();
+    assert(inter_fixed_memory_store->dest_store(node) == sendBlock);
+    gblock = {};  // invalidate it
+    auto fut = replace_inter_store(gblock, inter_fixed_memory_store);  // completes after a replacement block was handled
+    send_inter_rpc(split_rank::get_rank_from_node(node), sendBlock);  // send to dedicated rank on remote node
+    if (!fut.is_ready()) DBG("inter dest store is not immediately ready\n");
+    return fut;
+  }
+
+  // returns true if an RPC was initiated (variant without reserved blocks: passes a default-constructed reservation)
+  inline static bool add_to_dest_store_intranode(split_rank split, const T &elem, track_rpcs_t &track_intra_rpcs,
+                                                 intra_fixed_memory_t &intra_fixed_mem, FuncDistObj &func) {
+    intra_reservation_t empty_res;
+    return add_to_dest_store_intranode_nb(split, elem, track_intra_rpcs, intra_fixed_mem, empty_res, func);
+  }
+  // non-blocking version (with non-empty reservation); returns true if the store filled up and an RPC was initiated
+  static bool add_to_dest_store_intranode_nb(split_rank split, const T &elem, track_rpcs_t &track_intra_rpcs,
+                                             intra_fixed_memory_t &intra_fixed_mem, intra_reservation_t &reservation,
+                                             FuncDistObj &func) {
+    // intranode: the dest store is selected by the target's thread number
+    size_t max_store_size = intra_fixed_mem->get_count_per_block();
+    intra_global_block_t &gblock = intra_fixed_mem->dest_store(split.get_thread());
+    if (!gblock.first && reservation) {
+      // This is a race between the master persona waiting for a free gblock and an intra node rpc executing while it waits
+      // it is safe for this stack to swap in a new gblock from the reservation
+      assert(!reservation->empty());
+      gblock = reservation->back();
+      reservation->pop_back();
+      DBG("add_to_dest_store_intranode replaced empty gblock with one from my reservation\n");
+    }
+    assert(gblock.second < max_store_size);
+    if (gblock.second >= max_store_size)
+      DIE("Invalid state of gblock with ", gblock.second, " elements but max of ", max_store_size, "\n");
+    assert(gblock.first);
+    assert(gblock.first.where() == rank_me());
+    if (gblock.first.where() != rank_me()) DIE("Invalid state of gblock not local to current rank: ", gblock.first, "\n");
+    T *lptr = gblock.first.local();
+    lptr[gblock.second++] = elem;  // append the element and bump the count
+    if (gblock.second == max_store_size) {
+      // DBG("add_to_dest_store_intranode found full for ", (int) split.get_thread(), "/", split.get_rank(), " reservation:",
+      // reservation.get(), ", gblock=", gblock.first, ",", gblock.second, "\n");
+      if (reservation && reservation->empty()) DIE("Invalid state for ", __func__, " reservation is present but empty\n");
+      update_remote_intra_nb(split.get_rank(), gblock, track_intra_rpcs, intra_fixed_mem, reservation, func);
+      assert(gblock.first);  // gblock is restored
+      assert(gblock.first == intra_fixed_mem->dest_store(split.get_thread()).first);
+      return true;
+    }
+    assert(gblock.first);  // gblock is still good
+    assert(gblock.first.local());
+    assert(gblock.first == intra_fixed_mem->dest_store(split.get_thread()).first);
+    return false;
+  }
+
+  void _add_to_dest_store_internode_fast(split_rank split, const T &elem, inter_global_block_t &gblock) {
+    assert(gblock.first);
+    assert(gblock.first.where() == rank_me());
+    assert(inter_fixed_memory_store->get_thread_offset() > 0);
+    if (gblock.second >= max_store_size) DIE("Invalid call to add_to_dest_store_internode_fast\n");
+    T *lptr = gblock.first.local();
+    // elements ascend after the pointer
+    lptr[gblock.second] = elem;
+    // thread numbers descend before the pointer
+    thread_num_t *t = ((thread_num_t *)lptr) - 1 - gblock.second;
+    *t = split.get_thread();
+    gblock.second++;  // one count covers both the element and its thread number
+    assert(gblock.second <= max_store_size);
+  }
+
+  // adds an entry to the dest store of the target's node
+  // may send an rpc (returning true in that case) when the store reaches max_store_size
+  // may block if inter global blocks are unavailable
+  bool add_to_dest_store_internode(split_rank split, const T &elem) {
+    inter_global_block_t &gblock = inter_fixed_memory_store->dest_store(split.get_node());
+    assert(gblock.first);
+    assert(gblock.first == inter_fixed_memory_store->dest_store(split.get_node()).first);
+    if (gblock.second >= max_store_size)
+      DIE("Invalid state of gblock with ", gblock.second, " elements but max of ", max_store_size, "\n");
+    assert(gblock.second < max_store_size);
+    if (gblock.first.where() != rank_me()) DIE("Invalid state of gblock not local to current rank: ", gblock.first, "\n");
+
+    bool did_send = false;
+    if (gblock.second < max_store_size) {  // NOTE(review): always true here if DIE above does not return -- confirm
+      _add_to_dest_store_internode_fast(split, elem, gblock);
+    }
+    if (gblock.second == max_store_size) {
+      update_remote_inter(split.get_rank(), gblock);  // blocking: gblock is valid again when this returns
+      did_send = true;
+    }
+    assert(gblock.second < max_store_size);
+    assert(gblock.first);
+    assert(gblock.first.where() == rank_me());
+    assert(gblock.first == inter_fixed_memory_store->dest_store(split.get_node()).first);
+    return did_send;
+  }
+
+  // Routes one element toward target_rank: directly when stores are disabled, otherwise via the intra or inter dest store.
+  // Returns true if an RPC was initiated; may block.
+  bool add_to_dest_store(intrank_t target_rank, const T &elem) {
+    const bool aggregating = max_store_size > 1;
+    bool rpc_started;
+    if (!aggregating) {
+      // no dest stores: the element goes out on its own
+      update_remote1(target_rank, elem);
+      rpc_started = true;
+    } else {
+      split_rank dest(target_rank);
+      const bool same_node = dest.is_local();
+      if (!same_node) {
+        assert(split_rank::num_nodes() > 1);
+        rpc_started = add_to_dest_store_internode(dest, elem);
+      } else {
+        rpc_started = add_to_dest_store_intranode(dest, elem, track_intra_rpcs, intra_fixed_memory_store, func);
+      }
+    }
+    // progress anyway to kick off the rpc
+    if (rpc_started) my_progress();
+    return rpc_started;
+  }
+
+ public:
+  string description;  // label given at construction; printed by to_string()
+
+  TwoTierAggrStore(FuncDistObj &f, const string description)
+      : func(f)
+      , max_store_size(0)  // NOTE(review): starts at 0; presumably configured later -- confirm
+      , max_rpcs_in_flight(MAX_RPCS_IN_FLIGHT)
+      , intra_fixed_memory_store(func.team(), description + string("-intra-store"))
+      , inter_fixed_memory_store(func.team(), description + string("-inter-store"))
+      , track_inter_rpcs(func.team(), description + string("-track-inter-rpc"))
+      , track_intra_rpcs(func.team(), description + string("-track-intra-rpc"))
+      , t_prog(description + string("-TwoTierAggrStore"))
+      , description(description)  // inside the parentheses this names the constructor parameter
+      , my_progress_is_required(false) {}
+  TwoTierAggrStore(const TwoTierAggrStore &) = delete;  // not copyable
+  TwoTierAggrStore(TwoTierAggrStore &&) = default;
+  virtual ~TwoTierAggrStore() { clear(); }  // clear() is invoked on destruction
+
+  // Summarizes the stores and rpc timer counts as "<description>-TwoTierAggrStore(key=value,...)".
+  string to_string() const {
+    ostringstream os;
+    os << description << "-TwoTierAggrStore(";  // opens the parenthesis that is closed before returning
+    os << "inter_store=" << inter_fixed_memory_store->to_string() << ",";
+    os << "intra_store=" << intra_fixed_memory_store->to_string() << ",";
+    os << "track_inter_rpcs=" << track_inter_rpcs->to_string() << ",";
+    os << "inter_rpc_t=" << track_inter_rpcs->get_rpc_timer().get_total_count() << "/"
+       << track_inter_rpcs->get_rpc_inner_timer().get_total_count() << ",";
+    os << "inter_intra_rpc_t=" << track_inter_rpcs->get_rpc_relay_timer().get_total_count() << ",";
+    os << "track_intra_rpcs=" << track_intra_rpcs->to_string() << ",";
+    os << "intra_rpc_t=" << track_intra_rpcs->get_rpc_timer().get_total_count() << "/"
+       << track_intra_rpcs->get_rpc_inner_timer().get_total_count() << ",";
+    os << ")";
+    return os.str();
+  }
+
+  static void optimal_num_blocks_and_count_per(const size_t max_bytes, const size_t max_rpcs, size_t &num_intra_blocks,
+                                               size_t &num_inter_blocks, size_t &count_per_block) {
+    // Chooses the intra/inter-node block counts and the element count per block for a memory budget.
+    //   in:  max_bytes (memory budget), max_rpcs (caps the starting number of rpcs in flight at 2 * max_rpcs)
+    //   out: num_intra_blocks, num_inter_blocks, count_per_block (1 with zero blocks means direct rpcs)
+    // required:
+    //   num_blocks * sizeof(T) * count_per_block <= max_bytes
+    //   min_rpcs_in_flight <= rpcs_in_flight <= max_rpcs_in_flight
+    //   rpcs_in_flight == num_blocks - dest_store_size
+    // optimization compromises:
+    //   count_per_block * sizeof(T) >= 8KB, optimally much larger 1MB (NOTE(review): the code below uses 16KB / 128KB)
+    //   dest_store_size == count_per_block == 1 ? 0 : rank_n()
+    //   max_rpcs_in_flight == min(rank_n() * 10, 2048);
+    //   min_rpcs_in_flight == rank_n() // possibly nodes (technically 1)
+    // furthermore if num_nodes == 1 there will be 0 internode blocks
+
+    // start calcs with min limits
+    size_t sz = sizeof(T) + sizeof(thread_num_t);  // bytes per element; reduced to sizeof(T) below when single-node
+    size_t inter_dest_store_size = split_rank::num_nodes();
+    size_t intra_dest_store_size = split_rank::num_threads();
+    size_t res_size = split_rank::num_threads();
+    size_t min_inter_rpcs_in_flight = 2 * inter_dest_store_size + 16;                   // every 2 * inter dest store + 16
+    size_t min_reservations = 1 + split_rank::num_nodes() / split_rank::num_threads();  // 1 + nodes/(cores/node)
+    size_t min_intra_rpcs_in_flight =
+        intra_dest_store_size + min_reservations * res_size + 16;  // every dest store + a few reservations + 16
+
+    if (split_rank::num_nodes() == 1) {
+      sz = sizeof(T);
+      inter_dest_store_size = 0;
+      res_size = 0;
+      min_inter_rpcs_in_flight = 0;
+      min_intra_rpcs_in_flight = intra_dest_store_size + 16;
+    }
+
+    size_t num_blocks = 0;
+    size_t min_rpcs_in_flight = min_intra_rpcs_in_flight + min_inter_rpcs_in_flight;
+    size_t rpcs_in_flight = min_rpcs_in_flight * 8;  // optimistic starting point: 8x the minimum
+    if (min_rpcs_in_flight > 2 * max_rpcs) {
+      rpcs_in_flight = min_rpcs_in_flight;  // min == max and it will exceed max_rpcs
+    } else if (rpcs_in_flight > 2 * max_rpcs) {
+      rpcs_in_flight = 2 * max_rpcs;  // reduce the starting max rpc
+    }
+
+    DBG("optimizing max_bytes=", get_size_str(max_bytes), " min_rpcs=", min_rpcs_in_flight, " inter=", inter_dest_store_size,
+        " intra=", intra_dest_store_size, "\n");
+    if (max_bytes >= 2 * sz * (min_rpcs_in_flight + inter_dest_store_size + intra_dest_store_size)) {
+      // the budget holds at least 2 elements in every minimum block, so search for a configuration:
+      // start with large blocks and max rpcs in flight, then in order
+      //   decrease rpcs_in_flight to 2 * minimum
+      //   decrease block size to 16KB, then rpcs_in_flight to the minimum
+      //   decrease block size further.
+      size_t target_min_mem = 16 * ONE_KB - 64;  // still fast but below gets noticeably slower
+      size_t mem_per_block = 128 * ONE_KB - 64;  // initial best case
+      count_per_block = (mem_per_block + sz - 1) / sz;
+      DBG("optimizing mem_per_block=", get_size_str(mem_per_block), " rpcs=", rpcs_in_flight, " count_per_block=", count_per_block,
+          " block_size=", get_size_str(count_per_block * sz), "\n");
+      do {
+        num_blocks = rpcs_in_flight + inter_dest_store_size + intra_dest_store_size;
+        if (sz * num_blocks * count_per_block < max_bytes) break;  // current configuration fits the budget
+        size_t try_mem = 3 * mem_per_block / 4;    // reduce to 75%
+        size_t try_rpcs = 3 * rpcs_in_flight / 4;  // reduce to 75%
+        if (try_rpcs > 2 * min_rpcs_in_flight) {   // first reduce in-flight to 2*minimum
+          rpcs_in_flight = try_rpcs;
+        } else if (try_mem > target_min_mem) {  // next reduce count to target minimum
+          rpcs_in_flight = 2 * min_rpcs_in_flight;
+          mem_per_block = try_mem;
+        } else if (try_rpcs > min_rpcs_in_flight) {  // next reduce in-flight to minimum
+          mem_per_block = target_min_mem;
+          rpcs_in_flight = try_rpcs;
+        } else {  // lastly reduce block size below the target_min_mem
+          rpcs_in_flight = min_rpcs_in_flight;
+          mem_per_block = (mem_per_block > ONE_KB) ? (3 * mem_per_block / 4) : (mem_per_block / 2);
+        }
+        count_per_block = (mem_per_block + sz - 1) / sz;
+        DBG("optimizing mem_per_block=", get_size_str(mem_per_block), " rpcs=", rpcs_in_flight,
+            " count_per_block=", count_per_block, "\n");
+      } while (count_per_block > 1);  // falling to <= 1 element per block means nothing fit
+    } else {
+      count_per_block = 1;  // budget too small to aggregate at all
+    }
+
+    if (count_per_block <= 1) {
+      // no allocation - just direct rpcs
+      count_per_block = 1;
+      inter_dest_store_size = 0;
+      intra_dest_store_size = 0;
+      num_blocks = 0;
+      rpcs_in_flight = max_bytes / sz < max_rpcs ? max_bytes / sz : max_rpcs;  // min(max_bytes / sz, max_rpcs)
+    } else {
+      num_blocks = rpcs_in_flight + inter_dest_store_size + intra_dest_store_size;
+    }
+
+    // All of these must still be true (note: this checks sizeof(T) while the search above budgets with sz)
+    assert(count_per_block * sizeof(T) * num_blocks <= max_bytes);
+    assert(count_per_block >= 1);
+
+    if (num_blocks > 1) {
+      // calculate the inter and intra block counts; the double products are truncated on assignment to size_t
+      double inter_fraction = .75;  // 75% of the extra blocks go to inter stores
+      assert(min_inter_rpcs_in_flight + min_intra_rpcs_in_flight == min_rpcs_in_flight);
+      assert(min_inter_rpcs_in_flight + min_intra_rpcs_in_flight <= rpcs_in_flight);
+
+      if (split_rank::num_nodes() > 1) {
+        num_inter_blocks =
+            inter_dest_store_size + min_inter_rpcs_in_flight + (rpcs_in_flight - min_rpcs_in_flight) * inter_fraction;
+      } else {
+        num_inter_blocks = 0;
+        inter_fraction = 0.0;  // single node: every extra block goes to the intra stores
+      }
+      num_intra_blocks =
+          intra_dest_store_size + min_intra_rpcs_in_flight + (rpcs_in_flight - min_rpcs_in_flight) * (1.0 - inter_fraction);
+      assert(num_blocks >= num_inter_blocks + num_intra_blocks);
+
+    } else {
+      num_intra_blocks = num_inter_blocks = 0;
+    }
+    SOUT("Calculated optimal num and block size for ", split_rank::num_nodes(), " internode sets and ",
+         (size_t)split_rank::num_threads(), " intranode ranks per node\n");
+    SOUT("Found optimal TwoTierAggrStore of num_intra_blocks=", num_intra_blocks, " num_inter_blocks=", num_inter_blocks,
+         " count_per_block=", count_per_block, " (", get_size_str(count_per_block * sz), " per block, ",
+         get_size_str((num_inter_blocks + num_intra_blocks) * count_per_block * sz), ")\n");
+  }
+
+  void set_size(size_t max_store_bytes) {
+    DBG("TwoTierAggrStore::set_size(", max_store_bytes, ")\n");
+    // Sizes both fixed-memory stores for a budget of max_store_bytes; falls back to single rpcs when nothing fits.
+    size_t count_per_block = 0, num_intra_blocks = 0, num_inter_blocks = 0;
+    optimal_num_blocks_and_count_per(max_store_bytes, max_rpcs_in_flight, num_intra_blocks, num_inter_blocks, count_per_block);
+    assert(count_per_block > 0);
+
+    if (count_per_block <= 1) {
+      // no reason for delay and storage of 1 entry (i.e. small max mem at large scale), still uses max_rpcs_in_flight
+      max_store_size = 0;
+      num_intra_blocks = 0;
+      num_inter_blocks = 0;
+      count_per_block = 1;  // will send single rpcs
+      if (max_store_bytes > 0) {
+        // not intentionally disabled
+        SWARN("Using no TwoTierAggrStore to aggregate messages because no configutation works with less than ",
+              get_size_str(max_store_bytes), " at this scale\n");
+      }
+    } else {
+      max_store_size = count_per_block;
+    }
+
+    // per-block byte sizes; they are only used by the summary message at the end of this function
+    size_t per_intra_rpc_bytes = count_per_block * sizeof(T);
+    size_t per_inter_rpc_bytes = count_per_block * (sizeof(T) + sizeof(thread_num_t));
+
+    if (num_intra_blocks == 0) {
+      // no dest stores will be used intra or inter
+      assert(num_inter_blocks == 0);
+      assert(max_store_size == 0);
+      assert(count_per_block == 1);
+      num_intra_blocks = 0;
+      num_inter_blocks = 0;  // keeps the pair consistent even when the assert above is compiled out
+    }
+
+    node_num_t nodes = split_rank::num_nodes();
+    // always have intra node
+
+    SOUT("Establishing ", description, " intra dest stores\n");
+    intra_fixed_memory_store->set_fixed_mem(num_intra_blocks, count_per_block, split_rank::num_threads(), false,
+                                            nodes == 1 ? 0 : split_rank::num_threads());
+
+    SOUT("Establishing ", description, " inter dest stores\n");
+    if (nodes == 1) {
+      // special case for single node with no internode rpcs needed
+      if (rank_n() > 1) assert(num_inter_blocks == 0);
+      inter_fixed_memory_store->set_fixed_mem(0, 1, 1, false, 0);
+    } else {
+      if (num_inter_blocks > 0)
+        assert(num_inter_blocks >= split_rank::num_nodes() * 3);  // room for dest store, 1 reservation and 1 in flight
+      inter_fixed_memory_store->set_fixed_mem(num_inter_blocks, count_per_block, split_rank::num_nodes(), true,
+                                              split_rank::num_nodes());
+    }
+
+    SOUT("Using a ", description, " store of max ",
+         get_size_str(num_intra_blocks * per_intra_rpc_bytes + num_inter_blocks * per_inter_rpc_bytes),
+         " per target rank, giving max ", max_store_size, " of ", get_size_str(sizeof(T)), "/",
+         get_size_str(sizeof(T) + sizeof(thread_num_t)), " entries per target rank (", get_size_str(per_intra_rpc_bytes), "/",
+         get_size_str(per_inter_rpc_bytes), ", ", get_size_str(per_intra_rpc_bytes * num_intra_blocks), "/",
+         get_size_str(per_inter_rpc_bytes * num_inter_blocks), ") and ", max_rpcs_in_flight, " rpcs in flight\n");
+  }
+
+  // reports true only when neither tier holds stored elements nor has rpcs in flight
+  inline bool empty() const {
+    if (!track_intra_rpcs->empty() || !intra_fixed_memory_store->empty()) return false;
+    return track_inter_rpcs->empty() && inter_fixed_memory_store->empty();
+  }
+
+  void clear() {  // releases both tiers; DIEs if any element or rpc is still outstanding
+    DBG("TwoTierAggrStore::clear()\n");
+    inter_fixed_memory_store->clear_dest_stores();  // dest stores are dropped first, before the empty() check below
+    intra_fixed_memory_store->clear_dest_stores();
+    if (!empty()) DIE("clear() called on a non-empty TwoTierAggrStore!\n");
+    track_inter_rpcs->clear();
+    inter_fixed_memory_store->clear();
+    track_intra_rpcs->clear();
+    intra_fixed_memory_store->clear();
+    t_prog.print_out();
+    t_process_local().print_out();
+    Timings::wait_pending();  // wait for the futures queued through Timings before checking the final state
+    assert(intra_fixed_memory_store->empty());
+    assert(inter_fixed_memory_store->empty());
+    assert(!intra_fixed_memory_store->valid());  // both stores must report !valid() once cleared
+    assert(!inter_fixed_memory_store->valid());
+    barrier();
+  }
+
+  bool update(intrank_t target_rank, const T &elem) {
+    static size_t num_updates = 0;  // function-local static: the tally persists across calls
+    const bool added = add_to_dest_store(target_rank, elem);
+    ++num_updates;
+    // call my_progress() every 32 updates while progress is required, otherwise every 4096 updates
+    const bool must_poll_often = my_progress_required();
+    const size_t poll_interval = must_poll_often ? 32 : 4096;
+    if (num_updates % poll_interval == 0) my_progress();
+    return added;
+  }
+
+  void flush_inter_updates() {
+    if (split_rank::num_nodes() == 1) {  // single node: there is nothing to send, only verify the store is empty
+      if (!inter_fixed_memory_store->empty()) DIE("flush_inter_updates called when there is only 1 node from split_rank!\n");
+      return;
+    }
+    Timer timer(description + "-TwoTierAggrStore::flush_inter_updates");
+    DBG("flushing inter updates...\n");
+
+    // first flush inter node stores (num_nodes() != 1 here because of the early return above)
+    size_t num_inter_dest = split_rank::num_nodes() == 1 ? 0 : split_rank::num_nodes();
+    if (num_inter_dest == 0) assert(inter_fixed_memory_store->empty());  // only reachable if num_nodes() were 0
+    for (node_num_t _node = 0; _node < num_inter_dest; _node++) {
+      node_num_t node = (_node + 1 + split_rank::get_my_node()) %
+                        split_rank::num_nodes();  // rotate the flushes, starting with the next node in the job
+      if (max_store_size > 0) {  // max_store_size == 0 means aggregation is disabled (see set_size)
+        inter_global_block_t &gblock = inter_fixed_memory_store->dest_store(node);
+        if (node == split_rank::get_my_node()) {
+          assert(gblock.second == 0);  // nothing is ever buffered for this rank's own node
+        }
+        assert(gblock.first);
+        assert(gblock.first.where() == rank_me());
+        if (gblock.second > 0) {  // gblock.second is the number of buffered elements; skip empty stores
+          update_remote_inter(split_rank::get_rank_from_node(node), gblock);
+        }
+      }
+    }
+    my_progress();
+    DBG("all my internode data send rpcs have been sent\n");
+  }
+
+  void flush_intra_updates() {
+    Timer timer(description + "-TwoTierAggrStore::flush_intra_updates");
+    DBG("flushing intra updates...\n");
+    // send every non-empty intra-node dest store to its thread, then call my_progress() once
+    for (thread_num_t _thread = 0; _thread < split_rank::num_threads(); _thread++) {
+      thread_num_t thread = (_thread + 1 + split_rank::get_my_thread()) %
+                            split_rank::num_threads();  // rotate the flushes starting with the next thread
+      if (max_store_size > 0) {  // max_store_size == 0 means aggregation is disabled (see set_size)
+        intra_global_block_t &gblock = intra_fixed_memory_store->dest_store(thread);
+        assert(gblock.first);
+        assert(gblock.first.where() == rank_me());
+        if (gblock.second > 0) {  // gblock.second is the number of buffered elements; skip empty stores
+          update_remote_intra(split_rank::get_rank_from_thread(thread), gblock);
+          assert(gblock.first);  // the dest store must still hold a valid local block after the send
+          assert(gblock.first.where() == rank_me());
+        }
+      }
+    }
+    DBG(__func__, " my_progress\n");
+    my_progress();
+    DBG("all my intranode data send rpcs have been sent\n");
+  }
+
+  static void flush_intra_updates_with_res(intra_reservation_t &reservation, track_rpcs_t &track_intra_rpcs,
+                                           intra_fixed_memory_t &intra_fixed_mem, FuncDistObj &func) {
+    // Sends the fullest intra-node dest stores for as long as the reservation still holds blocks.
+    assert(reservation);
+    int count_flushed = 0;
+    if (!reservation->empty()) {
+      // flush the most full dest_stores first, stopping at 1/4 capacity
+      vector<size_t> rank_counts;
+      rank_counts.reserve(split_rank::num_threads());
+      for (thread_num_t thread = 0; thread < split_rank::num_threads(); thread++) {
+        size_t s = intra_fixed_mem->dest_store(thread).second;
+        assert(s < (1ull << 32));
+        if (s >= intra_fixed_mem->get_count_per_block() / 4) {
+          s = (s << 32) | thread;  // combine count in high bits, thread in low bits
+          rank_counts.push_back(s);
+        }
+      }
+      if (!rank_counts.empty()) {
+        // sort ascending by count, then thread
+        std::sort(rank_counts.begin(), rank_counts.end());
+      }
+      // presumably update_remote_intra_nb() draws its replacement block from the reservation - TODO confirm
+      while (!reservation->empty() && !rank_counts.empty()) {
+        const size_t packed = rank_counts.back();  // fullest remaining store, because the sort is ascending
+        rank_counts.pop_back();
+        const thread_num_t thread = packed & 0xffffffff;  // low 32 bits hold the thread; the count was only the sort key
+        intra_global_block_t &gblock = intra_fixed_mem->dest_store(thread);
+        assert(gblock.first);
+        assert(gblock.first.where() == rank_me());
+        update_remote_intra_nb(split_rank::get_rank_from_thread(thread), gblock, track_intra_rpcs, intra_fixed_mem, reservation,
+                               func);
+        assert(gblock.first);
+        assert(gblock.first.where() == rank_me());
+        count_flushed++;
+      }
+    }
+    DBG("flush_intra_updates with reservation flushed ", count_flushed, " intra stores\n");
+  }
+
+  void flush_updates() {
+    BarrierTimer timer(description + "-TwoTierAggrStore::flush_updates", false);
+    DBG("flushing updates...\n");
+    // Full flush: drain the inter-node tier first, then the intra-node tier, then restore the dest stores.
+    flush_inter_updates();
+
+    // pre-emptively flush intra_stores
+    // create a (possibly small) reservation of intra blocks
+    intra_reservation_t res = make_shared<vector<intra_global_block_t> >();
+    // clear dest stores so global_dispatcher can be empty()
+    inter_fixed_memory_store->clear_dest_stores();
+    StallTimer is_inter_stalled(description + "-flush_updates-inter-store-empty");
+    do {
+      is_inter_stalled.check();
+      my_progress();
+      while (res->size() < split_rank::num_threads() &&
+             intra_fixed_memory_store->global_available_size() > split_rank::num_threads()) {
+        auto fut_gblock = intra_fixed_memory_store->pop_global();
+        if (!fut_gblock.is_ready()) DIE("Invalid state - there were available blocks but just popped one not ready!\n");
+        res->push_back(fut_gblock.result());
+      }
+      if (!res->empty()) {
+        flush_intra_updates_with_res(res, track_intra_rpcs, intra_fixed_memory_store, func);
+      }
+    } while (!inter_fixed_memory_store->empty());
+    // replace the temporary intra blocks within the reservation and destroy it
+    intra_fixed_memory_store->release_reservation(res);
+    assert(res->empty());
+    res.reset();
+    track_inter_rpcs->flush(0);
+
+    DBG("all my data send rpcs returned and global blocks have returned too\n");
+
+    {
+      BarrierTimer timer2(description + "-TwoTierAggrStore::flush_updates after inter-node", split_rank::num_nodes() > 1);
+      // now all threads have received all inter node rpcs.  Flush last internode RPCs that may not have been processed yet
+      StallTimer is_inter_stalled_again(description + "-flush_updates-inter-store-empty-again");
+      do {
+        is_inter_stalled_again.check();
+        my_progress();
+      } while (!inter_fixed_memory_store->empty());
+      assert(inter_fixed_memory_store->empty());  // should remain empty
+
+      // now all threads have received all inter node rpcs.  Flush last intra node stores
+      // last flush intra node stores
+      flush_intra_updates();
+
+      // clear dest stores so global_dispatcher can be empty()
+      intra_fixed_memory_store->clear_dest_stores();
+      StallTimer is_intra_stalled(description + "-flush_updates-intra-store-empty");
+      do {
+        is_intra_stalled.check();
+        assert(inter_fixed_memory_store->empty());  // should remain empty
+        my_progress();
+      } while (!intra_fixed_memory_store->empty());
+      assert(inter_fixed_memory_store->empty());  // should still be empty
+      track_intra_rpcs->flush(0);
+    }  // implicit barrier from BarrierTimer timer2
+    DBG(__func__, " last my_progress\n");
+    my_progress();
+
+    assert(inter_fixed_memory_store->empty());
+    assert(intra_fixed_memory_store->empty());
+    DBG("Done with flush_updates\n");
+
+    // restore dest_stores for next round (only when aggregation is in use, i.e. more than 1 element per block)
+    if (intra_fixed_memory_store->get_count_per_block() > 1) {
+      intra_fixed_memory_store->set_dest_stores(split_rank::num_threads());
+    }
+    // restore dest_stores for next round (inter stores exist only when there is more than one node)
+    if (inter_fixed_memory_store->get_count_per_block() > 1 && split_rank::num_nodes() > 1) {
+      inter_fixed_memory_store->set_dest_stores(split_rank::num_nodes());
+    }
+    // barrier at exit from BarrierTimer
+  }
+};
+
+};  // namespace upcxx_utils
diff --git a/upcxx-utils/src/limit_outstanding.cpp b/upcxx-utils/src/limit_outstanding.cpp
index ed08bf9..e65d063 100644
--- a/upcxx-utils/src/limit_outstanding.cpp
+++ b/upcxx-utils/src/limit_outstanding.cpp
@@ -23,7 +23,7 @@ upcxx::future<> upcxx_utils::collapse_outstanding_futures(int limit, LimitedFutu
     while (outstanding_queue.size() > limit) {
       auto fut = outstanding_queue.front();
       outstanding_queue.pop_front();
-      if (!fut.ready()) returned_future = upcxx::when_all(fut, returned_future);
+      if (!fut.is_ready()) returned_future = upcxx::when_all(fut, returned_future);
     }
     DBG("limit=", limit, " outstanding=", outstanding_queue.size(), " max_check=", max_check, "\n");
     if (limit == 0) {
@@ -31,24 +31,24 @@ upcxx::future<> upcxx_utils::collapse_outstanding_futures(int limit, LimitedFutu
     } else {
       assert(outstanding_queue.size() <= limit);
       int i = 0;
-      while (i < max_check && !returned_future.ready() && i < outstanding_queue.size()) {
+      while (i < max_check && !returned_future.is_ready() && i < outstanding_queue.size()) {
         // find a ready future in the queue to swap with
         auto &test_fut = outstanding_queue[i++];
-        if (test_fut.ready()) {
+        if (test_fut.is_ready()) {
           std::swap(test_fut, returned_future);
-          assert(returned_future.ready());
+          assert(returned_future.is_ready());
           break;
         }
       }
     }
   }
-  DBG("limit=", limit, " outstanding=", outstanding_queue.size(), " max_check=", max_check, ", ret=", returned_future.ready(),
+  DBG("limit=", limit, " outstanding=", outstanding_queue.size(), " max_check=", max_check, ", ret=", returned_future.is_ready(),
       "\n");
   return returned_future;
 }
 
 void upcxx_utils::add_outstanding_future(upcxx::future<> fut, LimitedFutureQueue &outstanding_queue) {
-  if (!fut.ready()) outstanding_queue.push_back(fut);
+  if (!fut.is_ready()) outstanding_queue.push_back(fut);
 }
 
 upcxx::future<> upcxx_utils::limit_outstanding_futures(int limit, LimitedFutureQueue &outstanding_queue) {
@@ -62,7 +62,7 @@ upcxx::future<> upcxx_utils::limit_outstanding_futures(upcxx::future<> fut, int
     if (outstanding_queue.empty()) return fut;
     return upcxx::when_all(collapse_outstanding_futures(limit, outstanding_queue), fut);
   }
-  if (fut.ready()) {
+  if (fut.is_ready()) {
     if (outstanding_queue.size() <= limit) return fut;
   } else {
     outstanding_queue.push_back(fut);
diff --git a/upcxx-utils/src/ofstream.cpp b/upcxx-utils/src/ofstream.cpp
index cd311ac..784820f 100644
--- a/upcxx-utils/src/ofstream.cpp
+++ b/upcxx-utils/src/ofstream.cpp
@@ -841,7 +841,7 @@ dist_ofstream::~dist_ofstream() {
   if (!is_closed) close();
   assert(is_closed);
   stringstream().swap(ss);
-  DBG_VERBOSE("close_fut=", close_fut.ready(), "\n");
+  DBG_VERBOSE("close_fut=", close_fut.is_ready(), "\n");
 }
 
 void dist_ofstream::close() {
diff --git a/upcxx-utils/src/promise_collectives.cpp b/upcxx-utils/src/promise_collectives.cpp
index fd9ee7d..9f059cb 100644
--- a/upcxx-utils/src/promise_collectives.cpp
+++ b/upcxx-utils/src/promise_collectives.cpp
@@ -115,14 +115,14 @@ upcxx_utils::PromiseBarrier::~PromiseBarrier() {
   DBG_VERBOSE("Destroy this=", this, " move=", moved, "\n");
   if (moved) return;  // invalidated
   assert(upcxx::master_persona().active_with_caller());
-  assert(dist_workflow->initiated_prom.get_future().ready());
+  assert(dist_workflow->initiated_prom.get_future().is_ready());
   get_future().wait();
 }
 
 void upcxx_utils::PromiseBarrier::fulfill() const {
   DBG_VERBOSE("fulfill this=", this, "\n");
   assert(upcxx::master_persona().active_with_caller());
-  assert(!dist_workflow->initiated_prom.get_future().ready());
+  assert(!dist_workflow->initiated_prom.get_future().is_ready());
   dist_workflow->initiated_prom.fulfill_anonymous(1);
 }
 
diff --git a/upcxx-utils/src/reduce_prefix.cpp b/upcxx-utils/src/reduce_prefix.cpp
index ecd0aa6..0ca447e 100644
--- a/upcxx-utils/src/reduce_prefix.cpp
+++ b/upcxx-utils/src/reduce_prefix.cpp
@@ -124,8 +124,8 @@ future<> binary_tree_steps::get_future() const {
 // up phase is done
 
 bool binary_tree_steps::up_ready() const {
-  return dst_is_partial_left_me.get_future().ready() && scratch_is_partial_right.get_future().ready() &&
-         scratch_is_partial_to_parent.get_future().ready() && sent_partial_to_parent.get_future().ready();
+  return dst_is_partial_left_me.get_future().is_ready() && scratch_is_partial_right.get_future().is_ready() &&
+         scratch_is_partial_to_parent.get_future().is_ready() && sent_partial_to_parent.get_future().is_ready();
 }
 
 future<> binary_tree_steps::get_up_future() const {
@@ -135,8 +135,8 @@ future<> binary_tree_steps::get_up_future() const {
 // down phase is done
 
 bool binary_tree_steps::down_ready() const {
-  return scratch_is_partial_from_parent.get_future().ready() && sent_left_child.get_future().ready() &&
-         sent_right_child.get_future().ready();
+  return scratch_is_partial_from_parent.get_future().is_ready() && sent_left_child.get_future().is_ready() &&
+         sent_right_child.get_future().is_ready();
 }
 
 future<> binary_tree_steps::get_down_future() const {
diff --git a/upcxx-utils/src/timers.cpp b/upcxx-utils/src/timers.cpp
new file mode 100644
index 0000000..467c4b0
--- /dev/null
+++ b/upcxx-utils/src/timers.cpp
@@ -0,0 +1,698 @@
+#include <chrono>
+#include <ctime>
+#include <iomanip>
+#include <upcxx/upcxx.hpp>
+
+#define _TIMERS_CPP
+#include "upcxx_utils/timers.hpp"
+
+using upcxx::future;
+
+namespace upcxx_utils {
+
+// Reduce compile time by explicitly instantiating the templates for common types.
+// These are each constructed in CMakeLists.txt and timers-extern-template.in.cpp;
+// the matching extern template declarations all live in timers.hpp.
+
+/*
+ * This is now handled by CMakeLists.txt
+ *
+   MACRO_MIN_SUM_MAX(float,    template);
+   MACRO_MIN_SUM_MAX(double,   template);
+   MACRO_MIN_SUM_MAX(int64_t,  template);
+   MACRO_MIN_SUM_MAX(uint64_t, template);
+   MACRO_MIN_SUM_MAX(int,      template);
+
+ */
+
+//
+// Timings
+//
+
+future<> &Timings::get_last_pending() {
+  static future<> last_pending = make_future();  // function-local static, initialised on the first call
+  return last_pending;
+}
+
+Timings::Timings()
+    : t()
+    , before_elapsed(0.0)
+    , after_elapsed(0.0)
+    , reduction_elapsed(0.0)
+    , my_count(0)
+    , my_instance(0) {}
+
+future<> Timings::get_pending() { return get_last_pending(); }  // returns a copy of the shared pending future
+
+void Timings::set_pending(future<> fut) { get_last_pending() = when_all(get_last_pending(), fut); }  // pending = old && fut
+
+void Timings::wait_pending() {
+  DBG_VERBOSE(__func__, "\n");
+  // nothing to wait on unless the upcxx runtime is initialized
+  if (!upcxx::initialized()) return;
+  get_last_pending().wait();
+  get_last_pending() = make_future();
+}
+
+string Timings::to_string(bool print_count, bool print_label) const {  // one-line summary of the reduced statistics
+  ostringstream os;
+  if (print_label) os << "(min/my/avg/max, bal) ";  // optional legend for the number groups that follow
+  os << std::setprecision(2) << std::fixed;
+  // print the timing metrics as min/my/avg/max followed by the balance (avg divided by max)
+  auto &before_max = before_msm.max;
+  auto &before_min = before_msm.min;
+  auto &before_sum = before_msm.sum;
+  if (before_max > 0.0) {
+    double bal = (before_max > 0.0 ? before_sum / rank_n() / before_max : 1.0);
+    if (before_max > 10.0 && bal < .9) os << KLRED;  // highlight large imbalances
+    os << before_min << "/" << before_elapsed << "/" << before_sum / rank_n() << "/" << before_max << " s, " << bal;
+    if (before_max > 1.0 && bal < .9) os << KLCYAN;  // NOTE(review): reset tests > 1.0 but the highlight tests > 10.0 - confirm
+  } else {
+    os << "0/0/0/0 s, 1.00";
+  }
+
+  os << std::setprecision(1) << std::fixed;
+
+  auto &after_max = after_msm.max;
+  auto &after_min = after_msm.min;
+  auto &after_sum = after_msm.sum;
+  // print the timings around a barrier if they are significant (full detail at >= 0.1 s, otherwise only the max)
+  if (after_max >= 0.1) {
+    os << (after_max > 1.0 ? KLRED : "") << " barrier " << after_min << "/" << after_elapsed << "/" << after_sum / rank_n() << "/"
+       << after_max << " s, " << (after_max > 0.0 ? after_sum / rank_n() / after_max : 0.0) << (after_max > 1.0 ? KLCYAN : "");
+  } else if (after_max > 0.0) {
+    os << std::setprecision(2) << std::fixed;
+    os << " barrier " << after_max << " s";
+    os << std::setprecision(1) << std::fixed;
+  }
+
+  auto &count_max = count_msm.max;
+  auto &count_min = count_msm.min;
+  auto &count_sum = count_msm.sum;
+  // print the max_count if it is more than 1 or more than 0 if asked to print the count
+  if (count_max > (print_count ? 0.0 : 1.00001))
+    os << " count " << count_min << "/" << my_count << "/" << count_sum / rank_n() << "/" << count_max << ", "
+       << (count_max > 0.0 ? count_sum / rank_n() / count_max : 0.0);
+
+  auto &instance_max = instance_msm.max;
+  auto &instance_min = instance_msm.min;
+  auto &instance_sum = instance_msm.sum;
+  // print the instances if it is both non-zero and not 1 per rank (the two casts tolerate rounding either way)
+  if (instance_sum > 0 && ((int)(instance_sum + 0.01)) != rank_n() && ((int)(instance_sum + 0.99)) != rank_n())
+    os << " inst " << instance_min << "/" << my_instance << "/" << instance_sum / rank_n() << "/" << instance_max << ", "
+       << (instance_max > 0.0 ? instance_sum / rank_n() / instance_max : 0.0);
+  // print the reduction timings if they are significant (> 0.05 s; highlighted above 0.5 s)
+  if (reduction_elapsed > 0.05)
+    os << (reduction_elapsed > .5 ? KLRED : "") << " reduct " << reduction_elapsed << (reduction_elapsed > .5 ? KLCYAN : "");
+  return os.str();
+}
+
+void Timings::set_before(Timings &timings, size_t count, double elapsed, size_t instances) {
+  DBG_VERBOSE("set_before: my_count=", count, " my_elapsed=", elapsed, " instances=", instances, "\n");
+  timings.before = std::chrono::high_resolution_clock::now();  // reference time that set_after() measures from
+  // record this rank's local values; presumably reset() seeds each statistic with them - TODO confirm
+  timings.my_count = count;
+  timings.count_msm.reset(timings.my_count);
+
+  timings.before_elapsed = elapsed;
+  timings.before_msm.reset(elapsed);
+
+  timings.my_instance = instances;
+  timings.instance_msm.reset(instances);
+}
+
+// timings must remain in scope until the returned future is ready (the continuation below writes into it)
+future<> Timings::set_after(const upcxx::team &team, Timings &timings,
+                            std::chrono::time_point<std::chrono::high_resolution_clock> t_after) {
+  timings.after = t_after;
+  duration_seconds interval = timings.after - timings.before;
+  timings.after_elapsed = interval.count();
+  timings.after_msm.reset(timings.after_elapsed);
+  DBG_VERBOSE("set_after: ", interval.count(), "\n");
+
+  // time the reductions
+  timings.t = t_after;
+
+  assert(&timings.instance_msm == &timings.before_msm + 3);  // memory is in order
+  auto fut_msms = min_sum_max_reduce_all(&timings.before_msm, &timings.before_msm, 4, team);  // 4 adjacent stats, in place
+  auto ret = fut_msms.then([&timings]() {
+    duration_seconds interval = std::chrono::high_resolution_clock::now() - timings.t;
+    timings.reduction_elapsed = interval.count();
+    DBG_VERBOSE("Finished reductions:, ", interval.count(), "\n");
+  });
+  // set_pending() already combines ret with the current pending future, so ret is passed on its own
+  set_pending(ret);
+  return ret;
+}
+
+// blocking variant: a barrier on the team followed by the reduction, both completed before returning
+Timings Timings::barrier(const upcxx::team &team, size_t count, double elapsed, size_t instances) {
+  DBG("Timings::barrier(", count, ", ", elapsed, ", ", instances, ")\n");
+  Timings measured;
+  set_before(measured, count, elapsed, instances);
+  upcxx::barrier(team);
+  progress();  // make progress explicitly after the barrier, in case the barrier itself was already ready()
+  future<> reduction_done = set_after(team, measured);
+  wait_pending();  // set_after() queued the reduction, so this waits for it too
+  assert(reduction_done.is_ready());
+  return measured;
+}
+
+void Timings::print_barrier_timings(const upcxx::team &team, string label) {
+  const Timings measured = barrier(team, 0, 0, 0);
+  wait_pending();  // barrier() has already waited once; the call is kept so the sequence is unchanged
+  SLOG_VERBOSE(KLCYAN, "Timing ", label, ":", measured.to_string(), KNORM, "\n");
+}
+
+// no barrier but a future reduction is started; the result becomes ready with the reduction and the pending queue
+future<ShTimings> Timings::reduce(const upcxx::team &team, size_t count, double elapsed, size_t instances) {
+  DBG("Timings::reduce(", count, ", ", elapsed, ", ", instances, ")\n");
+  auto timings = make_shared<Timings>();  // heap-allocated because set_after() writes into it asynchronously
+  set_before(*timings, count, elapsed, instances);
+  auto future_reduction = set_after(team, *timings, timings->before);  // after == before, so no barrier info will be output
+  // NOTE(review): only the returned future owns the shared_ptr - presumably callers must keep it until ready; confirm
+  return when_all(make_future(timings), future_reduction, get_pending());
+}
+
+void Timings::print_reduce_timings(const upcxx::team &team, string label) {
+  future<ShTimings> reduced = reduce(team, 0, 0, 0);
+  // log once the reduction has completed, and queue the logging future so wait_pending() covers it
+  set_pending(when_all(reduced, get_pending()).then([label = std::move(label)](ShTimings shared_timings) {
+    SLOG_VERBOSE(KLCYAN, "Timing ", label, ": ", shared_timings->to_string(), "\n", KNORM);
+  }));
+}
+
+//
+// BaseTimer
+//
+
+size_t &BaseTimer::instance_count() {
+  static size_t counter = 0;  // function-local static behind the three instance helpers
+  return counter;
+}
+
+void BaseTimer::increment_instance() { instance_count() += 1; }
+void BaseTimer::decrement_instance() { instance_count() -= 1; }
+size_t BaseTimer::get_instance_count() { return instance_count(); }  // current value of the shared counter
+
+BaseTimer::BaseTimer()
+    : t()
+    , name()
+    , t_elapsed(0.0)
+    , count(0) {}
+
+BaseTimer::BaseTimer(const string &_name)
+    : t()
+    , name(_name)
+    , t_elapsed(0.0)
+    , count(0) {}
+
+BaseTimer::~BaseTimer() {}
+
+void BaseTimer::clear() {
+  count = 0;
+  t_elapsed = 0.0;
+  t = timepoint_t{};  // the default time point is the value start() expects before it may run
+}
+
+void BaseTimer::start() {
+  assert(t == timepoint_t{});  // starting twice without stop() or clear() is a usage error
+  t = now();
+}
+
+void BaseTimer::stop() {
+  const double interval_secs = get_elapsed_since_start();  // asserts that start() was called
+  t = timepoint_t{};                                       // back to the default, i.e. not running
+  // DBG("stop(", name, ", inst=", get_instance_count(), "): ", interval_secs, " s, ", now_str(), "\n");
+  t_elapsed += interval_secs;
+  ++count;
+}
+
+double BaseTimer::get_elapsed() const { return t_elapsed; }  // total of all completed start()/stop() intervals
+
+double BaseTimer::get_elapsed_since_start() const {
+  assert(t != timepoint_t());  // only meaningful while the timer is running
+  const duration_seconds since_start = now() - t;
+  return since_start.count();
+}
+
+size_t BaseTimer::get_count() const { return count; }  // number of completed stop() calls since construction or clear()
+
+const string &BaseTimer::get_name() const { return name; }
+
+void BaseTimer::done() const {  // logs this rank's accumulated time; no communication is started here
+  assert(t == timepoint_t());  // the timer must be stopped
+  SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", std::setprecision(2), std::fixed, t_elapsed, " s ", KNORM, "\n");
+  DBG(name, " took ", std::setprecision(2), std::fixed, t_elapsed, " s ", "\n");
+}
+
+future<MinSumMax<double>> BaseTimer::done_all_async(const upcxx::team &tm) const {
+  assert(t == timepoint_t());
+  auto fut_stats = upcxx_utils::min_sum_max_reduce_one(t_elapsed, 0, tm);
+  DBG(name, " took ", t_elapsed, " \n");
+  // the name is captured by value so the callback does not refer back to this timer
+  fut_stats = fut_stats.then([label = name](MinSumMax<double> stats) {
+    SLOG_VERBOSE(KLCYAN, "Timing ", label, ": ", stats, KNORM, "\n");
+    return stats;
+  });
+  Timings::set_pending(fut_stats.then([](MinSumMax<double>) {}));
+  return fut_stats;
+}
+void BaseTimer::done_all(const upcxx::team &tm) const { done_all_async(tm).wait(); }
+
+string BaseTimer::get_final() const {
+  ostringstream summary;
+  summary << name << ": " << std::setprecision(2) << std::fixed << t_elapsed << " s";
+  if (count > 1) summary << " " << count << " count";  // the count is shown only when there was more than one interval
+  return summary.str();
+}
+
+future<MinSumMax<double>> BaseTimer::reduce_timepoint(const upcxx::team &team, timepoint_t timepoint) {
+  duration_seconds secs = timepoint.time_since_epoch();
+  DBG_VERBOSE("reduce_timepoint ", secs.count(), " since epoch\n");
+  future<MinSumMax<double>> fut_msm = min_sum_max_reduce_one<double>(secs.count(), 0, team);
+  return fut_msm.then([&team](MinSumMax<double> msm) {
+    // team is captured by reference, so it has to outlive the returned future
+    if (team.rank_me()) return msm;  // only team rank 0 rebases the values below
+    // translate to seconds since the first rank entered
+    msm.my = msm.my - msm.min;
+    msm.max = msm.max - msm.min;
+    msm.sum = msm.sum - msm.min * team.rank_n();
+    msm.min = 0.0;
+    msm.apply_avg(team);
+    return msm;
+  });
+}
+
+future<ShTimings> BaseTimer::reduce_timings(const upcxx::team &team, size_t my_instances) const {
+  return reduce_timings(team, count, t_elapsed, my_instances);
+}
+
+future<ShTimings> BaseTimer::reduce_timings(const upcxx::team &team, size_t my_count, double my_elapsed, size_t my_instances) {
+  return Timings::reduce(team, my_count, my_elapsed, my_instances);
+}
+
+Timings BaseTimer::barrier_timings(const upcxx::team &team, size_t my_instances) const {
+  return barrier_timings(team, count, t_elapsed, my_instances);
+}
+
+Timings BaseTimer::barrier_timings(const upcxx::team &team, size_t my_count, double my_elapsed, size_t my_instances) {
+  return Timings::barrier(team, my_count, my_elapsed, my_instances);
+}
+
+timepoint_t BaseTimer::now() { return std::chrono::high_resolution_clock::now(); }
+
+string BaseTimer::now_str() {
+  std::time_t result = std::time(nullptr);
+  char buffer[100];
+  size_t sz = strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", std::localtime(&result));
+  return string(sz > 0 ? buffer : "BAD TIME");
+}
+
+//
+// StallTimer
+//
+
+StallTimer::StallTimer(const string _name, double _max_seconds, int64_t _max_count)
+    : BaseTimer(_name)
+    , max_seconds(_max_seconds)
+    , max_count(_max_count) {
+  start();
+}
+
+StallTimer::~StallTimer() { stop(); }
+
+void StallTimer::check() {
+  stop();
+  bool print = false;
+  if (max_seconds > 0.0 && t_elapsed > max_seconds) {
+    print = true;
+  } else if (max_count > 0 && count > max_count) {
+    print = true;
+  }
+  if (print) {
+    WARN("StallTimer - ", name, " on ", rank_me(), " stalled for ", t_elapsed, " s and ", count, " iterations\n");
+    max_seconds *= 2.0;
+    max_count *= 2;
+  }
+  start();
+}
+
+//
+// IntermittentTimer
+//
+
+IntermittentTimer::IntermittentTimer(const string &_name, string _interval_label)
+    : BaseTimer(_name)
+    , t_interval(0.0)
+    , interval_label(_interval_label) {}
+
+IntermittentTimer::~IntermittentTimer() {}
+
+void IntermittentTimer::clear() {
+  ((BaseTimer *)this)->clear();
+  t_interval = 0.0;
+  interval_label = "";
+}
+
+void IntermittentTimer::start_interval() { t_interval = get_elapsed_since_start(); }
+
+void IntermittentTimer::stop_interval() {
+  t_interval = get_elapsed_since_start() - t_interval;
+  if (!interval_label.empty()) {
+    ostringstream oss;
+    oss << KBLUE << std::left << std::setw(40) << interval_label << std::setprecision(2) << std::fixed << t_interval << " s"
+        << KNORM << "\n";
+    SLOG(oss.str());
+  }
+}
+
+void IntermittentTimer::print_out(const upcxx::team &tm) {
+  future<ShTimings> fut_shptr_timings = reduce_timings(tm);
+  auto fut =
+      when_all(Timings::get_pending(), fut_shptr_timings).then([&name = this->name, &count = this->count](ShTimings shptr_timings) {
+        if (shptr_timings->count_msm.max > 0.0)
+          SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", count, " intervals, ", shptr_timings->to_string(true), "\n", KNORM);
+      });
+  Timings::set_pending(fut);
+  count = 0;
+  t_elapsed = 0.0;
+}
+
+//
+// ProgressTimer
+//
+
+ProgressTimer::ProgressTimer(const string &_name)
+    : BaseTimer(_name)
+    , calls(0) {}
+
+ProgressTimer::~ProgressTimer() {}
+
+void ProgressTimer::clear() {
+  ((BaseTimer *)this)->clear();
+  calls = 0;
+}
+
+void ProgressTimer::progress(size_t run_every) {
+  if (run_every > 1 && ++calls % run_every != 0) return;
+  start();
+  upcxx::progress();
+  stop();
+  // DBG("ProgressTimer(", name, ") - ", t_elapsed, "\n");
+}
+
+void ProgressTimer::discharge(size_t run_every) {
+  if (run_every != 1 && ++calls % run_every != 0) return;
+  start();
+  upcxx::discharge();
+  upcxx::progress();
+  stop();
+  // DBG("ProgressTimer(", name, ").discharge() - ", t_elapsed, "\n");
+}
+
+void ProgressTimer::print_out(const upcxx::team &tm) {
+  future<ShTimings> fut_shptr_timings = reduce_timings(tm);
+  auto fut = when_all(Timings::get_pending(), fut_shptr_timings).then([&name = this->name](ShTimings shptr_timings) {
+    if (shptr_timings->count_msm.max > 0.0)
+      SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", shptr_timings->to_string(true), KNORM, "\n");
+  });
+  Timings::set_pending(fut);
+  count = 0;
+  t_elapsed = 0.0;
+}
+
+//
+// Timer
+//
+Timer::Timer(const upcxx::team &tm, const string &_name, bool exit_reduction)
+    : tm(tm)
+    , exited(exit_reduction)
+    , logged(false)
+    , BaseTimer(_name) {
+  init();
+}
+Timer::Timer(const string &_name, bool exit_reduction)
+    : tm(upcxx::world())
+    , exited(exit_reduction)
+    , logged(false)
+    , BaseTimer(_name) {
+  init();
+}
+void Timer::init() {
+  increment_instance();
+  auto fut = when_all(Timings::get_pending(), make_future(now_str())).then([name = this->name](string now) {});
+  Timings::set_pending(fut);
+  start();
+}
+Timer::Timer(Timer &&move)
+    : tm(move.tm)
+    , exited(move.exited)
+    , BaseTimer((BaseTimer &)move) {
+  move.exited = true;
+  move.logged = true;
+}
+Timer &Timer::operator=(Timer &&move) {
+  Timer mv(std::move(move));
+  std::swap(*this, mv);
+  return *this;
+}
+
+Timer::~Timer() {
+  if (!exited)
+    initiate_exit_reduction();
+  else if (!logged) {
+    stop();
+    LOG(KLCYAN, "Timing ", name, ":", get_elapsed(), KNORM, "\n");
+  }
+}
+
+future<> Timer::initiate_entrance_reduction() {
+  DBG_VERBOSE("Tracking entrance of ", name, "\n");
+  auto fut_msm = reduce_timepoint(tm, now());
+
+  auto fut = when_all(Timings::get_pending(), fut_msm).then([name = this->name](MinSumMax<double> msm) {
+    DBG_VERBOSE("got reduction: ", msm.to_string(), "\n");
+    SLOG_VERBOSE(KLCYAN, "Timing (entrance) ", name, ":", msm.to_string(), KNORM, "\n");
+  });
+  Timings::set_pending(fut);
+  return fut;
+}
+
+future<> Timer::initiate_exit_reduction() {
+  stop();
+  future<ShTimings> fut_shptr_timings = reduce_timings(tm);
+  auto fut = when_all(Timings::get_pending(), fut_shptr_timings).then([name = this->name](ShTimings shptr_timings) {
+    SLOG_VERBOSE(KLCYAN, "Timing ", name, " exit: ", shptr_timings->to_string(), KNORM, "\n");
+  });
+  Timings::set_pending(fut);
+  decrement_instance();
+  exited = true;
+  logged = true;
+  return fut;
+}
+
+//
+// BarrierTimer
+//
+
+BarrierTimer::BarrierTimer(const upcxx::team &team, const string _name, bool _entrance_barrier, bool _exit_barrier)
+    : _team(team)
+    , exit_barrier(_exit_barrier)
+    , exited(false)
+    , BaseTimer(_name) {
+  init(_entrance_barrier);
+}
+BarrierTimer::BarrierTimer(const string _name, bool _entrance_barrier, bool _exit_barrier)
+    : _team(upcxx::world())
+    , exit_barrier(_exit_barrier)
+    , exited(false)
+    , BaseTimer(_name) {
+  init(_entrance_barrier);
+}
+
+future<> BarrierTimer::init(bool _entrance_barrier) {
+  increment_instance();
+  if (!_entrance_barrier && !exit_barrier) SLOG_VERBOSE("Why are we using a BarrierTimer without any barriers???\n");
+  future<> fut;
+  DBG("Entering BarrierTimer ", name, "\n");
+  if (_entrance_barrier) {
+    fut = when_all(Timings::get_pending(), make_future(now_str())).then([&name = this->name](string now) {
+      // SLOG_VERBOSE(KLCYAN, "Timing ", name, ":  (entering barrier) ", KNORM);
+    });
+    Timings::set_pending(fut);
+    auto timings = barrier_timings(_team);
+    Timings::wait_pending();  // should be noop
+    SLOG_VERBOSE(KLCYAN, "Timing (entrance barrier) ", name, ": ", timings.to_string(), KNORM, "\n");
+  } else {
+    fut = when_all(Timings::get_pending(), make_future(now_str())).then([&name = this->name](string now) {});
+    Timings::set_pending(fut);
+  }
+  start();
+  return fut;
+}
+
+BarrierTimer::~BarrierTimer() {
+  if (!exited) initate_exit_barrier().wait();
+}
+future<> BarrierTimer::initate_exit_barrier() {
+  stop();
+  future<> fut;
+  DBG("Exiting BarrierTimer ", name, "\n");
+  if (exit_barrier) {
+    fut = when_all(Timings::get_pending(), make_future(now_str())).then([name = this->name](string now) {});
+    Timings::set_pending(fut);
+    auto timings = barrier_timings(_team);
+    Timings::wait_pending();
+    SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", timings.to_string(), KNORM, "\n");
+  } else {
+    future<ShTimings> fut_shptr_timings = reduce_timings(_team);
+    fut = when_all(Timings::get_pending(), fut_shptr_timings).then([name = this->name](ShTimings shptr_timings) {
+      SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", shptr_timings->to_string(), KNORM, "\n");
+    });
+    Timings::set_pending(fut);
+  }
+  decrement_instance();
+  exited = true;
+  return fut;
+}
+
+//
+// AsyncTimer
+//
+
+_AsyncTimer::_AsyncTimer(const upcxx::team &tm, const string &name)
+    : BaseTimer(name)
+    , tm(tm)
+    , construct_t(BaseTimer::now())
+    , start_t{} {}
+void _AsyncTimer::start() {
+  start_t = now();
+  ((BaseTimer *)this)->start();
+}
+void _AsyncTimer::stop() { ((BaseTimer *)this)->stop(); }
+void _AsyncTimer::report(const string label, MinSumMax<double> msm) {
+  SLOG_VERBOSE(KLCYAN, "Timing ", name, " ", label, ":", msm.to_string(), KNORM, "\n");
+}
+
+future<> _AsyncTimer::initiate_construct_reduction() {
+  auto fut_msm = BaseTimer::reduce_timepoint(tm, construct_t);
+  auto fut = when_all(Timings::get_pending(), fut_msm).then([this](MinSumMax<double> msm) { this->report("construct", msm); });
+  Timings::set_pending(fut);
+  return fut;
+}
+future<> _AsyncTimer::initiate_start_reduction() {
+  auto fut_msm = BaseTimer::reduce_timepoint(tm, start_t);
+  auto fut = when_all(Timings::get_pending(), fut_msm).then([this](MinSumMax<double> msm) { this->report("start", msm); });
+  Timings::set_pending(fut);
+  return fut;
+}
+future<> _AsyncTimer::initiate_stop_reduction() {
+  auto fut_msm = Timings::reduce(tm, 1, get_elapsed(), 1);
+  auto fut = when_all(Timings::get_pending(), fut_msm).then([this](ShTimings sh_timings) {
+    this->report("stop", sh_timings->before_elapsed);
+  });
+  Timings::set_pending(fut);
+  return fut;
+}
+
+AsyncTimer::AsyncTimer(const upcxx::team &tm, const string &name)
+    : timer(make_shared<_AsyncTimer>(tm, name)) {}
+AsyncTimer::AsyncTimer(const string &name)
+    : timer(make_shared<_AsyncTimer>(upcxx::world(), name)) {}
+void AsyncTimer::start() const { timer->start(); }
+void AsyncTimer::stop() const {
+  timer->stop();
+  LOG(timer->get_name(), " completed in ", timer->get_elapsed(), " s\n");
+}
+double AsyncTimer::get_elapsed() const { return timer->get_elapsed(); }
+future<> AsyncTimer::initiate_construct_reduction() {
+  return timer->initiate_construct_reduction().then([timer = this->timer]() {
+    // keep timer alive
+  });
+}
+future<> AsyncTimer::initiate_start_reduction() {
+  return timer->initiate_start_reduction().then([timer = this->timer]() {
+    // keep timer alive
+  });
+}
+future<> AsyncTimer::initiate_stop_reduction() {
+  return timer->initiate_stop_reduction().then([timer = this->timer]() {
+    // keep timer alive
+  });
+}
+
+//
+// ActiveCountTimer
+//
+
+ActiveCountTimer::ActiveCountTimer(const string _name)
+    : total_elapsed(0.0)
+    , total_count(0)
+    , active_count(0)
+    , max_active(0)
+    , name(_name)
+    , my_fut(make_future()) {}
+
+ActiveCountTimer::~ActiveCountTimer() {
+  if (upcxx::initialized()) my_fut.wait();  // keep alive until all futures have finished
+}
+
+void ActiveCountTimer::clear() {
+  total_elapsed = 0.0;
+  total_count = 0;
+  active_count = 0;
+  max_active = 0;
+}
+
+timepoint_t ActiveCountTimer::begin() {
+  active_count++;
+  if (max_active < active_count) max_active = active_count;
+  return BaseTimer::now();
+}
+
+void ActiveCountTimer::end(timepoint_t t) {
+  duration_seconds interval = BaseTimer::now() - t;
+  active_count--;
+  total_count++;
+  total_elapsed += interval.count();
+}
+
+void ActiveCountTimer::print_barrier_timings(const upcxx::team &team, string label) {
+  Timings timings = BaseTimer::barrier_timings(team, total_count, total_elapsed, max_active);
+  clear();
+  Timings::wait_pending();
+  print_timings(timings, label);
+}
+
+void ActiveCountTimer::print_reduce_timings(const upcxx::team &team, string label) {
+  label = name + label;
+  auto fut_timings = BaseTimer::reduce_timings(team, total_count, total_elapsed, max_active);
+  auto _this = this;
+  auto fut_clear = fut_timings.then([_this](ShTimings ignored) { _this->clear(); });
+  auto fut = when_all(Timings::get_pending(), fut_timings, fut_clear).then([_this, label](ShTimings shptr_timings) {
+    _this->print_timings(*shptr_timings, label);
+  });
+  my_fut = when_all(fut_clear, my_fut, fut);  // keep this in scope until clear has been called...
+  Timings::set_pending(my_fut);
+}
+
+void ActiveCountTimer::print_timings(Timings &timings, string label) {
+  label = name + label;
+  DBG_VERBOSE(__func__, " label=", label, "\n");
+  if (active_count > 0)
+    SWARN("print_timings on ActiveCountTimer '", label, "' called while ", active_count, " (max ", max_active,
+          ") are still active\n");
+  if (timings.count_msm.max > 0.0) {
+    SLOG_VERBOSE(KLCYAN, "Timing instances of ", label, ": ",
+                 (timings.count_msm.max > 0.0 ? timings.to_string(true) : string("(none)")), KNORM, "\n");
+  }
+}
+
+ActiveCountTimer _GenericActiveCountTimer("_upcxx_dummy");
+GenericInstantiationTimer _GenericInstantiationTimer(_GenericActiveCountTimer);
+template class ActiveInstantiationTimer<_upcxx_utils_dummy>;
+
+SingletonInstantiationTimer _SingletonInstantiationTimer();
+template class InstantiationTimer<_upcxx_utils_dummy>;
+
+};  // namespace upcxx_utils
diff --git a/upcxx-utils/test/test_ofstream.cpp b/upcxx-utils/test/test_ofstream.cpp
index adfcd77..8c3183d 100644
--- a/upcxx-utils/test/test_ofstream.cpp
+++ b/upcxx-utils/test/test_ofstream.cpp
@@ -434,7 +434,7 @@ int run_large_test(int argc, char **argv) {
     future<> fut;
     if (i % 2 == 1) {
       fut = f.close_async();
-      assert(!fut.ready());
+      assert(!fut.is_ready());
       fut = fut.then([t3]() { t3->stop(); });
     } else {
       fut = make_future();
diff --git a/upcxx-utils/test/test_promise_collectives.cpp b/upcxx-utils/test/test_promise_collectives.cpp
index db062e7..f9c3129 100644
--- a/upcxx-utils/test/test_promise_collectives.cpp
+++ b/upcxx-utils/test/test_promise_collectives.cpp
@@ -34,7 +34,7 @@ int test_promise_barrier(int argc, char **argv) {
   assert(roundup_log2(17) == 5);
   {
     PromiseBarrier pb;
-    assert(!pb.get_future().ready());
+    assert(!pb.get_future().is_ready());
     pb.fulfill();
     pb.get_future().wait();
   }
@@ -44,10 +44,10 @@ int test_promise_barrier(int argc, char **argv) {
     barrier();
     PromiseBarrier pb1, pb2;
     barrier();
-    assert(!pb1.get_future().ready());
+    assert(!pb1.get_future().is_ready());
     pb1.fulfill();
     barrier();
-    assert(!pb2.get_future().ready());
+    assert(!pb2.get_future().is_ready());
     pb2.fulfill();
     barrier();
     pb1.get_future().wait();
@@ -60,12 +60,12 @@ int test_promise_barrier(int argc, char **argv) {
     barrier();
     PromiseBarrier pb1, pb2;
     barrier();
-    assert(!pb1.get_future().ready());
+    assert(!pb1.get_future().is_ready());
     pb1.fulfill();
     barrier();
     pb1.get_future().wait();
     barrier();
-    assert(!pb2.get_future().ready());
+    assert(!pb2.get_future().is_ready());
     pb2.fulfill();
     barrier();
     pb2.get_future().wait();
@@ -76,10 +76,10 @@ int test_promise_barrier(int argc, char **argv) {
     barrier();
     PromiseBarrier pb1, pb2;
     barrier();
-    assert(!pb1.get_future().ready());
+    assert(!pb1.get_future().is_ready());
     pb1.fulfill();
     barrier();
-    assert(!pb2.get_future().ready());
+    assert(!pb2.get_future().is_ready());
     pb2.fulfill();
     barrier();
     pb2.get_future().wait();
@@ -93,10 +93,10 @@ int test_promise_barrier(int argc, char **argv) {
     barrier();
     PromiseBarrier pb1, pb2;
     barrier();
-    assert(!pb2.get_future().ready());
+    assert(!pb2.get_future().is_ready());
     pb2.fulfill();
     barrier();
-    assert(!pb1.get_future().ready());
+    assert(!pb1.get_future().is_ready());
     pb1.fulfill();
     barrier();
     pb2.get_future().wait();
@@ -109,12 +109,12 @@ int test_promise_barrier(int argc, char **argv) {
     barrier();
     PromiseBarrier pb1, pb2;
     barrier();
-    assert(!pb2.get_future().ready());
+    assert(!pb2.get_future().is_ready());
     pb2.fulfill();
     barrier();
     pb2.get_future().wait();
     barrier();
-    assert(!pb1.get_future().ready());
+    assert(!pb1.get_future().is_ready());
     pb1.fulfill();
     barrier();
     pb1.get_future().wait();
@@ -125,10 +125,10 @@ int test_promise_barrier(int argc, char **argv) {
     barrier();
     PromiseBarrier pb1, pb2;
     barrier();
-    assert(!pb2.get_future().ready());
+    assert(!pb2.get_future().is_ready());
     pb2.fulfill();
     barrier();
-    assert(!pb1.get_future().ready());
+    assert(!pb1.get_future().is_ready());
     pb1.fulfill();
     barrier();
     pb1.get_future().wait();
@@ -147,13 +147,13 @@ int test_promise_barrier(int argc, char **argv) {
     for (int i = 0; i < iterations; i++) {
       fulfill_order[i] = i;
       wait_order[i] = i;
-      assert(!pbs[i].get_future().ready());
+      assert(!pbs[i].get_future().is_ready());
     }
     std::shuffle(fulfill_order.begin(), fulfill_order.end(), g);
     barrier();
     // initiate all
     for (int i = 0; i < iterations; i++) {
-      assert(!pbs[fulfill_order[i]].get_future().ready());
+      assert(!pbs[fulfill_order[i]].get_future().is_ready());
       pbs[fulfill_order[i]].fulfill();
     }
     // wait all
@@ -175,14 +175,14 @@ int test_promise_barrier(int argc, char **argv) {
     for (int i = 0; i < iterations; i++) {
       fulfill_order[i] = i;
       wait_order[i] = i;
-      assert(!pbs[i].get_future().ready());
+      assert(!pbs[i].get_future().is_ready());
     }
     std::shuffle(fulfill_order.begin(), fulfill_order.end(), g);
     std::shuffle(wait_order.begin(), wait_order.end(), g);
     barrier();
     // initiate all
     for (int i = 0; i < iterations; i++) {
-      assert(!pbs[fulfill_order[i]].get_future().ready());
+      assert(!pbs[fulfill_order[i]].get_future().is_ready());
       pbs[fulfill_order[i]].fulfill();
     }
     barrier();
@@ -210,7 +210,7 @@ int test_promise_barrier(int argc, char **argv) {
     barrier();
     // initiate all
     for (int i = 0; i < iterations; i++) {
-      assert(!pbs[fulfill_order[i]].get_future().ready());
+      assert(!pbs[fulfill_order[i]].get_future().is_ready());
       pbs[fulfill_order[i]].fulfill();
     }
     barrier();
@@ -236,7 +236,7 @@ int test_promise_barrier(int argc, char **argv) {
     barrier();
     // initiate all
     for (int i = 0; i < iterations; i++) {
-      assert(!pbs[fulfill_order[i]].get_future().ready());
+      assert(!pbs[fulfill_order[i]].get_future().is_ready());
       pbs[fulfill_order[i]].fulfill();
     }
     barrier();
@@ -261,7 +261,7 @@ int test_promise_barrier(int argc, char **argv) {
     barrier();
     // initiate all
     for (int i = 0; i < iterations; i++) {
-      assert(!pbs[fulfill_order[i]].get_future().ready());
+      assert(!pbs[fulfill_order[i]].get_future().is_ready());
       pbs[fulfill_order[i]].fulfill();
       pbs[wait_order[i]].get_future().wait();
     }
@@ -285,7 +285,7 @@ int test_promise_barrier(int argc, char **argv) {
     // initiate all
     future<> all_fut = make_future();
     for (int i = 0; i < iterations; i++) {
-      assert(!pbs[fulfill_order[i]].get_future().ready());
+      assert(!pbs[fulfill_order[i]].get_future().is_ready());
       pbs[fulfill_order[i]].fulfill();
       auto fut = pbs[wait_order[i]].get_future();
       all_fut = when_all(all_fut, fut);
@@ -310,7 +310,7 @@ int test_promise_barrier(int argc, char **argv) {
     // initiate all
     future<> all_fut = make_future();
     for (int i = 0; i < iterations; i++) {
-      assert(!pbs[fulfill_order[i]].get_future().ready());
+      assert(!pbs[fulfill_order[i]].get_future().is_ready());
       pbs[fulfill_order[i]].fulfill();
       auto fut = pbs[wait_order[i]].get_future();
       all_fut = when_all(all_fut, fut);

From 80c29547bc0f061d127b0a326d4d3a3bf2e896ad Mon Sep 17 00:00:00 2001
From: Jan Ciesko <jan.ciesko@gmail.com>
Date: Mon, 18 Dec 2023 16:10:38 -0700
Subject: [PATCH 02/13] Check if cmake variable is empty before applying string
 op

---
 cmake/Modules/CheckSubmodules.cmake | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cmake/Modules/CheckSubmodules.cmake b/cmake/Modules/CheckSubmodules.cmake
index 50c5583..13fe2c7 100644
--- a/cmake/Modules/CheckSubmodules.cmake
+++ b/cmake/Modules/CheckSubmodules.cmake
@@ -15,8 +15,9 @@ if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git")
         if(NOT GIT_SUBMOD_RESULT EQUAL "0")
             message(FATAL_ERROR "git submodule failed with ${GIT_SUBMOD_RESULT}, please checkout submodules")
         endif()
-
-        string(REPLACE "\n" ";" SUBMOD_LIST ${GIT_SUBMOD_OUTPUT})
+        if(NOT GIT_SUBMOD_OUTPUT STREQUAL "")
+	    string(REPLACE "\n" ";" SUBMOD_LIST ${GIT_SUBMOD_OUTPUT})
+        endif()
         set(UPDATE_SUBMODULES "")
         foreach(tmp ${EXPECTED_SUBMODULES})
            set(IS_SUB_OK FALSE)

From a8223305916231b92197c848035febef42f0fbae Mon Sep 17 00:00:00 2001
From: Jan Ciesko <jan.ciesko@gmail.com>
Date: Mon, 18 Dec 2023 16:23:27 -0700
Subject: [PATCH 03/13] Add missing header

---
 upcxx-utils/include/upcxx_utils/colors.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/upcxx-utils/include/upcxx_utils/colors.h b/upcxx-utils/include/upcxx_utils/colors.h
index f0170c3..9fbee40 100644
--- a/upcxx-utils/include/upcxx_utils/colors.h
+++ b/upcxx-utils/include/upcxx_utils/colors.h
@@ -24,6 +24,9 @@
  *
  */
 
+#include <array>
+#include <string>
+
 #ifdef CONFIG_USE_COLORS
 
 #define KNORM "\x1B[0m"

From 98169a7a8da14cb4f473a33cf720384561e42489 Mon Sep 17 00:00:00 2001
From: Jan Ciesko <jan.ciesko@gmail.com>
Date: Mon, 18 Dec 2023 16:26:19 -0700
Subject: [PATCH 04/13] Exclude test in proxy version of mhm2

---
 test/CMakeLists.txt        |   2 +
 upcxx-utils/src/timers.cpp | 698 -------------------------------------
 2 files changed, 2 insertions(+), 698 deletions(-)
 delete mode 100644 upcxx-utils/src/timers.cpp

diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a4516b2..a95e4ad 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -21,6 +21,8 @@ file(
   LIST_DIRECTORIES false
   *.hpp *.cpp)
 
+list(FILTER TEST_SOURCES EXCLUDE REGEX "ssw-test\\.cpp$")
+
 set(SOURCES ${TEST_SOURCES})
 
 include_directories("${CMAKE_SOURCE_DIR}/src")
diff --git a/upcxx-utils/src/timers.cpp b/upcxx-utils/src/timers.cpp
deleted file mode 100644
index 467c4b0..0000000
--- a/upcxx-utils/src/timers.cpp
+++ /dev/null
@@ -1,698 +0,0 @@
-#include <chrono>
-#include <ctime>
-#include <iomanip>
-#include <upcxx/upcxx.hpp>
-
-#define _TIMERS_CPP
-#include "upcxx_utils/timers.hpp"
-
-using upcxx::future;
-
-namespace upcxx_utils {
-
-// Reduce compile time by making templates instantiations of common types
-// these are each constructed in CMakeLists.txt and timers-extern-template.in.cpp
-// extern templates declarations all happen in timers.hpp
-
-/*
- * This is now handled by CMakeLists.txt
- *
-   MACRO_MIN_SUM_MAX(float,    template);
-   MACRO_MIN_SUM_MAX(double,   template);
-   MACRO_MIN_SUM_MAX(int64_t,  template);
-   MACRO_MIN_SUM_MAX(uint64_t, template);
-   MACRO_MIN_SUM_MAX(int,      template);
-
- */
-
-//
-// Timings
-//
-
-future<> &Timings::get_last_pending() {
-  static future<> _ = make_future();
-  return _;
-}
-
-Timings::Timings()
-    : t()
-    , before_elapsed(0.0)
-    , after_elapsed(0.0)
-    , reduction_elapsed(0.0)
-    , my_count(0)
-    , my_instance(0) {}
-
-future<> Timings::get_pending() { return get_last_pending(); }
-
-void Timings::set_pending(future<> fut) { get_last_pending() = when_all(get_last_pending(), fut); }
-
-void Timings::wait_pending() {
-  DBG_VERBOSE(__func__, "\n");
-  if (upcxx::initialized()) {
-    get_last_pending().wait();
-    get_last_pending() = make_future();
-  }
-}
-
-string Timings::to_string(bool print_count, bool print_label) const {
-  ostringstream os;
-  if (print_label) os << "(min/my/avg/max, bal) ";
-  os << std::setprecision(2) << std::fixed;
-  // print the timing metrics
-  auto &before_max = before_msm.max;
-  auto &before_min = before_msm.min;
-  auto &before_sum = before_msm.sum;
-  if (before_max > 0.0) {
-    double bal = (before_max > 0.0 ? before_sum / rank_n() / before_max : 1.0);
-    if (before_max > 10.0 && bal < .9) os << KLRED;  // highlight large imbalances
-    os << before_min << "/" << before_elapsed << "/" << before_sum / rank_n() << "/" << before_max << " s, " << bal;
-    if (before_max > 1.0 && bal < .9) os << KLCYAN;
-  } else {
-    os << "0/0/0/0 s, 1.00";
-  }
-
-  os << std::setprecision(1) << std::fixed;
-
-  auto &after_max = after_msm.max;
-  auto &after_min = after_msm.min;
-  auto &after_sum = after_msm.sum;
-  // print the timings around a barrier if they are significant
-  if (after_max >= 0.1) {
-    os << (after_max > 1.0 ? KLRED : "") << " barrier " << after_min << "/" << after_elapsed << "/" << after_sum / rank_n() << "/"
-       << after_max << " s, " << (after_max > 0.0 ? after_sum / rank_n() / after_max : 0.0) << (after_max > 1.0 ? KLCYAN : "");
-  } else if (after_max > 0.0) {
-    os << std::setprecision(2) << std::fixed;
-    os << " barrier " << after_max << " s";
-    os << std::setprecision(1) << std::fixed;
-  }
-
-  auto &count_max = count_msm.max;
-  auto &count_min = count_msm.min;
-  auto &count_sum = count_msm.sum;
-  // print the max_count if it is more than 1 or more than 0 if asked to print the count
-  if (count_max > (print_count ? 0.0 : 1.00001))
-    os << " count " << count_min << "/" << my_count << "/" << count_sum / rank_n() << "/" << count_max << ", "
-       << (count_max > 0.0 ? count_sum / rank_n() / count_max : 0.0);
-
-  auto &instance_max = instance_msm.max;
-  auto &instance_min = instance_msm.min;
-  auto &instance_sum = instance_msm.sum;
-  // print the instances if it is both non-zero and not 1 per rank
-  if (instance_sum > 0 && ((int)(instance_sum + 0.01)) != rank_n() && ((int)(instance_sum + 0.99)) != rank_n())
-    os << " inst " << instance_min << "/" << my_instance << "/" << instance_sum / rank_n() << "/" << instance_max << ", "
-       << (instance_max > 0.0 ? instance_sum / rank_n() / instance_max : 0.0);
-  // print the reduction timings if they are significant
-  if (reduction_elapsed > 0.05)
-    os << (reduction_elapsed > .5 ? KLRED : "") << " reduct " << reduction_elapsed << (reduction_elapsed > .5 ? KLCYAN : "");
-  return os.str();
-}
-
-void Timings::set_before(Timings &timings, size_t count, double elapsed, size_t instances) {
-  DBG_VERBOSE("set_before: my_count=", count, " my_elapsed=", elapsed, " instances=", instances, "\n");
-  timings.before = std::chrono::high_resolution_clock::now();
-
-  timings.my_count = count;
-  timings.count_msm.reset(timings.my_count);
-
-  timings.before_elapsed = elapsed;
-  timings.before_msm.reset(elapsed);
-
-  timings.my_instance = instances;
-  timings.instance_msm.reset(instances);
-}
-
-// timings must remain in scope until the returened future is ready()
-future<> Timings::set_after(const upcxx::team &team, Timings &timings,
-                            std::chrono::time_point<std::chrono::high_resolution_clock> t_after) {
-  timings.after = t_after;
-  duration_seconds interval = timings.after - timings.before;
-  timings.after_elapsed = interval.count();
-  timings.after_msm.reset(timings.after_elapsed);
-  DBG_VERBOSE("set_after: ", interval.count(), "\n");
-
-  // time the reductions
-  timings.t = t_after;
-
-  assert(&timings.instance_msm == &timings.before_msm + 3);  // memory is in order
-  auto fut_msms = min_sum_max_reduce_all(&timings.before_msm, &timings.before_msm, 4, team);
-  auto ret = fut_msms.then([&timings]() {
-    duration_seconds interval = std::chrono::high_resolution_clock::now() - timings.t;
-    timings.reduction_elapsed = interval.count();
-    DBG_VERBOSE("Finished reductions:, ", interval.count(), "\n");
-  });
-
-  set_pending(when_all(ret, get_pending()));
-  return ret;
-}
-
-// barrier and reduction
-Timings Timings::barrier(const upcxx::team &team, size_t count, double elapsed, size_t instances) {
-  DBG("Timings::barrier(", count, ", ", elapsed, ", ", instances, ")\n");
-  Timings timings;
-  set_before(timings, count, elapsed, instances);
-  upcxx::barrier(team);
-  progress();  // explicitly make progress after the barrier if the barrier itself was already ready()
-  auto fut = set_after(team, timings);
-  wait_pending();
-  assert(fut.is_ready());
-  return timings;
-}
-
-void Timings::print_barrier_timings(const upcxx::team &team, string label) {
-  Timings timings = barrier(team, 0, 0, 0);
-  wait_pending();
-  SLOG_VERBOSE(KLCYAN, "Timing ", label, ":", timings.to_string(), KNORM, "\n");
-}
-
-// no barrier but a future reduction is started
-future<ShTimings> Timings::reduce(const upcxx::team &team, size_t count, double elapsed, size_t instances) {
-  DBG("Timings::reduce(", count, ", ", elapsed, ", ", instances, ")\n");
-  auto timings = make_shared<Timings>();
-  set_before(*timings, count, elapsed, instances);
-  auto future_reduction = set_after(team, *timings, timings->before);  // after == before, so no barrier info will be output
-  return when_all(make_future(timings), future_reduction, get_pending());
-}
-
-void Timings::print_reduce_timings(const upcxx::team &team, string label) {
-  future<ShTimings> fut_timings = reduce(team, 0, 0, 0);
-  auto fut = when_all(fut_timings, get_pending()).then([label = std::move(label)](ShTimings shptr_timings) {
-    SLOG_VERBOSE(KLCYAN, "Timing ", label, ": ", shptr_timings->to_string(), "\n", KNORM);
-  });
-  set_pending(fut);
-}
-
-//
-// BaseTimer
-//
-
-size_t &BaseTimer::instance_count() {
-  static size_t _ = 0;
-  return _;
-}
-
-void BaseTimer::increment_instance() { ++instance_count(); }
-void BaseTimer::decrement_instance() { instance_count()--; }
-size_t BaseTimer::get_instance_count() { return instance_count(); }
-
-BaseTimer::BaseTimer()
-    : t()
-    , name()
-    , t_elapsed(0.0)
-    , count(0) {}
-
-BaseTimer::BaseTimer(const string &_name)
-    : t()
-    , name(_name)
-    , t_elapsed(0.0)
-    , count(0) {}
-
-BaseTimer::~BaseTimer() {}
-
-void BaseTimer::clear() {
-  t = timepoint_t();
-  t_elapsed = 0.0;
-  count = 0;
-}
-
-void BaseTimer::start() {
-  assert(t == timepoint_t());
-  t = now();
-}
-
-void BaseTimer::stop() {
-  double elapsed = get_elapsed_since_start();
-  t = timepoint_t();  // reset to 0
-  // DBG("stop(", name, ", inst=", get_instance_count(), "): ", elapsed, " s, ", now_str(), "\n");
-  t_elapsed += elapsed;
-  count++;
-}
-
-double BaseTimer::get_elapsed() const { return t_elapsed; }
-
-double BaseTimer::get_elapsed_since_start() const {
-  assert(t != timepoint_t());
-  duration_seconds interval = now() - t;
-  return interval.count();
-}
-
-size_t BaseTimer::get_count() const { return count; }
-
-const string &BaseTimer::get_name() const { return name; }
-
-void BaseTimer::done() const {
-  assert(t == timepoint_t());
-  SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", std::setprecision(2), std::fixed, t_elapsed, " s ", KNORM, "\n");
-  DBG(name, " took ", std::setprecision(2), std::fixed, t_elapsed, " s ", "\n");
-}
-
-future<MinSumMax<double>> BaseTimer::done_all_async(const upcxx::team &tm) const {
-  assert(t == timepoint_t());
-  auto msm_fut = upcxx_utils::min_sum_max_reduce_one(t_elapsed, 0, tm);
-  DBG(name, " took ", t_elapsed, " \n");
-  auto name_copy = name;
-  msm_fut = msm_fut.then([name_copy](MinSumMax<double> msm) {
-    SLOG_VERBOSE(KLCYAN, "Timing ", name_copy, ": ", msm, KNORM, "\n");
-    return msm;
-  });
-  Timings::set_pending(msm_fut.then([](MinSumMax<double>) {}));
-  return msm_fut;
-}
-void BaseTimer::done_all(const upcxx::team &tm) const { done_all_async(tm).wait(); }
-
-string BaseTimer::get_final() const {
-  ostringstream os;
-  os << name << ": " << std::setprecision(2) << std::fixed << t_elapsed << " s";
-  if (count > 1) os << " " << count << " count";
-  return os.str();
-}
-
-future<MinSumMax<double>> BaseTimer::reduce_timepoint(const upcxx::team &team, timepoint_t timepoint) {
-  duration_seconds secs = timepoint.time_since_epoch();
-  DBG_VERBOSE("reduce_timepoint ", secs.count(), " since epoch\n");
-  future<MinSumMax<double>> fut_msm = min_sum_max_reduce_one<double>(secs.count(), 0, team);
-  return fut_msm.then([&team](MinSumMax<double> msm) {
-    duration_seconds interval;
-    if (team.rank_me()) return msm;
-    // translate to seconds since the first rank entered
-    msm.my = msm.my - msm.min;
-    msm.max = msm.max - msm.min;
-    msm.sum = msm.sum - msm.min * team.rank_n();
-    msm.min = 0.0;
-    msm.apply_avg(team);
-    return msm;
-  });
-}
-
-future<ShTimings> BaseTimer::reduce_timings(const upcxx::team &team, size_t my_instances) const {
-  return reduce_timings(team, count, t_elapsed, my_instances);
-}
-
-future<ShTimings> BaseTimer::reduce_timings(const upcxx::team &team, size_t my_count, double my_elapsed, size_t my_instances) {
-  return Timings::reduce(team, my_count, my_elapsed, my_instances);
-}
-
-Timings BaseTimer::barrier_timings(const upcxx::team &team, size_t my_instances) const {
-  return barrier_timings(team, count, t_elapsed, my_instances);
-}
-
-Timings BaseTimer::barrier_timings(const upcxx::team &team, size_t my_count, double my_elapsed, size_t my_instances) {
-  return Timings::barrier(team, my_count, my_elapsed, my_instances);
-}
-
-timepoint_t BaseTimer::now() { return std::chrono::high_resolution_clock::now(); }
-
-string BaseTimer::now_str() {
-  std::time_t result = std::time(nullptr);
-  char buffer[100];
-  size_t sz = strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", std::localtime(&result));
-  return string(sz > 0 ? buffer : "BAD TIME");
-}
-
-//
-// StallTimer
-//
-
-StallTimer::StallTimer(const string _name, double _max_seconds, int64_t _max_count)
-    : BaseTimer(_name)
-    , max_seconds(_max_seconds)
-    , max_count(_max_count) {
-  start();
-}
-
-StallTimer::~StallTimer() { stop(); }
-
-void StallTimer::check() {
-  stop();
-  bool print = false;
-  if (max_seconds > 0.0 && t_elapsed > max_seconds) {
-    print = true;
-  } else if (max_count > 0 && count > max_count) {
-    print = true;
-  }
-  if (print) {
-    WARN("StallTimer - ", name, " on ", rank_me(), " stalled for ", t_elapsed, " s and ", count, " iterations\n");
-    max_seconds *= 2.0;
-    max_count *= 2;
-  }
-  start();
-}
-
-//
-// IntermittentTimer
-//
-
-IntermittentTimer::IntermittentTimer(const string &_name, string _interval_label)
-    : BaseTimer(_name)
-    , t_interval(0.0)
-    , interval_label(_interval_label) {}
-
-IntermittentTimer::~IntermittentTimer() {}
-
-void IntermittentTimer::clear() {
-  ((BaseTimer *)this)->clear();
-  t_interval = 0.0;
-  interval_label = "";
-}
-
-void IntermittentTimer::start_interval() { t_interval = get_elapsed_since_start(); }
-
-void IntermittentTimer::stop_interval() {
-  t_interval = get_elapsed_since_start() - t_interval;
-  if (!interval_label.empty()) {
-    ostringstream oss;
-    oss << KBLUE << std::left << std::setw(40) << interval_label << std::setprecision(2) << std::fixed << t_interval << " s"
-        << KNORM << "\n";
-    SLOG(oss.str());
-  }
-}
-
-void IntermittentTimer::print_out(const upcxx::team &tm) {
-  future<ShTimings> fut_shptr_timings = reduce_timings(tm);
-  auto fut =
-      when_all(Timings::get_pending(), fut_shptr_timings).then([&name = this->name, &count = this->count](ShTimings shptr_timings) {
-        if (shptr_timings->count_msm.max > 0.0)
-          SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", count, " intervals, ", shptr_timings->to_string(true), "\n", KNORM);
-      });
-  Timings::set_pending(fut);
-  count = 0;
-  t_elapsed = 0.0;
-}
-
-//
-// ProgressTimer
-//
-
-ProgressTimer::ProgressTimer(const string &_name)
-    : BaseTimer(_name)
-    , calls(0) {}
-
-ProgressTimer::~ProgressTimer() {}
-
-void ProgressTimer::clear() {
-  ((BaseTimer *)this)->clear();
-  calls = 0;
-}
-
-void ProgressTimer::progress(size_t run_every) {
-  if (run_every > 1 && ++calls % run_every != 0) return;
-  start();
-  upcxx::progress();
-  stop();
-  // DBG("ProgressTimer(", name, ") - ", t_elapsed, "\n");
-}
-
-void ProgressTimer::discharge(size_t run_every) {
-  if (run_every != 1 && ++calls % run_every != 0) return;
-  start();
-  upcxx::discharge();
-  upcxx::progress();
-  stop();
-  // DBG("ProgressTimer(", name, ").discharge() - ", t_elapsed, "\n");
-}
-
-void ProgressTimer::print_out(const upcxx::team &tm) {
-  future<ShTimings> fut_shptr_timings = reduce_timings(tm);
-  auto fut = when_all(Timings::get_pending(), fut_shptr_timings).then([&name = this->name](ShTimings shptr_timings) {
-    if (shptr_timings->count_msm.max > 0.0)
-      SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", shptr_timings->to_string(true), KNORM, "\n");
-  });
-  Timings::set_pending(fut);
-  count = 0;
-  t_elapsed = 0.0;
-}
-
-//
-// Timer
-//
-Timer::Timer(const upcxx::team &tm, const string &_name, bool exit_reduction)
-    : tm(tm)
-    , exited(exit_reduction)
-    , logged(false)
-    , BaseTimer(_name) {
-  init();
-}
-Timer::Timer(const string &_name, bool exit_reduction)
-    : tm(upcxx::world())
-    , exited(exit_reduction)
-    , logged(false)
-    , BaseTimer(_name) {
-  init();
-}
-void Timer::init() {
-  increment_instance();
-  auto fut = when_all(Timings::get_pending(), make_future(now_str())).then([name = this->name](string now) {});
-  Timings::set_pending(fut);
-  start();
-}
-Timer::Timer(Timer &&move)
-    : tm(move.tm)
-    , exited(move.exited)
-    , BaseTimer((BaseTimer &)move) {
-  move.exited = true;
-  move.logged = true;
-}
-Timer &Timer::operator=(Timer &&move) {
-  Timer mv(std::move(move));
-  std::swap(*this, mv);
-  return *this;
-}
-
-Timer::~Timer() {
-  if (!exited)
-    initiate_exit_reduction();
-  else if (!logged) {
-    stop();
-    LOG(KLCYAN, "Timing ", name, ":", get_elapsed(), KNORM, "\n");
-  }
-}
-
-future<> Timer::initiate_entrance_reduction() {
-  DBG_VERBOSE("Tracking entrance of ", name, "\n");
-  auto fut_msm = reduce_timepoint(tm, now());
-
-  auto fut = when_all(Timings::get_pending(), fut_msm).then([name = this->name](MinSumMax<double> msm) {
-    DBG_VERBOSE("got reduction: ", msm.to_string(), "\n");
-    SLOG_VERBOSE(KLCYAN, "Timing (entrance) ", name, ":", msm.to_string(), KNORM, "\n");
-  });
-  Timings::set_pending(fut);
-  return fut;
-}
-
-future<> Timer::initiate_exit_reduction() {
-  stop();
-  future<ShTimings> fut_shptr_timings = reduce_timings(tm);
-  auto fut = when_all(Timings::get_pending(), fut_shptr_timings).then([name = this->name](ShTimings shptr_timings) {
-    SLOG_VERBOSE(KLCYAN, "Timing ", name, " exit: ", shptr_timings->to_string(), KNORM, "\n");
-  });
-  Timings::set_pending(fut);
-  decrement_instance();
-  exited = true;
-  logged = true;
-  return fut;
-}
-
-//
-// BarrierTimer
-//
-
-BarrierTimer::BarrierTimer(const upcxx::team &team, const string _name, bool _entrance_barrier, bool _exit_barrier)
-    : _team(team)
-    , exit_barrier(_exit_barrier)
-    , exited(false)
-    , BaseTimer(_name) {
-  init(_entrance_barrier);
-}
-BarrierTimer::BarrierTimer(const string _name, bool _entrance_barrier, bool _exit_barrier)
-    : _team(upcxx::world())
-    , exit_barrier(_exit_barrier)
-    , exited(false)
-    , BaseTimer(_name) {
-  init(_entrance_barrier);
-}
-
-future<> BarrierTimer::init(bool _entrance_barrier) {
-  increment_instance();
-  if (!_entrance_barrier && !exit_barrier) SLOG_VERBOSE("Why are we using a BarrierTimer without any barriers???\n");
-  future<> fut;
-  DBG("Entering BarrierTimer ", name, "\n");
-  if (_entrance_barrier) {
-    fut = when_all(Timings::get_pending(), make_future(now_str())).then([&name = this->name](string now) {
-      // SLOG_VERBOSE(KLCYAN, "Timing ", name, ":  (entering barrier) ", KNORM);
-    });
-    Timings::set_pending(fut);
-    auto timings = barrier_timings(_team);
-    Timings::wait_pending();  // should be noop
-    SLOG_VERBOSE(KLCYAN, "Timing (entrance barrier) ", name, ": ", timings.to_string(), KNORM, "\n");
-  } else {
-    fut = when_all(Timings::get_pending(), make_future(now_str())).then([&name = this->name](string now) {});
-    Timings::set_pending(fut);
-  }
-  start();
-  return fut;
-}
-
-BarrierTimer::~BarrierTimer() {
-  if (!exited) initate_exit_barrier().wait();
-}
-future<> BarrierTimer::initate_exit_barrier() {
-  stop();
-  future<> fut;
-  DBG("Exiting BarrierTimer ", name, "\n");
-  if (exit_barrier) {
-    fut = when_all(Timings::get_pending(), make_future(now_str())).then([name = this->name](string now) {});
-    Timings::set_pending(fut);
-    auto timings = barrier_timings(_team);
-    Timings::wait_pending();
-    SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", timings.to_string(), KNORM, "\n");
-  } else {
-    future<ShTimings> fut_shptr_timings = reduce_timings(_team);
-    fut = when_all(Timings::get_pending(), fut_shptr_timings).then([name = this->name](ShTimings shptr_timings) {
-      SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", shptr_timings->to_string(), KNORM, "\n");
-    });
-    Timings::set_pending(fut);
-  }
-  decrement_instance();
-  exited = true;
-  return fut;
-}
-
-//
-// AsyncTimer
-//
-
-_AsyncTimer::_AsyncTimer(const upcxx::team &tm, const string &name)
-    : BaseTimer(name)
-    , tm(tm)
-    , construct_t(BaseTimer::now())
-    , start_t{} {}
-void _AsyncTimer::start() {
-  start_t = now();
-  ((BaseTimer *)this)->start();
-}
-void _AsyncTimer::stop() { ((BaseTimer *)this)->stop(); }
-void _AsyncTimer::report(const string label, MinSumMax<double> msm) {
-  SLOG_VERBOSE(KLCYAN, "Timing ", name, " ", label, ":", msm.to_string(), KNORM, "\n");
-}
-
-future<> _AsyncTimer::initiate_construct_reduction() {
-  auto fut_msm = BaseTimer::reduce_timepoint(tm, construct_t);
-  auto fut = when_all(Timings::get_pending(), fut_msm).then([this](MinSumMax<double> msm) { this->report("construct", msm); });
-  Timings::set_pending(fut);
-  return fut;
-}
-future<> _AsyncTimer::initiate_start_reduction() {
-  auto fut_msm = BaseTimer::reduce_timepoint(tm, start_t);
-  auto fut = when_all(Timings::get_pending(), fut_msm).then([this](MinSumMax<double> msm) { this->report("start", msm); });
-  Timings::set_pending(fut);
-  return fut;
-}
-future<> _AsyncTimer::initiate_stop_reduction() {
-  auto fut_msm = Timings::reduce(tm, 1, get_elapsed(), 1);
-  auto fut = when_all(Timings::get_pending(), fut_msm).then([this](ShTimings sh_timings) {
-    this->report("stop", sh_timings->before_elapsed);
-  });
-  Timings::set_pending(fut);
-  return fut;
-}
-
-AsyncTimer::AsyncTimer(const upcxx::team &tm, const string &name)
-    : timer(make_shared<_AsyncTimer>(tm, name)) {}
-AsyncTimer::AsyncTimer(const string &name)
-    : timer(make_shared<_AsyncTimer>(upcxx::world(), name)) {}
-void AsyncTimer::start() const { timer->start(); }
-void AsyncTimer::stop() const {
-  timer->stop();
-  LOG(timer->get_name(), " completed in ", timer->get_elapsed(), " s\n");
-}
-double AsyncTimer::get_elapsed() const { return timer->get_elapsed(); }
-future<> AsyncTimer::initiate_construct_reduction() {
-  return timer->initiate_construct_reduction().then([timer = this->timer]() {
-    // keep timer alive
-  });
-}
-future<> AsyncTimer::initiate_start_reduction() {
-  return timer->initiate_start_reduction().then([timer = this->timer]() {
-    // keep timer alive
-  });
-}
-future<> AsyncTimer::initiate_stop_reduction() {
-  return timer->initiate_stop_reduction().then([timer = this->timer]() {
-    // keep timer alive
-  });
-}
-
-//
-// ActiveCountTimer
-//
-
-ActiveCountTimer::ActiveCountTimer(const string _name)
-    : total_elapsed(0.0)
-    , total_count(0)
-    , active_count(0)
-    , max_active(0)
-    , name(_name)
-    , my_fut(make_future()) {}
-
-ActiveCountTimer::~ActiveCountTimer() {
-  if (upcxx::initialized()) my_fut.wait();  // keep alive until all futures have finished
-}
-
-void ActiveCountTimer::clear() {
-  total_elapsed = 0.0;
-  total_count = 0;
-  active_count = 0;
-  max_active = 0;
-}
-
-timepoint_t ActiveCountTimer::begin() {
-  active_count++;
-  if (max_active < active_count) max_active = active_count;
-  return BaseTimer::now();
-}
-
-void ActiveCountTimer::end(timepoint_t t) {
-  duration_seconds interval = BaseTimer::now() - t;
-  active_count--;
-  total_count++;
-  total_elapsed += interval.count();
-}
-
-void ActiveCountTimer::print_barrier_timings(const upcxx::team &team, string label) {
-  Timings timings = BaseTimer::barrier_timings(team, total_count, total_elapsed, max_active);
-  clear();
-  Timings::wait_pending();
-  print_timings(timings, label);
-}
-
-void ActiveCountTimer::print_reduce_timings(const upcxx::team &team, string label) {
-  label = name + label;
-  auto fut_timings = BaseTimer::reduce_timings(team, total_count, total_elapsed, max_active);
-  auto _this = this;
-  auto fut_clear = fut_timings.then([_this](ShTimings ignored) { _this->clear(); });
-  auto fut = when_all(Timings::get_pending(), fut_timings, fut_clear).then([_this, label](ShTimings shptr_timings) {
-    _this->print_timings(*shptr_timings, label);
-  });
-  my_fut = when_all(fut_clear, my_fut, fut);  // keep this in scope until clear has been called...
-  Timings::set_pending(my_fut);
-}
-
-void ActiveCountTimer::print_timings(Timings &timings, string label) {
-  label = name + label;
-  DBG_VERBOSE(__func__, " label=", label, "\n");
-  if (active_count > 0)
-    SWARN("print_timings on ActiveCountTimer '", label, "' called while ", active_count, " (max ", max_active,
-          ") are still active\n");
-  if (timings.count_msm.max > 0.0) {
-    SLOG_VERBOSE(KLCYAN, "Timing instances of ", label, ": ",
-                 (timings.count_msm.max > 0.0 ? timings.to_string(true) : string("(none)")), KNORM, "\n");
-  }
-}
-
-ActiveCountTimer _GenericActiveCountTimer("_upcxx_dummy");
-GenericInstantiationTimer _GenericInstantiationTimer(_GenericActiveCountTimer);
-template class ActiveInstantiationTimer<_upcxx_utils_dummy>;
-
-SingletonInstantiationTimer _SingletonInstantiationTimer();
-template class InstantiationTimer<_upcxx_utils_dummy>;
-
-};  // namespace upcxx_utils

From 1390efd959e31677cf5907158c0f0db69888f5da Mon Sep 17 00:00:00 2001
From: Jan Ciesko <jan.ciesko@gmail.com>
Date: Thu, 15 Feb 2024 11:23:14 -0700
Subject: [PATCH 05/13] Set -DCMAKE_CXX_COMPILER=mpicxx

---
 build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build.sh b/build.sh
index 41280b5..226602d 100755
--- a/build.sh
+++ b/build.sh
@@ -48,7 +48,7 @@ else
     if [ "$1" == "Debug" ] || [ "$1" == "Release" ] || [ "$1" == "RelWithDebInfo" ]; then
         rm -rf *
         rm -rf $INSTALL_PATH/cmake
-        cmake $rootdir -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=$1 -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH \
+        cmake $rootdir -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=$1 -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH -DCMAKE_CXX_COMPILER=mpicxx \
               -DMHM2_ENABLE_TESTING=0 $MHM2_CMAKE_EXTRAS $2
         #-DENABLE_CUDA=0
     fi

From 6e24d8b73477362a8913031e2b5cbffd1cf83f16 Mon Sep 17 00:00:00 2001
From: Jan Ciesko <jan.ciesko@gmail.com>
Date: Thu, 15 Feb 2024 11:24:20 -0700
Subject: [PATCH 06/13] Use std::chrono::duration constructor and not
 assignment operator; clean up after timer removal

---
 src/devices_gpu.cpp                             |  1 -
 src/kcount/kcount_gpu.cpp                       | 11 -----------
 upcxx-utils/include/upcxx_utils/thread_pool.hpp | 12 ++++++------
 3 files changed, 6 insertions(+), 18 deletions(-)

diff --git a/src/devices_gpu.cpp b/src/devices_gpu.cpp
index ba139ba..a072104 100644
--- a/src/devices_gpu.cpp
+++ b/src/devices_gpu.cpp
@@ -68,7 +68,6 @@ void init_devices() {
 
 void done_init_devices() {
   if (init_gpu_thread) {
-    Timer t("Waiting for GPU to be initialized (should be noop)");
     init_gpu_thread = false;
     detect_gpu_fut.wait();
     if (gpu_utils::gpus_present()) {
diff --git a/src/kcount/kcount_gpu.cpp b/src/kcount/kcount_gpu.cpp
index c1e5da0..4b31e30 100644
--- a/src/kcount/kcount_gpu.cpp
+++ b/src/kcount/kcount_gpu.cpp
@@ -276,16 +276,9 @@ void HashTableInserter<MAX_K>::flush_inserts() {
   SLOG_GPU("  final size per rank is ", insert_stats.new_inserts, " entries\n");
 }
 
-template <int MAX_K>
-void HashTableInserter<MAX_K>::get_elapsed_time(double &insert_time, double &kernel_time) {
-  state->ht_gpu_driver.get_elapsed_time(insert_time, kernel_time);
-}
-
 template <int MAX_K>
 void HashTableInserter<MAX_K>::insert_into_local_hashtable(dist_object<KmerMap<MAX_K>> &local_kmers) {
   barrier();
-  IntermittentTimer insert_timer("gpu insert to cpu timer");
-  insert_timer.start();
   if (state->ht_gpu_driver.pass_type == CTG_KMERS_PASS) {
     int attempted_inserts = 0, dropped_inserts = 0, new_inserts = 0;
     state->ht_gpu_driver.done_ctg_kmer_inserts(attempted_inserts, dropped_inserts, new_inserts);
@@ -350,11 +343,7 @@ void HashTableInserter<MAX_K>::insert_into_local_hashtable(dist_object<KmerMap<M
            kmer_counts.count);
     local_kmers->insert({kmer, kmer_counts});
   }
-  insert_timer.stop();
 
-  auto all_avg_elapsed_time = reduce_one(insert_timer.get_elapsed(), op_fast_add, 0).wait() / rank_n();
-  auto all_max_elapsed_time = reduce_one(insert_timer.get_elapsed(), op_fast_max, 0).wait();
-  SLOG_GPU("Inserting kmers from GPU to cpu hash table took ", all_avg_elapsed_time, " avg, ", all_max_elapsed_time, " max\n");
   auto all_kmers_size = reduce_one((uint64_t)local_kmers->size(), op_fast_add, 0).wait();
   if (local_kmers->size() != (num_entries - invalid))
     WARN("kmers->size() is ", local_kmers->size(), " != ", (num_entries - invalid), " num_entries");
diff --git a/upcxx-utils/include/upcxx_utils/thread_pool.hpp b/upcxx-utils/include/upcxx_utils/thread_pool.hpp
index c04ec48..c2e586a 100644
--- a/upcxx-utils/include/upcxx_utils/thread_pool.hpp
+++ b/upcxx-utils/include/upcxx_utils/thread_pool.hpp
@@ -144,7 +144,7 @@ class ThreadPool {
     sh_prom->require_anonymous(1);  // additional requirement to complete
 
     auto task_id = global_task_id()++;
-    auto start_t = 0;
+    auto start_t = 0.0;
     DBG("sh_prom=", sh_prom.get(), " task_id=", task_id, "\n");
 
     auto args_tuple = std::make_tuple(args...);  // *copy* arguments to avoid races in argument references being reused
@@ -155,7 +155,7 @@ class ThreadPool {
           DBG_VERBOSE("Finished sh_prom=", sh_prom.get(), "\n");
           // fulfill only in calling persona
           persona.lpc_ff([task_id, start_t, sh_prom]() {
-            duration_seconds s = 0;
+            duration_seconds s (0.0);
             DBG("Fulfilled sh_prom=", sh_prom.get(), " task_id=", task_id, " in ", s.count(), " s\n");
             sh_prom->fulfill_anonymous(1);
             global_tasks_completed()++;
@@ -184,20 +184,20 @@ class ThreadPool {
     std::shared_ptr<upcxx::promise<>> sh_prom = std::make_shared<upcxx::promise<>>();
 
     auto task_id = global_task_id()++;
-    auto start_t = 0;
+    auto start_t = 0.0;
     DBG("sh_prom=", sh_prom.get(), " task_id=", task_id, "of", global_task_id(), "\n");
 
     auto args_tuple = std::make_tuple(args...);  // *copy* arguments to avoid races in argument references being reused
     auto sh_task =
         std::make_shared<Task>([sh_prom, task_id, start_t, &persona, func{std::move(func)}, args_tuple{std::move(args_tuple)}]() {
-          auto compute_start_t = 0;
-          duration_seconds delay_s = compute_start_t - start_t;
+          auto compute_start_t = 0.0;
+          duration_seconds delay_s (compute_start_t - start_t);
           DBG_VERBOSE("Executing sh_prom=", sh_prom.get(), "\n");
           std::apply(func, args_tuple);
           DBG_VERBOSE("Finished sh_prom=", sh_prom.get(), "\n");
           // fulfill only in calling persona
           persona.lpc_ff([task_id, start_t, compute_start_t, delay_s, sh_prom]() {
-            duration_seconds s = 0 - compute_start_t;
+            duration_seconds s (0.0 - compute_start_t);
             DBG("Fulfilled sh_prom=", sh_prom.get(), " task_id=", task_id, "of", global_task_id(), " in ", delay_s.count(), " delay + ", s.count(), " s\n");
             sh_prom->fulfill_anonymous(1);
             global_tasks_completed()++;

From dd36c56830c977c09a65f4e30263edda3070615a Mon Sep 17 00:00:00 2001
From: Jan Ciesko <jan.ciesko@gmail.com>
Date: Mon, 19 Feb 2024 09:40:50 -0700
Subject: [PATCH 07/13] Add Kokkos to CMake; add Kokkos init and finalize calls

---
 CMakeLists.txt     |  8 ++++++++
 build.sh           |  6 +++---
 src/CMakeLists.txt |  8 +++++++-
 src/main.cpp       | 10 ++++++++++
 4 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6bad75f..74355c5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,6 +42,7 @@
 # to the root source directory of the project as ${MHM2_SOURCE_DIR} and to the
 # root binary directory of the project as ${MHM2_BINARY_DIR}
 cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
+cmake_policy(SET CMP0074 NEW)
 project(MHM2)
 
 message(STATUS "Building ${CMAKE_PROJECT_NAME} with CMake ${CMAKE_VERSION}")
@@ -151,6 +152,13 @@ if(MHM2_VECTORS)
   endforeach()
 endif()
 
+option(ENABLE_KOKKOS "Whether to use Kokkos" OFF)
+if(ENABLE_KOKKOS)
+  message(STATUS "Building with Kokkos")
+  add_definitions(-DENABLE_KOKKOS)
+  find_package(Kokkos REQUIRED)
+endif()
+
 option(ENABLE_GASNET_STATS "Turn on gasnet stats recording" OFF)
 message("Building ${CMAKE_BUILD_TYPE} version")
 if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
diff --git a/build.sh b/build.sh
index 226602d..19599ed 100755
--- a/build.sh
+++ b/build.sh
@@ -46,11 +46,11 @@ else
     mkdir -p $rootdir/.build
     cd $rootdir/.build
     if [ "$1" == "Debug" ] || [ "$1" == "Release" ] || [ "$1" == "RelWithDebInfo" ]; then
-        rm -rf *
-        rm -rf $INSTALL_PATH/cmake
+#        rm -rf *
+#        rm -rf $INSTALL_PATH/cmake
         cmake $rootdir -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=$1 -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH -DCMAKE_CXX_COMPILER=mpicxx \
               -DMHM2_ENABLE_TESTING=0 $MHM2_CMAKE_EXTRAS $2
-        #-DENABLE_CUDA=0
+             #-DENABLE_CUDA=0
     fi
     make -j ${MHM2_BUILD_THREADS} all install
     #make VERBOSE=1 -j ${MHM2_BUILD_THREADS} all install
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4643bb8..5e979b6 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -39,6 +39,7 @@
 # such enhancements or derivative works thereof, in binary and source code form.
 
 cmake_minimum_required(VERSION 3.10 FATAL_ERROR)
+cmake_policy(SET CMP0074 NEW)
 
 if(${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.13 AND DEFINED UPCXX_LIBRARIES)
   message(
@@ -236,11 +237,16 @@ set(MHM2_TARGET_OBJECTS
 
 add_executable(mhm2 main.cpp ${MHM2_TARGET_OBJECTS})
 
+if(ENABLE_KOKKOS)
+  set (KokkosLib Kokkos::kokkos)
+endif()
+
 set(MHM2_LINK_LIBRARIES
     Threads::Threads
     KCOUNT_LIBRARY
     ${ZLIB_LIBRARIES}
-    MHM2_VERSION_LIB)
+    MHM2_VERSION_LIB 
+    ${KokkosLib})
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
diff --git a/src/main.cpp b/src/main.cpp
index b21cccb..b0bdc66 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -50,6 +50,10 @@
 
 #include "kmer.hpp"
 
+#ifdef ENABLE_KOKKOS
+#include <Kokkos_Core.hpp>
+#endif
+
 using std::fixed;
 using std::setprecision;
 
@@ -63,6 +67,9 @@ void merge_reads(vector<string> reads_fname_list, int qual_offset,
 
 int main(int argc, char **argv) {
   
+  #ifdef ENABLE_KOKKOS
+  Kokkos::initialize(argc, argv);
+  #endif
   upcxx::init();
   
   barrier();
@@ -237,5 +244,8 @@ int main(int argc, char **argv) {
     ;
 #endif
   upcxx::finalize();
+  #ifdef ENABLE_KOKKOS
+  Kokkos::finalize();
+  #endif
   return 0;
 }

From 85947d7ac8c5660ce02849ea97c3f27f2558dd03 Mon Sep 17 00:00:00 2001
From: Amy Powell <ajpowel@sandia.gov>
Date: Wed, 20 Mar 2024 10:05:49 -0700
Subject: [PATCH 08/13] Hipification:  round one

---
 src/fastq.cpp                                 |    6 +-
 src/kcount/kcount-gpu/gpu_hash_funcs.cpp      |  141 -
 src/kcount/kcount-gpu/gpu_hash_table.cpp      |  802 -----
 src/kcount/kcount-gpu/gpu_hash_table.hpp      |  182 -
 src/kcount/kcount-gpu/gqf.cpp                 | 2920 -----------------
 src/kcount/kcount-gpu/gqf.hpp                 |  384 ---
 src/kcount/kcount-gpu/gqf_int.hpp             |  138 -
 src/kcount/kcount-gpu/hashutil.cpp            |  179 -
 src/kcount/kcount-gpu/hashutil.hpp            |   25 -
 src/kcount/kcount-gpu/parse_and_pack.cpp      |  325 --
 src/kcount/kcount-gpu/parse_and_pack.hpp      |   89 -
 src/kcount/kcount-gpu/partitioned_counter.cpp |   68 -
 src/kcount/kcount-gpu/partitioned_counter.hpp |   56 -
 src/kcount/kcount_gpu.cpp                     |    2 +-
 14 files changed, 4 insertions(+), 5313 deletions(-)

diff --git a/src/fastq.cpp b/src/fastq.cpp
index 231b21a..3697abc 100644
--- a/src/fastq.cpp
+++ b/src/fastq.cpp
@@ -484,7 +484,7 @@ void FastqReader::seek() {
 }
 
 FastqReader::~FastqReader() {
-  if (!open_fut.is_ready()) {
+  if (!open_fut.ready()) {
     WARN("Destructor called before opening completed\n");
     open_fut.wait();
   }
@@ -502,7 +502,7 @@ string FastqReader::get_fname() { return fname; }
 size_t FastqReader::my_file_size() { return end_read - start_read + (fqr2 ? fqr2->my_file_size() : 0); }
 
 size_t FastqReader::get_next_fq_record(string &id, string &seq, string &quals, bool wait_open) {
-  if (wait_open && !open_fut.is_ready()) {
+  if (wait_open && !open_fut.ready()) {
     WARN("Attempt to read ", fname, " before it is ready. wait on open_fut first to avoid this warning!\n");
     open_fut.wait();
   }
@@ -554,7 +554,7 @@ int FastqReader::get_max_read_len() { return std::max(max_read_len, fqr2 ? fqr2-
 
 
 void FastqReader::reset() {
-  if (!open_fut.is_ready()) {
+  if (!open_fut.ready()) {
     open_fut.wait();
   }
   if (!f) {
diff --git a/src/kcount/kcount-gpu/gpu_hash_funcs.cpp b/src/kcount/kcount-gpu/gpu_hash_funcs.cpp
index 10bcc0f..e69de29 100644
--- a/src/kcount/kcount-gpu/gpu_hash_funcs.cpp
+++ b/src/kcount/kcount-gpu/gpu_hash_funcs.cpp
@@ -1,141 +0,0 @@
-/*
- HipMer v 2.0, Copyright (c) 2020, The Regents of the University of California,
- through Lawrence Berkeley National Laboratory (subject to receipt of any required
- approvals from the U.S. Dept. of Energy).  All rights reserved."
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- (1) Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- (2) Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation and/or
- other materials provided with the distribution.
-
- (3) Neither the name of the University of California, Lawrence Berkeley National
- Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to
- endorse or promote products derived from this software without specific prior
- written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
- SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
- DAMAGE.
-
- You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades
- to the features, functionality or performance of the source code ("Enhancements") to
- anyone; however, if you choose to make your Enhancements available either publicly,
- or directly to Lawrence Berkeley National Laboratory, without imposing a separate
- written license agreement for such Enhancements, then you hereby grant the following
- license: a  non-exclusive, royalty-free perpetual license to install, use, modify,
- prepare derivative works, incorporate into other computer software, distribute, and
- sublicense such enhancements or derivative works thereof, in binary and source code
- form.
-*/
-
-#define BIG_CONSTANT(x) (x##LLU)
-#define ROTL64(x, r) ((x << r) | (x >> (64 - r)))
-
-// Finalization mix - force all bits of a hash block to avalanche
-__device__ uint64_t fmix64(uint64_t k) {
-  k ^= k >> 33;
-  k *= BIG_CONSTANT(0xff51afd7ed558ccd);
-  k ^= k >> 33;
-  k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
-  k ^= k >> 33;
-
-  return k;
-}
-
-__device__ uint64_t gpu_murmurhash3_64(const void *key, const uint32_t len) {
-  const uint8_t *data = (const uint8_t *)key;
-  const uint32_t nblocks = len / 16;
-  const uint32_t seed = 313;
-  int32_t i;
-
-  uint64_t h1 = seed;
-  uint64_t h2 = seed;
-
-  uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
-  uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
-
-  const uint64_t *blocks = (const uint64_t *)(data);
-
-  for (i = 0; i < nblocks; i++) {
-    uint64_t k1 = blocks[i * 2 + 0];
-    uint64_t k2 = blocks[i * 2 + 1];
-
-    k1 *= c1;
-    k1 = ROTL64(k1, 31);
-    k1 *= c2;
-    h1 ^= k1;
-
-    h1 = ROTL64(h1, 27);
-    h1 += h2;
-    h1 = h1 * 5 + 0x52dce729;
-
-    k2 *= c2;
-    k2 = ROTL64(k2, 33);
-    k2 *= c1;
-    h2 ^= k2;
-
-    h2 = ROTL64(h2, 31);
-    h2 += h1;
-    h2 = h2 * 5 + 0x38495ab5;
-  }
-
-  const uint8_t *tail = (const uint8_t *)(data + nblocks * 16);
-
-  uint64_t k1 = 0;
-  uint64_t k2 = 0;
-
-  switch (len & 15) {
-    case 15: k2 ^= (uint64_t)(tail[14]) << 48;
-    case 14: k2 ^= (uint64_t)(tail[13]) << 40;
-    case 13: k2 ^= (uint64_t)(tail[12]) << 32;
-    case 12: k2 ^= (uint64_t)(tail[11]) << 24;
-    case 11: k2 ^= (uint64_t)(tail[10]) << 16;
-    case 10: k2 ^= (uint64_t)(tail[9]) << 8;
-    case 9:
-      k2 ^= (uint64_t)(tail[8]) << 0;
-      k2 *= c2;
-      k2 = ROTL64(k2, 33);
-      k2 *= c1;
-      h2 ^= k2;
-
-    case 8: k1 ^= (uint64_t)(tail[7]) << 56;
-    case 7: k1 ^= (uint64_t)(tail[6]) << 48;
-    case 6: k1 ^= (uint64_t)(tail[5]) << 40;
-    case 5: k1 ^= (uint64_t)(tail[4]) << 32;
-    case 4: k1 ^= (uint64_t)(tail[3]) << 24;
-    case 3: k1 ^= (uint64_t)(tail[2]) << 16;
-    case 2: k1 ^= (uint64_t)(tail[1]) << 8;
-    case 1:
-      k1 ^= (uint64_t)(tail[0]) << 0;
-      k1 *= c1;
-      k1 = ROTL64(k1, 31);
-      k1 *= c2;
-      h1 ^= k1;
-  };
-
-  h1 ^= len;
-  h2 ^= len;
-
-  h1 += h2;
-  h2 += h1;
-
-  h1 = fmix64(h1);
-  h2 = fmix64(h2);
-
-  h1 += h2;
-  h2 += h1;
-
-  return h1;
-}
diff --git a/src/kcount/kcount-gpu/gpu_hash_table.cpp b/src/kcount/kcount-gpu/gpu_hash_table.cpp
index 7391060..e69de29 100644
--- a/src/kcount/kcount-gpu/gpu_hash_table.cpp
+++ b/src/kcount/kcount-gpu/gpu_hash_table.cpp
@@ -1,802 +0,0 @@
-/*
- HipMer v 2.0, Copyright (c) 2020, The Regents of the University of California,
- through Lawrence Berkeley National Laboratory (subject to receipt of any required
- approvals from the U.S. Dept. of Energy).  All rights reserved."
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- (1) Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- (2) Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation and/or
- other materials provided with the distribution.
-
- (3) Neither the name of the University of California, Lawrence Berkeley National
- Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to
- endorse or promote products derived from this software without specific prior
- written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
- SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
- DAMAGE.
-
- You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades
- to the features, functionality or performance of the source code ("Enhancements") to
- anyone; however, if you choose to make your Enhancements available either publicly,
- or directly to Lawrence Berkeley National Laboratory, without imposing a separate
- written license agreement for such Enhancements, then you hereby grant the following
- license: a  non-exclusive, royalty-free perpetual license to install, use, modify,
- prepare derivative works, incorporate into other computer software, distribute, and
- sublicense such enhancements or derivative works thereof, in binary and source code
- form.
-*/
-
-#include <iostream>
-#include <sstream>
-#include <fstream>
-#include <chrono>
-#include <tuple>
-#include <iomanip>
-#include <assert.h>
-#include <cuda_runtime_api.h>
-#include <cuda.h>
-
-#include "upcxx_utils/colors.h"
-#include "gpu-utils/gpu_common.hpp"
-#include "gpu-utils/gpu_utils.hpp"
-#include "gpu_hash_table.hpp"
-#include "prime.hpp"
-#include "gqf.hpp"
-
-#include "gpu_hash_funcs.cpp"
-
-using namespace std;
-using namespace gpu_common;
-using namespace kcount_gpu;
-
-const uint64_t KEY_EMPTY = 0xffffffffffffffff;
-const uint64_t KEY_TRANSITION = 0xfffffffffffffffe;
-const uint8_t KEY_EMPTY_BYTE = 0xff;
-
-template <int MAX_K>
-__device__ void kmer_set(KmerArray<MAX_K> &kmer1, const KmerArray<MAX_K> &kmer2) {
-  int N_LONGS = kmer1.N_LONGS;
-  uint64_t old_key;
-  for (int i = 0; i < N_LONGS - 1; i++) {
-    old_key = atomicExch((unsigned long long *)&(kmer1.longs[i]), kmer2.longs[i]);
-    if (old_key != KEY_EMPTY) printf("ERROR: old key should be KEY_EMPTY\n");
-  }
-  old_key = atomicExch((unsigned long long *)&(kmer1.longs[N_LONGS - 1]), kmer2.longs[N_LONGS - 1]);
-  if (old_key != KEY_TRANSITION) printf("ERROR: old key should be KEY_TRANSITION\n");
-}
-
-template <int MAX_K>
-__device__ bool kmers_equal(const KmerArray<MAX_K> &kmer1, const KmerArray<MAX_K> &kmer2) {
-  int n_longs = kmer1.N_LONGS;
-  for (int i = 0; i < n_longs; i++) {
-    uint64_t old_key = atomicAdd((unsigned long long *)&(kmer1.longs[i]), 0ULL);
-    if (old_key != kmer2.longs[i]) return false;
-  }
-  return true;
-}
-
-template <int MAX_K>
-__device__ size_t kmer_hash(const KmerArray<MAX_K> &kmer) {
-  return gpu_murmurhash3_64(reinterpret_cast<const void *>(kmer.longs), kmer.N_LONGS * sizeof(uint64_t));
-}
-
-__device__ int8_t get_ext(CountsArray &counts, int pos, int8_t *ext_map) {
-  count_t top_count = 0, runner_up_count = 0;
-  int top_ext_pos = 0;
-  count_t kmer_count = counts.kmer_count;
-  for (int i = pos; i < pos + 4; i++) {
-    if (counts.ext_counts[i] >= top_count) {
-      runner_up_count = top_count;
-      top_count = counts.ext_counts[i];
-      top_ext_pos = i;
-    } else if (counts.ext_counts[i] > runner_up_count) {
-      runner_up_count = counts.ext_counts[i];
-    }
-  }
-  int dmin_dyn = (1.0 - DYN_MIN_DEPTH) * kmer_count;
-  if (dmin_dyn < 2.0) dmin_dyn = 2.0;
-  if (top_count < dmin_dyn) return 'X';
-  if (runner_up_count >= dmin_dyn) return 'F';
-  return ext_map[top_ext_pos - pos];
-}
-
-__device__ bool ext_conflict(ext_count_t *ext_counts, int start_idx) {
-  int idx = -1;
-  for (int i = start_idx; i < start_idx + 4; i++) {
-    if (ext_counts[i]) {
-      // conflict
-      if (idx != -1) return true;
-      idx = i;
-    }
-  }
-  return false;
-}
-
-template <int MAX_K>
-__global__ void gpu_merge_ctg_kmers(KmerCountsMap<MAX_K> read_kmers, const KmerCountsMap<MAX_K> ctg_kmers,
-                                    unsigned int *insert_counts) {
-  unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;
-  int8_t ext_map[4] = {'A', 'C', 'G', 'T'};
-  int N_LONGS = KmerArray<MAX_K>::N_LONGS;
-  int attempted_inserts = 0;
-  int dropped_inserts = 0;
-  int new_inserts = 0;
-  if (threadid < ctg_kmers.capacity) {
-    count_t kmer_count = ctg_kmers.vals[threadid].kmer_count;
-    ext_count_t *ext_counts = ctg_kmers.vals[threadid].ext_counts;
-    if (kmer_count && !ext_conflict(ext_counts, 0) && !ext_conflict(ext_counts, 4)) {
-      KmerArray<MAX_K> kmer = ctg_kmers.keys[threadid];
-      uint64_t slot = kmer_hash(kmer) % read_kmers.capacity;
-      auto start_slot = slot;
-      attempted_inserts++;
-      const int MAX_PROBE = (read_kmers.capacity < KCOUNT_HT_MAX_PROBE ? read_kmers.capacity : KCOUNT_HT_MAX_PROBE);
-      for (int j = 0; j < MAX_PROBE; j++) {
-        uint64_t old_key = atomicCAS((unsigned long long *)&(read_kmers.keys[slot].longs[N_LONGS - 1]), KEY_EMPTY, KEY_TRANSITION);
-        if (old_key == KEY_EMPTY) {
-          new_inserts++;
-          memcpy(&read_kmers.vals[slot], &ctg_kmers.vals[threadid], sizeof(CountsArray));
-          kmer_set(read_kmers.keys[slot], kmer);
-          break;
-        } else if (old_key == kmer.longs[N_LONGS - 1]) {
-          if (kmers_equal(read_kmers.keys[slot], kmer)) {
-            // existing kmer from reads - only replace if the kmer is non-UU
-            // there is no need for atomics here because all ctg kmers are unique; hence only one thread will ever match this kmer
-            int8_t left_ext = get_ext(read_kmers.vals[slot], 0, ext_map);
-            int8_t right_ext = get_ext(read_kmers.vals[slot], 4, ext_map);
-            if (left_ext == 'X' || left_ext == 'F' || right_ext == 'X' || right_ext == 'F')
-              memcpy(&read_kmers.vals[slot], &ctg_kmers.vals[threadid], sizeof(CountsArray));
-            break;
-          }
-        }
-        // quadratic probing - worse cache but reduced clustering
-        slot = (start_slot + (j + 1) * (j + 1)) % read_kmers.capacity;
-        if (j == MAX_PROBE - 1) dropped_inserts++;
-      }
-    }
-  }
-  reduce(attempted_inserts, ctg_kmers.capacity, &(insert_counts[0]));
-  reduce(dropped_inserts, ctg_kmers.capacity, &(insert_counts[1]));
-  reduce(new_inserts, ctg_kmers.capacity, &(insert_counts[2]));
-}
-
-template <int MAX_K>
-__global__ void gpu_compact_ht(KmerCountsMap<MAX_K> elems, KmerExtsMap<MAX_K> compact_elems, unsigned int *elem_counts) {
-  unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;
-  const int N_LONGS = KmerArray<MAX_K>::N_LONGS;
-  int dropped_inserts = 0;
-  int unique_inserts = 0;
-  int8_t ext_map[4] = {'A', 'C', 'G', 'T'};
-  if (threadid < elems.capacity) {
-    if (elems.vals[threadid].kmer_count) {
-      KmerArray<MAX_K> kmer = elems.keys[threadid];
-      uint64_t slot = kmer_hash(kmer) % compact_elems.capacity;
-      auto start_slot = slot;
-      // we set a constraint on the max probe to track whether we are getting excessive collisions and need a bigger default
-      // compact table
-      const int MAX_PROBE = (compact_elems.capacity < KCOUNT_HT_MAX_PROBE ? compact_elems.capacity : KCOUNT_HT_MAX_PROBE);
-      // look for empty slot in compact hash table
-      for (int j = 0; j < MAX_PROBE; j++) {
-        uint64_t old_key =
-            atomicCAS((unsigned long long *)&(compact_elems.keys[slot].longs[N_LONGS - 1]), KEY_EMPTY, kmer.longs[N_LONGS - 1]);
-        if (old_key == KEY_EMPTY) {
-          // found empty slot - there will be no duplicate keys since we're copying across from another hash table
-          unique_inserts++;
-          memcpy((void *)compact_elems.keys[slot].longs, kmer.longs, sizeof(uint64_t) * (N_LONGS - 1));
-          // compute exts
-          int8_t left_ext = get_ext(elems.vals[threadid], 0, ext_map);
-          int8_t right_ext = get_ext(elems.vals[threadid], 4, ext_map);
-          if (elems.vals[threadid].kmer_count < 2)
-            printf("WARNING: elem should have been purged, count %d\n", elems.vals[threadid].kmer_count);
-          compact_elems.vals[slot].count = elems.vals[threadid].kmer_count;
-          compact_elems.vals[slot].left = left_ext;
-          compact_elems.vals[slot].right = right_ext;
-          break;
-        }
-        // quadratic probing - worse cache but reduced clustering
-        slot = (start_slot + (j + 1) * (j + 1)) % compact_elems.capacity;
-        if (j == MAX_PROBE - 1) dropped_inserts++;
-      }
-    }
-  }
-  reduce(dropped_inserts, compact_elems.capacity, &(elem_counts[0]));
-  reduce(unique_inserts, compact_elems.capacity, &(elem_counts[1]));
-}
-
-template <int MAX_K>
-__global__ void gpu_purge_invalid(KmerCountsMap<MAX_K> elems, unsigned int *elem_counts) {
-  unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;
-  int N_LONGS = KmerArray<MAX_K>::N_LONGS;
-  int num_purged = 0;
-  int num_elems = 0;
-  if (threadid < elems.capacity) {
-    if (elems.vals[threadid].kmer_count) {
-      int ext_sum = 0;
-      for (int j = 0; j < 8; j++) ext_sum += elems.vals[threadid].ext_counts[j];
-      if (elems.vals[threadid].kmer_count < 2 || !ext_sum) {
-        memset(&elems.vals[threadid], 0, sizeof(CountsArray));
-        memset((void *)elems.keys[threadid].longs, KEY_EMPTY_BYTE, N_LONGS * sizeof(uint64_t));
-        num_purged++;
-      } else {
-        num_elems++;
-      }
-    }
-  }
-  reduce(num_purged, elems.capacity, &(elem_counts[0]));
-  reduce(num_elems, elems.capacity, &(elem_counts[1]));
-}
-
-static __constant__ char to_base[] = {'0', 'a', 'c', 'g', 't', 'A', 'C', 'G', 'T', 'N'};
-
-inline __device__ char to_base_func(int index, int pp) {
-  if (index > 9) {
-    printf("ERROR: index out of range for to_base: %d, packed seq pos %d\n", index, pp);
-    return 0;
-  }
-  if (index == 0) return '_';
-  return to_base[index];
-}
-
-__global__ void gpu_unpack_supermer_block(SupermerBuff unpacked_supermer_buff, SupermerBuff packed_supermer_buff, int buff_len) {
-  unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (threadid >= buff_len) return;
-  uint8_t packed = packed_supermer_buff.seqs[threadid];
-  if (packed == '_') return;
-  uint8_t left_side = (packed & 240) >> 4;
-  unpacked_supermer_buff.seqs[threadid * 2] = to_base_func(left_side, packed);
-  if (packed_supermer_buff.counts) unpacked_supermer_buff.counts[threadid * 2] = packed_supermer_buff.counts[threadid];
-  uint8_t right_side = packed & 15;
-  unpacked_supermer_buff.seqs[threadid * 2 + 1] = to_base_func(right_side, packed);
-  if (packed_supermer_buff.counts) unpacked_supermer_buff.counts[threadid * 2 + 1] = packed_supermer_buff.counts[threadid];
-}
-
-inline __device__ bool is_valid_base(char base) {
-  return (base == 'A' || base == 'C' || base == 'G' || base == 'T' || base == '0' || base == 'N');
-}
-
-inline __device__ bool bad_qual(char base) { return (base == 'a' || base == 'c' || base == 'g' || base == 't'); }
-
-inline __device__ void inc_ext(char ext, ext_count_t kmer_count, ext_count_t *ext_counts) {
-  switch (ext) {
-    case 'A': atomicAddUint16_thres(&(ext_counts[0]), kmer_count, KCOUNT_MAX_KMER_COUNT); return;
-    case 'C': atomicAddUint16_thres(&(ext_counts[1]), kmer_count, KCOUNT_MAX_KMER_COUNT); return;
-    case 'G': atomicAddUint16_thres(&(ext_counts[2]), kmer_count, KCOUNT_MAX_KMER_COUNT); return;
-    case 'T': atomicAddUint16_thres(&(ext_counts[3]), kmer_count, KCOUNT_MAX_KMER_COUNT); return;
-  }
-}
-
-template <int MAX_K>
-__device__ bool get_kmer_from_supermer(SupermerBuff supermer_buff, uint32_t buff_len, int kmer_len, uint64_t *kmer, char &left_ext,
-                                       char &right_ext, count_t &count) {
-  unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;
-  int num_kmers = buff_len - kmer_len + 1;
-  if (threadid >= num_kmers) return false;
-  const int N_LONGS = KmerArray<MAX_K>::N_LONGS;
-  if (!pack_seq_to_kmer(&(supermer_buff.seqs[threadid]), kmer_len, N_LONGS, kmer)) return false;
-  if (threadid + kmer_len >= buff_len) return false;  // printf("out of bounds %d >= %d\n", threadid + kmer_len, buff_len);
-  left_ext = supermer_buff.seqs[threadid - 1];
-  right_ext = supermer_buff.seqs[threadid + kmer_len];
-  if (left_ext == '_' || right_ext == '_') return false;
-  if (!left_ext || !right_ext) return false;
-  if (supermer_buff.counts) {
-    count = supermer_buff.counts[threadid];
-  } else {
-    count = 1;
-    if (bad_qual(left_ext)) left_ext = '0';
-    if (bad_qual(right_ext)) right_ext = '0';
-  }
-  if (!is_valid_base(left_ext)) {
-    printf("ERROR: threadid %d, invalid char for left nucleotide %d\n", threadid, (uint8_t)left_ext);
-    return false;
-  }
-  if (!is_valid_base(right_ext)) {
-    printf("ERROR: threadid %d, invalid char for right nucleotide %d\n", threadid, (uint8_t)right_ext);
-    return false;
-  }
-  uint64_t kmer_rc[N_LONGS];
-  revcomp(kmer, kmer_rc, kmer_len, N_LONGS);
-  for (int l = 0; l < N_LONGS; l++) {
-    if (kmer_rc[l] == kmer[l]) continue;
-    if (kmer_rc[l] < kmer[l]) {
-      // swap
-      char tmp = left_ext;
-      left_ext = comp_nucleotide(right_ext);
-      right_ext = comp_nucleotide(tmp);
-
-      // FIXME: we should be able to have a 0 extension even for revcomp - we do for non-revcomp
-      // if (!left_ext || !right_ext) return false;
-
-      memcpy(kmer, kmer_rc, N_LONGS * sizeof(uint64_t));
-    }
-    break;
-  }
-  return true;
-}
-
-template <int MAX_K>
-__device__ bool gpu_insert_kmer(KmerCountsMap<MAX_K> elems, uint64_t hash_val, KmerArray<MAX_K> &kmer, char left_ext,
-                                char right_ext, char prev_left_ext, char prev_right_ext, count_t kmer_count, int &new_inserts,
-                                int &dropped_inserts, bool ctg_kmers, bool use_qf, bool update_only) {
-  const int N_LONGS = KmerArray<MAX_K>::N_LONGS;
-  uint64_t slot = hash_val % elems.capacity;
-  auto start_slot = slot;
-  const int MAX_PROBE = (elems.capacity < 200 ? elems.capacity : 200);
-  bool found_slot = false;
-  bool kmer_found_in_ht = false;
-  uint64_t old_key = KEY_TRANSITION;
-  for (int j = 0; j < MAX_PROBE; j++) {
-    // we have to be careful here not to end up with multiple threads on the same warp accessing the same slot, because
-    // that will cause a deadlock. So we loop over all statements in each CAS spin to ensure that all threads get a
-    // chance to execute
-    do {
-      old_key = atomicCAS((unsigned long long *)&(elems.keys[slot].longs[N_LONGS - 1]), KEY_EMPTY, KEY_TRANSITION);
-      if (old_key != KEY_TRANSITION) {
-        if (old_key == KEY_EMPTY) {
-          if (update_only) {
-            old_key = atomicExch((unsigned long long *)&(elems.keys[slot].longs[N_LONGS - 1]), KEY_EMPTY);
-            if (old_key != KEY_TRANSITION) printf("ERROR: old key should be KEY_TRANSITION\n");
-            return false;
-          }
-          kmer_set(elems.keys[slot], kmer);
-          found_slot = true;
-        } else if (old_key == kmer.longs[N_LONGS - 1]) {
-          if (kmers_equal(elems.keys[slot], kmer)) {
-            found_slot = true;
-            kmer_found_in_ht = true;
-          }
-        }
-      }
-    } while (old_key == KEY_TRANSITION);
-    if (found_slot) break;
-    // quadratic probing - worse cache but reduced clustering
-    slot = (start_slot + j * j) % elems.capacity;
-    // this entry didn't get inserted because we ran out of probing time (and probably space)
-    if (j == MAX_PROBE - 1) dropped_inserts++;
-  }
-  if (found_slot) {
-    ext_count_t *ext_counts = elems.vals[slot].ext_counts;
-    if (ctg_kmers) {
-      // the count is the min of all counts. Use CAS to deal with the initial zero value
-      int prev_count = atomicCAS(&elems.vals[slot].kmer_count, 0, kmer_count);
-      if (prev_count)
-        atomicMin(&elems.vals[slot].kmer_count, kmer_count);
-      else
-        new_inserts++;
-    } else {
-      assert(kmer_count == 1);
-      int prev_count = atomicAdd(&elems.vals[slot].kmer_count, kmer_count);
-      if (!prev_count) new_inserts++;
-    }
-    ext_count_t kmer_count_uint16 = min(kmer_count, UINT16_MAX);
-    inc_ext(left_ext, kmer_count_uint16, ext_counts);
-    inc_ext(right_ext, kmer_count_uint16, ext_counts + 4);
-    if (use_qf && !update_only && !kmer_found_in_ht && !ctg_kmers) {
-      // kmer was not in hash table, so it must have been found in the qf
-      // add the extensions from the previous entry stored in the qf
-      inc_ext(prev_left_ext, 1, ext_counts);
-      inc_ext(prev_right_ext, 1, ext_counts + 4);
-      // inc the overall kmer count
-      atomicAdd(&elems.vals[slot].kmer_count, 1);
-    }
-  }
-  return true;
-}
-
-template <int MAX_K>
-__global__ void gpu_insert_supermer_block(KmerCountsMap<MAX_K> elems, SupermerBuff supermer_buff, uint32_t buff_len, int kmer_len,
-                                          bool ctg_kmers, InsertStats *insert_stats, quotient_filter::QF *qf) {
-  unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;
-  const int N_LONGS = KmerArray<MAX_K>::N_LONGS;
-  int attempted_inserts = 0, dropped_inserts = 0, new_inserts = 0, num_unique_qf = 0;
-  if (threadid > 0 && threadid < buff_len) {
-    attempted_inserts++;
-    KmerArray<MAX_K> kmer;
-    char left_ext, right_ext;
-    count_t kmer_count;
-    if (get_kmer_from_supermer<MAX_K>(supermer_buff, buff_len, kmer_len, kmer.longs, left_ext, right_ext, kmer_count)) {
-      if (kmer.longs[N_LONGS - 1] == KEY_EMPTY) printf("ERROR: block equal to KEY_EMPTY\n");
-      if (kmer.longs[N_LONGS - 1] == KEY_TRANSITION) printf("ERROR: block equal to KEY_TRANSITION\n");
-      auto hash_val = kmer_hash(kmer);
-      char prev_left_ext = '0', prev_right_ext = '0';
-      bool use_qf = (qf != nullptr);
-      bool update_only = (use_qf && !ctg_kmers);
-      bool updated = gpu_insert_kmer(elems, hash_val, kmer, left_ext, right_ext, prev_left_ext, prev_right_ext, kmer_count,
-                                     new_inserts, dropped_inserts, ctg_kmers, use_qf, update_only);
-      if (update_only && !updated) {
-        // not found in the hash table - look in the qf
-        quotient_filter::qf_returns qf_insert_result = quotient_filter::QF_ITEM_FOUND;
-        qf_insert_result = quotient_filter::insert_kmer(qf, hash_val, left_ext, right_ext, prev_left_ext, prev_right_ext);
-        if (qf_insert_result == quotient_filter::QF_ITEM_INSERTED) {
-          num_unique_qf++;
-          assert(prev_left_ext == '0' && prev_right_ext == '0');
-        } else if (qf_insert_result == quotient_filter::QF_ITEM_FOUND) {
-          gpu_insert_kmer(elems, hash_val, kmer, left_ext, right_ext, prev_left_ext, prev_right_ext, kmer_count, new_inserts,
-                          dropped_inserts, ctg_kmers, use_qf, false);
-        }
-      }
-    }
-  }
-  reduce(attempted_inserts, buff_len, &insert_stats->attempted);
-  reduce(dropped_inserts, buff_len, &insert_stats->dropped);
-  reduce(new_inserts, buff_len, &insert_stats->new_inserts);
-  reduce(num_unique_qf, buff_len, &insert_stats->num_unique_qf);
-}
-
-template <int MAX_K>
-struct HashTableGPUDriver<MAX_K>::HashTableDriverState {
-  cudaEvent_t event;
-  QuickTimer insert_timer, kernel_timer;
-  quotient_filter::QF *qf = nullptr;
-};
-
-template <int MAX_K>
-void KmerArray<MAX_K>::set(const uint64_t *kmer) {
-  memcpy(longs, kmer, N_LONGS * sizeof(uint64_t));
-}
-
-template <int MAX_K>
-void KmerCountsMap<MAX_K>::init(int64_t ht_capacity) {
-  capacity = ht_capacity;
-  cudaErrchk(cudaMalloc(&keys, capacity * sizeof(KmerArray<MAX_K>)));
-  cudaErrchk(cudaMemset((void *)keys, KEY_EMPTY_BYTE, capacity * sizeof(KmerArray<MAX_K>)));
-  cudaErrchk(cudaMalloc(&vals, capacity * sizeof(CountsArray)));
-  cudaErrchk(cudaMemset(vals, 0, capacity * sizeof(CountsArray)));
-}
-
-template <int MAX_K>
-void KmerCountsMap<MAX_K>::clear() {
-  cudaFree((void *)keys);
-  cudaFree(vals);
-}
-
-template <int MAX_K>
-void KmerExtsMap<MAX_K>::init(int64_t ht_capacity) {
-  capacity = ht_capacity;
-  cudaErrchk(cudaMalloc(&keys, capacity * sizeof(KmerArray<MAX_K>)));
-  cudaErrchk(cudaMemset((void *)keys, KEY_EMPTY_BYTE, capacity * sizeof(KmerArray<MAX_K>)));
-  cudaErrchk(cudaMalloc(&vals, capacity * sizeof(CountExts)));
-  cudaErrchk(cudaMemset(vals, 0, capacity * sizeof(CountExts)));
-}
-
-template <int MAX_K>
-void KmerExtsMap<MAX_K>::clear() {
-  cudaFree((void *)keys);
-  cudaFree(vals);
-}
-
-template <int MAX_K>
-HashTableGPUDriver<MAX_K>::HashTableGPUDriver() {}
-
-template <int MAX_K>
-void HashTableGPUDriver<MAX_K>::init(int upcxx_rank_me, int upcxx_rank_n, int kmer_len, int max_elems, size_t gpu_avail_mem,
-                                     double &init_time, size_t &gpu_bytes_reqd, size_t &ht_bytes_used, size_t &qf_bytes_used,
-                                     bool use_qf) {
-  QuickTimer init_timer;
-  init_timer.start();
-  this->upcxx_rank_me = upcxx_rank_me;
-  this->upcxx_rank_n = upcxx_rank_n;
-  this->kmer_len = kmer_len;
-  pass_type = READ_KMERS_PASS;
-  gpu_utils::set_gpu_device(upcxx_rank_me);
-  dstate = new HashTableDriverState();
-  dstate->qf = nullptr;
-  // max ratio of singletons to dups
-  uint64_t max_elems_qf = max_elems * 5;
-  int nbits_qf = log2(max_elems_qf);
-  if (nbits_qf == 0) use_qf = false;
-  if (use_qf) {
-    qf_bytes_used = quotient_filter::qf_estimate_memory(nbits_qf);
-    double qf_avail_mem = gpu_avail_mem / 5;
-    // if (!upcxx_rank_me)
-    //  cout << "QF nbits " << nbits_qf << " qf_avail_mem " << qf_avail_mem << " qf bytes used " << qf_bytes_used << "\n" ;
-    if (qf_bytes_used > qf_avail_mem) {
-      // For debugging OOMs
-      // size_t prev_bytes_used = qf_bytes_used;
-      // int prev_nbits = nbits_qf;
-      double factor = qf_avail_mem / qf_bytes_used;
-      size_t corrected_max_elems = (max_elems_qf * factor);
-      nbits_qf = log2(corrected_max_elems) - 1;
-      // drop bits further for really long kmers because the space requirements for the qf relative to the ht go down
-      if (kmer_len >= 96) nbits_qf--;
-      if (nbits_qf == 0) nbits_qf = 1;
-      qf_bytes_used = quotient_filter::qf_estimate_memory(nbits_qf);
-      if (!upcxx_rank_me) cout << "Corrected: QF nbits " << nbits_qf << " qf bytes used " << qf_bytes_used << "\n";
-      /*
-      // uncomment to debug if crashing with OOM when allocating
-      cout << "****** QF nbits corrected to " << nbits_qf << " from " << prev_nbits << "\n";
-      cout << "****** QF will take " << (qf_bytes_used / 1024 / 1024) << "MB instead of " << (prev_bytes_used / 1024 / 1024)
-           << "MB\n";
-      */
-    } else {
-      if (kmer_len >= 64) nbits_qf--;
-    }
-    quotient_filter::qf_malloc_device(&(dstate->qf), nbits_qf);
-  }
-
-  // now check that we have sufficient memory for the required capacity
-  size_t elem_buff_size = KCOUNT_GPU_HASHTABLE_BLOCK_SIZE * (1 + sizeof(count_t)) * 1.5;
-  size_t elem_size = sizeof(KmerArray<MAX_K>) + sizeof(CountsArray);
-  gpu_bytes_reqd = (max_elems * elem_size) + elem_buff_size + qf_bytes_used;
-  // save 1/5 of avail gpu memory for possible ctg kmers and compact hash table
-  // set capacity to max avail remaining from gpu memory - more slots means lower load
-  auto max_slots = (use_qf ? 0.6 : 0.8) * (gpu_avail_mem - elem_buff_size - qf_bytes_used) / elem_size;
-  // find the first prime number lower than this value
-  primes::Prime prime;
-  prime.set(min((size_t)max_slots, (size_t)(max_elems * 3)), false);
-  auto ht_capacity = prime.get();
-  ht_bytes_used = ht_capacity * elem_size;
-
-  // uncomment to debug OOMs
-  // cout << "ht bytes used " << (ht_bytes_used / 1024 / 1024) << "MB\n";
-
-  read_kmers_dev.init(ht_capacity);
-  // for transferring packed elements from host to gpu
-  elem_buff_host.seqs = new char[KCOUNT_GPU_HASHTABLE_BLOCK_SIZE];
-  // these are not used for kmers from reads
-  elem_buff_host.counts = nullptr;
-  // buffer on the device
-  cudaErrchk(cudaMalloc(&packed_elem_buff_dev.seqs, KCOUNT_GPU_HASHTABLE_BLOCK_SIZE));
-  cudaErrchk(cudaMalloc(&unpacked_elem_buff_dev.seqs, KCOUNT_GPU_HASHTABLE_BLOCK_SIZE * 2));
-  packed_elem_buff_dev.counts = nullptr;
-  unpacked_elem_buff_dev.counts = nullptr;
-
-  cudaErrchk(cudaMalloc(&gpu_insert_stats, sizeof(InsertStats)));
-  cudaErrchk(cudaMemset(gpu_insert_stats, 0, sizeof(InsertStats)));
-
-  init_timer.stop();
-  init_time = init_timer.get_elapsed();
-}
-
-template <int MAX_K>
-void HashTableGPUDriver<MAX_K>::init_ctg_kmers(int max_elems, size_t gpu_avail_mem) {
-  pass_type = CTG_KMERS_PASS;
-  // free up space
-  if (dstate->qf) quotient_filter::qf_destroy_device(dstate->qf);
-  dstate->qf = nullptr;
-  size_t elem_buff_size = KCOUNT_GPU_HASHTABLE_BLOCK_SIZE * (1 + sizeof(count_t)) * 1.5;
-  size_t elem_size = sizeof(KmerArray<MAX_K>) + sizeof(CountsArray);
-  size_t max_slots = 0.97 * (gpu_avail_mem - elem_buff_size) / elem_size;
-  primes::Prime prime;
-  prime.set(min(max_slots, (size_t)(max_elems * 3)), false);
-  auto ht_capacity = prime.get();
-  ctg_kmers_dev.init(ht_capacity);
-  elem_buff_host.counts = new count_t[KCOUNT_GPU_HASHTABLE_BLOCK_SIZE];
-  cudaErrchk(cudaMalloc(&packed_elem_buff_dev.counts, KCOUNT_GPU_HASHTABLE_BLOCK_SIZE * sizeof(count_t)));
-  cudaErrchk(cudaMalloc(&unpacked_elem_buff_dev.counts, 2 * KCOUNT_GPU_HASHTABLE_BLOCK_SIZE * sizeof(count_t)));
-  cudaErrchk(cudaMemset(gpu_insert_stats, 0, sizeof(InsertStats)));
-}
-
-template <int MAX_K>
-HashTableGPUDriver<MAX_K>::~HashTableGPUDriver() {
-  if (dstate) {
-    // this happens when there is no ctg kmers pass
-    if (dstate->qf) quotient_filter::qf_destroy_device(dstate->qf);
-    delete dstate;
-  }
-}
-
-template <int MAX_K>
-void HashTableGPUDriver<MAX_K>::insert_supermer_block() {
-  dstate->insert_timer.start();
-  bool is_ctg_kmers = (pass_type == CTG_KMERS_PASS);
-  cudaErrchk(cudaMemcpy(packed_elem_buff_dev.seqs, elem_buff_host.seqs, buff_len, cudaMemcpyHostToDevice));
-  cudaErrchk(cudaMemset(unpacked_elem_buff_dev.seqs, 0, buff_len * 2));
-  if (is_ctg_kmers)
-    cudaErrchk(cudaMemcpy(packed_elem_buff_dev.counts, elem_buff_host.counts, buff_len * sizeof(count_t), cudaMemcpyHostToDevice));
-
-  int gridsize, threadblocksize;
-  dstate->kernel_timer.start();
-  get_kernel_config(buff_len, gpu_unpack_supermer_block, gridsize, threadblocksize);
-  gpu_unpack_supermer_block<<<gridsize, threadblocksize>>>(unpacked_elem_buff_dev, packed_elem_buff_dev, buff_len);
-  get_kernel_config(buff_len * 2, gpu_insert_supermer_block<MAX_K>, gridsize, threadblocksize);
-  // gridsize = gridsize * threadblocksize;
-  // threadblocksize = 1;
-  gpu_insert_supermer_block<<<gridsize, threadblocksize>>>(is_ctg_kmers ? ctg_kmers_dev : read_kmers_dev, unpacked_elem_buff_dev,
-                                                           buff_len * 2, kmer_len, is_ctg_kmers, gpu_insert_stats, dstate->qf);
-  // the kernel time is not going to be accurate, because we are not waiting for the kernel to complete
-  // need to uncomment the line below, which will decrease performance by preventing the overlap of GPU and CPU execution
-  cudaDeviceSynchronize();
-  dstate->kernel_timer.stop();
-  num_gpu_calls++;
-  dstate->insert_timer.stop();
-}
-
-template <int MAX_K>
-void HashTableGPUDriver<MAX_K>::insert_supermer(const string &supermer_seq, count_t supermer_count) {
-  if (buff_len + supermer_seq.length() + 1 >= KCOUNT_GPU_HASHTABLE_BLOCK_SIZE) {
-    insert_supermer_block();
-    buff_len = 0;
-  }
-  memcpy(&(elem_buff_host.seqs[buff_len]), supermer_seq.c_str(), supermer_seq.length());
-  if (pass_type == CTG_KMERS_PASS) {
-    for (int i = 0; i < (int)supermer_seq.length(); i++) elem_buff_host.counts[buff_len + i] = supermer_count;
-  }
-  buff_len += supermer_seq.length();
-  elem_buff_host.seqs[buff_len] = '_';
-  if (pass_type == CTG_KMERS_PASS) elem_buff_host.counts[buff_len] = 0;
-  buff_len++;
-}
-
-template <int MAX_K>
-void HashTableGPUDriver<MAX_K>::purge_invalid(int &num_purged, int &num_entries) {
-  num_purged = num_entries = 0;
-  unsigned int *counts_gpu;
-  int NUM_COUNTS = 2;
-  cudaErrchk(cudaMalloc(&counts_gpu, NUM_COUNTS * sizeof(unsigned int)));
-  cudaErrchk(cudaMemset(counts_gpu, 0, NUM_COUNTS * sizeof(unsigned int)));
-  GPUTimer t;
-  int gridsize, threadblocksize;
-  get_kernel_config(read_kmers_dev.capacity, gpu_purge_invalid<MAX_K>, gridsize, threadblocksize);
-  t.start();
-  // now purge all invalid kmers (do it on the gpu)
-  gpu_purge_invalid<<<gridsize, threadblocksize>>>(read_kmers_dev, counts_gpu);
-  t.stop();
-  dstate->kernel_timer.inc(t.get_elapsed());
-
-  unsigned int counts_host[NUM_COUNTS];
-  cudaErrchk(cudaMemcpy(&counts_host, counts_gpu, NUM_COUNTS * sizeof(unsigned int), cudaMemcpyDeviceToHost));
-  num_purged = counts_host[0];
-  num_entries = counts_host[1];
-  auto expected_num_entries = read_kmers_stats.new_inserts - num_purged;
-  if (num_entries != (int)expected_num_entries)
-    cout << KLRED << "[" << upcxx_rank_me << "] WARNING mismatch " << num_entries << " != " << expected_num_entries << " diff "
-         << (num_entries - (int)expected_num_entries) << " new inserts " << read_kmers_stats.new_inserts << " num purged "
-         << num_purged << KNORM << endl;
-  read_kmers_dev.num = num_entries;
-}
-
-template <int MAX_K>
-void HashTableGPUDriver<MAX_K>::flush_inserts() {
-  if (buff_len) {
-    insert_supermer_block();
-    buff_len = 0;
-  }
-  cudaErrchk(cudaMemcpy(pass_type == READ_KMERS_PASS ? &read_kmers_stats : &ctg_kmers_stats, gpu_insert_stats, sizeof(InsertStats),
-                        cudaMemcpyDeviceToHost));
-}
-
-template <int MAX_K>
-void HashTableGPUDriver<MAX_K>::done_all_inserts(int &num_dropped, int &num_unique, int &num_purged) {
-  int num_entries = 0;
-  purge_invalid(num_purged, num_entries);
-  read_kmers_dev.num = num_entries;
-  if (elem_buff_host.seqs) delete[] elem_buff_host.seqs;
-  if (elem_buff_host.counts) delete[] elem_buff_host.counts;
-  cudaFree(packed_elem_buff_dev.seqs);
-  cudaFree(unpacked_elem_buff_dev.seqs);
-  if (packed_elem_buff_dev.counts) cudaFree(packed_elem_buff_dev.counts);
-  if (unpacked_elem_buff_dev.counts) cudaFree(unpacked_elem_buff_dev.counts);
-  cudaFree(gpu_insert_stats);
-  // overallocate to reduce collisions
-  num_entries *= 1.3;
-  // now compact the hash table entries
-  unsigned int *counts_gpu;
-  int NUM_COUNTS = 2;
-  cudaErrchk(cudaMalloc(&counts_gpu, NUM_COUNTS * sizeof(unsigned int)));
-  cudaErrchk(cudaMemset(counts_gpu, 0, NUM_COUNTS * sizeof(unsigned int)));
-  KmerExtsMap<MAX_K> compact_read_kmers_dev;
-  compact_read_kmers_dev.init(num_entries);
-  GPUTimer t;
-  int gridsize, threadblocksize;
-  get_kernel_config(read_kmers_dev.capacity, gpu_compact_ht<MAX_K>, gridsize, threadblocksize);
-  t.start();
-  gpu_compact_ht<<<gridsize, threadblocksize>>>(read_kmers_dev, compact_read_kmers_dev, counts_gpu);
-  t.stop();
-  dstate->kernel_timer.inc(t.get_elapsed());
-  read_kmers_dev.clear();
-  unsigned int counts_host[NUM_COUNTS];
-  cudaErrchk(cudaMemcpy(&counts_host, counts_gpu, NUM_COUNTS * sizeof(unsigned int), cudaMemcpyDeviceToHost));
-  cudaFree(counts_gpu);
-  num_dropped = counts_host[0];
-  num_unique = counts_host[1];
-  if (num_unique != read_kmers_dev.num)
-    cerr << KLRED << "[" << upcxx_rank_me << "] <gpu_hash_table.cpp:" << __LINE__ << "> WARNING: " << KNORM
-         << "mismatch in expected entries " << num_unique << " != " << read_kmers_dev.num << "\n";
-  // now copy the gpu hash table values across to the host
-  // We only do this once, which requires enough memory on the host to store the full GPU hash table, but since the GPU memory
-  // is generally a lot less than the host memory, it should be fine.
-  output_keys.resize(num_entries);
-  output_vals.resize(num_entries);
-  output_index = 0;
-  cudaErrchk(cudaMemcpy(output_keys.data(), (void *)compact_read_kmers_dev.keys,
-                        compact_read_kmers_dev.capacity * sizeof(KmerArray<MAX_K>), cudaMemcpyDeviceToHost));
-  cudaErrchk(cudaMemcpy(output_vals.data(), compact_read_kmers_dev.vals, compact_read_kmers_dev.capacity * sizeof(CountExts),
-                        cudaMemcpyDeviceToHost));
-  compact_read_kmers_dev.clear();
-}
-
-template <int MAX_K>
-void HashTableGPUDriver<MAX_K>::done_ctg_kmer_inserts(int &attempted_inserts, int &dropped_inserts, int &new_inserts) {
-  unsigned int *counts_gpu;
-  int NUM_COUNTS = 3;
-  cudaErrchk(cudaMalloc(&counts_gpu, NUM_COUNTS * sizeof(unsigned int)));
-  cudaErrchk(cudaMemset(counts_gpu, 0, NUM_COUNTS * sizeof(unsigned int)));
-  GPUTimer t;
-  int gridsize, threadblocksize;
-  get_kernel_config(ctg_kmers_dev.capacity, gpu_merge_ctg_kmers<MAX_K>, gridsize, threadblocksize);
-  t.start();
-  gpu_merge_ctg_kmers<<<gridsize, threadblocksize>>>(read_kmers_dev, ctg_kmers_dev, counts_gpu);
-  t.stop();
-  dstate->kernel_timer.inc(t.get_elapsed());
-  ctg_kmers_dev.clear();
-  unsigned int counts_host[NUM_COUNTS];
-  cudaErrchk(cudaMemcpy(&counts_host, counts_gpu, NUM_COUNTS * sizeof(unsigned int), cudaMemcpyDeviceToHost));
-  cudaFree(counts_gpu);
-  attempted_inserts = counts_host[0];
-  dropped_inserts = counts_host[1];
-  new_inserts = counts_host[2];
-  read_kmers_dev.num += new_inserts;
-  read_kmers_stats.new_inserts += new_inserts;
-}
-
-template <int MAX_K>
-void HashTableGPUDriver<MAX_K>::get_elapsed_time(double &insert_time, double &kernel_time) {
-  insert_time = dstate->insert_timer.get_elapsed();
-  kernel_time = dstate->kernel_timer.get_elapsed();
-}
-
-template <int MAX_K>
-pair<KmerArray<MAX_K> *, CountExts *> HashTableGPUDriver<MAX_K>::get_next_entry() {
-  if (output_keys.empty() || output_index == output_keys.size()) return {nullptr, nullptr};
-  output_index++;
-  return {&(output_keys[output_index - 1]), &(output_vals[output_index - 1])};
-}
-
-template <int MAX_K>
-int64_t HashTableGPUDriver<MAX_K>::get_capacity() {
-  if (pass_type == READ_KMERS_PASS)
-    return read_kmers_dev.capacity;
-  else
-    return ctg_kmers_dev.capacity;
-}
-
-template <int MAX_K>
-int64_t HashTableGPUDriver<MAX_K>::get_final_capacity() {
-  return read_kmers_dev.capacity;
-}
-
-template <int MAX_K>
-InsertStats &HashTableGPUDriver<MAX_K>::get_stats() {
-  if (pass_type == READ_KMERS_PASS)
-    return read_kmers_stats;
-  else
-    return ctg_kmers_stats;
-}
-
-template <int MAX_K>
-int HashTableGPUDriver<MAX_K>::get_num_gpu_calls() {
-  return num_gpu_calls;
-}
-
-template <int MAX_K>
-double HashTableGPUDriver<MAX_K>::get_qf_load_factor() {
-  if (!dstate->qf) return 0;
-  return (double)quotient_filter::host_qf_get_num_occupied_slots(dstate->qf) / quotient_filter::host_qf_get_nslots(dstate->qf);
-}
-
-template class kcount_gpu::HashTableGPUDriver<32>;
-#if MAX_BUILD_KMER >= 64
-template class kcount_gpu::HashTableGPUDriver<64>;
-#endif
-#if MAX_BUILD_KMER >= 96
-template class kcount_gpu::HashTableGPUDriver<96>;
-#endif
-#if MAX_BUILD_KMER >= 128
-template class kcount_gpu::HashTableGPUDriver<128>;
-#endif
-#if MAX_BUILD_KMER >= 160
-template class kcount_gpu::HashTableGPUDriver<160>;
-#endif
diff --git a/src/kcount/kcount-gpu/gpu_hash_table.hpp b/src/kcount/kcount-gpu/gpu_hash_table.hpp
index 0861b86..e69de29 100644
--- a/src/kcount/kcount-gpu/gpu_hash_table.hpp
+++ b/src/kcount/kcount-gpu/gpu_hash_table.hpp
@@ -1,182 +0,0 @@
-#pragma once
-
-/*
- HipMer v 2.0, Copyright (c) 2020, The Regents of the University of California,
- through Lawrence Berkeley National Laboratory (subject to receipt of any required
- approvals from the U.S. Dept. of Energy).  All rights reserved."
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- (1) Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- (2) Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation and/or
- other materials provided with the distribution.
-
- (3) Neither the name of the University of California, Lawrence Berkeley National
- Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to
- endorse or promote products derived from this software without specific prior
- written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
- SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
- DAMAGE.
-
- You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades
- to the features, functionality or performance of the source code ("Enhancements") to
- anyone; however, if you choose to make your Enhancements available either publicly,
- or directly to Lawrence Berkeley National Laboratory, without imposing a separate
- written license agreement for such Enhancements, then you hereby grant the following
- license: a  non-exclusive, royalty-free perpetual license to install, use, modify,
- prepare derivative works, incorporate into other computer software, distribute, and
- sublicense such enhancements or derivative works thereof, in binary and source code
- form.
-*/
-
-#include <vector>
-#include <array>
-#include <unordered_map>
-#include <thread>
-
-#include "hash_funcs.h"
-
-namespace kcount_gpu {
-
-enum PASS_TYPE { READ_KMERS_PASS = 0, CTG_KMERS_PASS = 1 };
-
-using count_t = uint32_t;
-using ext_count_t = uint16_t;
-
-struct CountsArray {
-  count_t kmer_count;
-  ext_count_t ext_counts[8];
-};
-
-struct CountExts {
-  count_t count;
-  int8_t left, right;
-};
-
-template <int MAX_K>
-struct KmerArray {
-  static const int N_LONGS = (MAX_K + 31) / 32;
-  uint64_t longs[N_LONGS];
-
-  void set(const uint64_t *x);
-};
-
-struct SupermerBuff {
-  char *seqs;
-  count_t *counts;
-};
-
-// Bytes used per element:
-// k = 21: 8+20 = 28
-// k = 33, 55: 16+20 = 36
-// k = 77: 24+20 = 44
-// k = 99: 32+20 = 52
-template <int MAX_K>
-struct KmerCountsMap {
-  // Arrays for keys and values. They are separate because the keys get initialized with max number and the vals with zero
-  KmerArray<MAX_K> *keys = nullptr;
-  CountsArray *vals = nullptr;
-  int64_t capacity = 0;
-  int num = 0;
-
-  void init(int64_t ht_capacity);
-  void clear();
-};
-
-template <int MAX_K>
-struct KmerExtsMap {
-  KmerArray<MAX_K> *keys = nullptr;
-  CountExts *vals = nullptr;
-  int64_t capacity = 0;
-
-  void init(int64_t ht_capacity);
-  void clear();
-};
-
-struct InsertStats {
-  unsigned int dropped = 0;
-  unsigned int attempted = 0;
-  unsigned int new_inserts = 0;
-  unsigned int num_unique_qf = 0;
-};
-
-template <int MAX_K>
-class HashTableGPUDriver {
-  static const int N_LONGS = (MAX_K + 31) / 32;
-  struct HashTableDriverState;
-  // stores CUDA specific variables
-  HashTableDriverState *dstate = nullptr;
-
-  int upcxx_rank_me;
-  int upcxx_rank_n;
-  int kmer_len;
-  int buff_len = 0;
-  std::vector<KmerArray<MAX_K>> output_keys;
-  std::vector<CountExts> output_vals;
-  size_t output_index = 0;
-
-  KmerCountsMap<MAX_K> read_kmers_dev;
-  KmerCountsMap<MAX_K> ctg_kmers_dev;
-
-  // for buffering elements in the host memory
-  SupermerBuff elem_buff_host = {0};
-  // for transferring host memory buffer to device
-  SupermerBuff unpacked_elem_buff_dev = {0};
-  SupermerBuff packed_elem_buff_dev = {0};
-
-  InsertStats read_kmers_stats;
-  InsertStats ctg_kmers_stats;
-  InsertStats *gpu_insert_stats;
-  int num_gpu_calls = 0;
-
-  void insert_supermer_block();
-  void purge_invalid(int &num_purged, int &num_entries);
-
- public:
-  PASS_TYPE pass_type;
-
-  HashTableGPUDriver();
-  ~HashTableGPUDriver();
-
-  void init(int upcxx_rank_me, int upcxx_rank_n, int kmer_len, int max_elems, size_t gpu_avail_mem, double &init_time,
-            size_t &gpu_bytes_reqd, size_t &ht_bytes_used, size_t &qf_bytes_used, bool use_qf);
-
-  void init_ctg_kmers(int max_elems, size_t gpu_avail_mem);
-
-  void insert_supermer(const std::string &supermer_seq, count_t supermer_count);
-
-  void flush_inserts();
-
-  void done_ctg_kmer_inserts(int &attempted_inserts, int &dropped_inserts, int &new_inserts);
-
-  void done_all_inserts(int &num_dropped, int &num_unique, int &num_purged);
-
-  std::pair<KmerArray<MAX_K> *, CountExts *> get_next_entry();
-
-  void get_elapsed_time(double &insert_time, double &kernel_time);
-
-  int64_t get_capacity();
-
-  int64_t get_final_capacity();
-  
-  InsertStats &get_stats();
-
-  int get_num_gpu_calls();
-
-  double get_qf_load_factor();
-};
-
-}  // namespace kcount_gpu
diff --git a/src/kcount/kcount-gpu/gqf.cpp b/src/kcount/kcount-gpu/gqf.cpp
index 78ca1c9..e69de29 100644
--- a/src/kcount/kcount-gpu/gqf.cpp
+++ b/src/kcount/kcount-gpu/gqf.cpp
@@ -1,2920 +0,0 @@
-/*
- * ============================================================================
- *
- *        Authors:  Prashant Pandey <ppandey@cs.stonybrook.edu>
- *                  Rob Johnson <robj@vmware.com>
- *
- * ============================================================================
- */
-
-#include <cuda.h>
-#include <cuda_runtime_api.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <string.h>
-#include <inttypes.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <math.h>
-#include <time.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-
-// timing stuff
-#include <chrono>
-#include <iostream>
-#include <cmath>
-
-// how fast is a thrust sort?
-#include <thrust/sort.h>
-#include <thrust/execution_policy.h>
-
-#include "hashutil.hpp"
-#include "gqf.hpp"
-#include "gqf_int.hpp"
-#include "gpu-utils/gpu_common.hpp"
-
-#include <cuda_profiler_api.h>
-
-/******************************************************************
- * Code for managing the metadata bits and slots w/o interpreting *
- * the content of the slots.
- ******************************************************************/
-
-namespace quotient_filter {
-
-#define MAX_VALUE(nbits) ((1ULL << (nbits)) - 1)
-#define BITMASK(nbits) ((nbits) == 64 ? 0xffffffffffffffff : MAX_VALUE(nbits))
-#define NUM_SLOTS_TO_LOCK (1ULL << 13)
-#define EXP_BEFORE_FAILURE -15
-#define CLUSTER_SIZE (1ULL << 14)
-#define METADATA_WORD(qf, field, slot_index) \
-  (get_block((qf), (slot_index) / QF_SLOTS_PER_BLOCK)->field[((slot_index) % QF_SLOTS_PER_BLOCK) / 64])
-
-#define GET_NO_LOCK(flag) (flag & QF_NO_LOCK)
-#define GET_TRY_ONCE_LOCK(flag) (flag & QF_TRY_ONCE_LOCK)
-#define GET_WAIT_FOR_LOCK(flag) (flag & QF_WAIT_FOR_LOCK)
-#define GET_KEY_HASH(flag) (flag & QF_KEY_IS_HASH)
-
-#define NUM_BUFFERS 10
-#define MAX_BUFFER_SIZE 100
-#define LOCK_DIST 1
-
-#define CYCLES_PER_SECOND 1601000000
-
-#define MAX_DEPTH 16
-#define SELECT_BOUND 32
-
-#define DISTANCE_FROM_HOME_SLOT_CUTOFF 1000
-#define BILLION 1000000000L
-#define CUDA_CHECK(ans) gpuAssert((ans), __FILE__, __LINE__);
-inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
-  if (code != cudaSuccess) {
-    printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
-    if (abort) exit(code);
-  }
-}
-
-#ifdef DEBUG
-#define PRINT_DEBUG 1
-#else
-#define PRINT_DEBUG 0
-#endif
-
-#define DEBUG_CQF(fmt, ...)                    \
-  do {                                         \
-    if (PRINT_DEBUG) printf(fmt, __VA_ARGS__); \
-  } while (0)
-
-#define DEBUG_DUMP(qf)                     \
-  do {                                     \
-    if (PRINT_DEBUG) qf_dump_metadata(qf); \
-  } while (0)
-
-#if QF_BITS_PER_SLOT > 0
-__host__ __device__ static inline qfblock *get_block(const QF *qf, uint64_t block_index) { return &qf->blocks[block_index]; }
-#else
-__host__ __device__ static inline qfblock *get_block(const QF *qf, uint64_t block_index) {
-  return (qfblock *)(((char *)qf->blocks) + block_index * (sizeof(qfblock) + QF_SLOTS_PER_BLOCK * qf->metadata->bits_per_slot / 8));
-}
-#endif
-/*
-__device__ static __inline__ unsigned long long rdtsc(void)
-{
-        unsigned hi, lo;
-        __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
-        return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
-}
-*/
-/*
-__host__ __device__ static void modify_metadata(pc_t *metadata, int cnt)
-{
-        pc_add(metadata, cnt);
-        return;
-}
-*/
-/*changing sizes of register based on https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html
-l is for "l" = .u64 reg
-*/
-
-#ifdef __CUDA_ARCH__
-__constant__ char kmer_vals[6] = {'F', 'A', 'C', 'T', 'G', '0'};
-#else
-const char kmer_vals[6] = {'F', 'A', 'C', 'T', 'G', '0'};
-#endif
-
-__host__ __device__ static inline int popcnt(uint64_t val) {
-#ifdef __CUDA_ARCH__
-  val = __popcll(val);
-#else
-
-#ifndef __x86_64
-  val = __builtin_popcount(val);
-
-#else
-
-  asm("popcnt %[val], %[val]" : [val] "+r"(val) : : "cc");
-
-#endif
-
-#endif
-  return val;
-}
-
-/*
-__device__ static inline int64_t bitscanreverse(uint64_t val) {
-  if (val == 0) {
-    return -1;
-  } else {
-    asm("bsr %[val], %[val]" : [val] "+l"(val) : :);
-    return val;
-  }
-}
-*/
-
-__host__ __device__ static inline int popcntv(const uint64_t val, int ignore) {
-  if (ignore % 64)
-    return popcnt(val & ~BITMASK(ignore % 64));
-  else
-    return popcnt(val);
-}
-
-// Returns the number of 1s up to (and including) the pos'th bit
-// Bits are numbered from 0
-__host__ __device__ static inline int bitrank(uint64_t val, int pos) {
-  val = val & ((2ULL << pos) - 1);
-#ifdef __CUDA_ARCH__
-  val = __popcll(val);
-#else
-
-  // quick fix for summit
-
-#ifndef __x86_64
-
-  val = __builtin_popcount(val);
-
-#else
-
-  asm("popcnt %[val], %[val]" : [val] "+r"(val) : : "cc");
-
-#endif
-
-#endif
-  return val;
-}
-
-// moved dump functions
-__host__ __device__ static inline void qf_dump_block(const QF *qf, uint64_t i) {
-  uint64_t j;
-
-  // printf("Block %llu Runs from %llu to %llu\n", i, i * QF_SLOTS_PER_BLOCK, (i + 1) * QF_SLOTS_PER_BLOCK);
-  printf("Offset: %-192d", get_block(qf, i)->offset);
-  printf("\n");
-
-  for (j = 0; j < QF_SLOTS_PER_BLOCK; j++) printf("%02lx ", j);
-  printf("\n");
-
-  for (j = 0; j < QF_SLOTS_PER_BLOCK; j++) printf(" %d ", (get_block(qf, i)->occupieds[j / 64] & (1ULL << (j % 64))) ? 1 : 0);
-  printf("\n");
-
-  for (j = 0; j < QF_SLOTS_PER_BLOCK; j++) printf(" %d ", (get_block(qf, i)->runends[j / 64] & (1ULL << (j % 64))) ? 1 : 0);
-  printf("\n");
-
-#if QF_BITS_PER_SLOT == 8 || QF_BITS_PER_SLOT == 16 || QF_BITS_PER_SLOT == 32
-  for (j = 0; j < QF_SLOTS_PER_BLOCK; j++) printf("%02x ", get_block(qf, i)->slots[j]);
-#elif QF_BITS_PER_SLOT == 64
-  for (j = 0; j < QF_SLOTS_PER_BLOCK; j++) printf("%02lx ", get_block(qf, i)->slots[j]);
-#else
-  for (j = 0; j < QF_SLOTS_PER_BLOCK * qf->metadata->bits_per_slot / 8; j++) printf("%02x ", get_block(qf, i)->slots[j]);
-#endif
-
-  printf("\n");
-
-  printf("\n");
-}
-
-__host__ __device__ void qf_dump_metadata(const QF *qf) {
-  printf("Slots: %lu Occupied: %lu Elements: %lu Distinct: %lu\n", qf->metadata->nslots, qf->metadata->noccupied_slots,
-         qf->metadata->nelts, qf->metadata->ndistinct_elts);
-  printf("Key_bits: %lu Value_bits: %lu Remainder_bits: %lu Bits_per_slot: %lu\n", qf->metadata->key_bits, qf->metadata->value_bits,
-         qf->metadata->key_remainder_bits, qf->metadata->bits_per_slot);
-}
-
-__host__ __device__ void qf_dump(const QF *qf) {
-  uint64_t i;
-
-  printf("%lu %lu %lu\n", qf->metadata->nblocks, qf->metadata->ndistinct_elts, qf->metadata->nelts);
-
-  for (i = 0; i < qf->metadata->nblocks; i++) {
-    qf_dump_block(qf, i);
-  }
-}
-
-/**
- * Returns the position of the k-th 1 in the 64-bit word x.
- * k is 0-based, so k=0 returns the position of the first 1.
- *
- * Uses the broadword selection algorithm by Vigna [1], improved by Gog
- * and Petri [2] and Vigna [3].
- *
- * [1] Sebastiano Vigna. Broadword Implementation of Rank/Select
- *    Queries. WEA, 2008
- *
- * [2] Simon Gog, Matthias Petri. Optimized succinct data
- * structures for massive data. Softw. Pract. Exper., 2014
- *
- * [3] Sebastiano Vigna. MG4J 5.2.1. http://mg4j.di.unimi.it/
- * The following code is taken from
- * https://github.com/facebook/folly/blob/b28186247104f8b90cfbe094d289c91f9e413317/folly/experimental/Select64.h
- */
-__device__ __constant__ uint8_t gpukSelectInByte[2048] = {
-    8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1,
-    0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0,
-    1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7,
-    0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0,
-    2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1,
-    0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 8, 8,
-    8, 1, 8, 2, 2, 1, 8, 3, 3, 1, 3, 2, 2, 1, 8, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3,
-    2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1,
-    4, 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 7, 7,
-    1, 7, 2, 2, 1, 7, 3, 3, 1, 3, 2, 2, 1, 7, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2,
-    2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, 4,
-    3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 8, 8, 8,
-    8, 8, 8, 2, 8, 8, 8, 3, 8, 3, 3, 2, 8, 8, 8, 4, 8, 4, 4, 2, 8, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 5, 8, 5, 5, 2, 8, 5, 5, 3, 5, 3, 3,
-    2, 8, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 6, 8, 6, 6, 2, 8, 6, 6, 3, 6, 3, 3, 2, 8, 6, 6, 4, 6, 4, 4, 2, 6, 4,
-    4, 3, 4, 3, 3, 2, 8, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 7, 8,
-    7, 7, 2, 8, 7, 7, 3, 7, 3, 3, 2, 8, 7, 7, 4, 7, 4, 4, 2, 7, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 5, 7, 5, 5, 2, 7, 5, 5, 3, 5, 3, 3, 2,
-    7, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 6, 7, 6, 6, 2, 7, 6, 6, 3, 6, 3, 3, 2, 7, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4,
-    3, 4, 3, 3, 2, 7, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 3, 8,
-    8, 8, 5, 8, 5, 5, 4, 8, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 3, 8, 8, 8, 6, 8, 6, 6, 4, 8, 6, 6, 4,
-    6, 4, 4, 3, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 3, 8, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8,
-    7, 8, 8, 8, 7, 8, 7, 7, 3, 8, 8, 8, 7, 8, 7, 7, 4, 8, 7, 7, 4, 7, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 3, 8, 7,
-    7, 5, 7, 5, 5, 4, 7, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 3, 8, 7, 7, 6, 7, 6, 6, 4, 7, 6, 6, 4, 6,
-    4, 4, 3, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 3, 7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8,
-    8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6,
-    6, 4, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 7,
-    8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6,
-    4, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8,
-    8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8,
-    8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7};
-
-#ifndef __CUDA_ARCH__
-const uint8_t hostkSelectInByte[2048] = {
-    8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1,
-    0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0,
-    1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7,
-    0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0,
-    2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1,
-    0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 8, 8,
-    8, 1, 8, 2, 2, 1, 8, 3, 3, 1, 3, 2, 2, 1, 8, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3,
-    2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1,
-    4, 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 7, 7,
-    1, 7, 2, 2, 1, 7, 3, 3, 1, 3, 2, 2, 1, 7, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2,
-    2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, 4,
-    3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 8, 8, 8,
-    8, 8, 8, 2, 8, 8, 8, 3, 8, 3, 3, 2, 8, 8, 8, 4, 8, 4, 4, 2, 8, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 5, 8, 5, 5, 2, 8, 5, 5, 3, 5, 3, 3,
-    2, 8, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 6, 8, 6, 6, 2, 8, 6, 6, 3, 6, 3, 3, 2, 8, 6, 6, 4, 6, 4, 4, 2, 6, 4,
-    4, 3, 4, 3, 3, 2, 8, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 7, 8,
-    7, 7, 2, 8, 7, 7, 3, 7, 3, 3, 2, 8, 7, 7, 4, 7, 4, 4, 2, 7, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 5, 7, 5, 5, 2, 7, 5, 5, 3, 5, 3, 3, 2,
-    7, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 6, 7, 6, 6, 2, 7, 6, 6, 3, 6, 3, 3, 2, 7, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4,
-    3, 4, 3, 3, 2, 7, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 3, 8,
-    8, 8, 5, 8, 5, 5, 4, 8, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 3, 8, 8, 8, 6, 8, 6, 6, 4, 8, 6, 6, 4,
-    6, 4, 4, 3, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 3, 8, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8,
-    7, 8, 8, 8, 7, 8, 7, 7, 3, 8, 8, 8, 7, 8, 7, 7, 4, 8, 7, 7, 4, 7, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 3, 8, 7,
-    7, 5, 7, 5, 5, 4, 7, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 3, 8, 7, 7, 6, 7, 6, 6, 4, 7, 6, 6, 4, 6,
-    4, 4, 3, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 3, 7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8,
-    8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6,
-    6, 4, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 7,
-    8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6,
-    4, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8,
-    8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8,
-    8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7};
-#endif
-
-__host__ __device__ static inline uint64_t _select64(uint64_t x, int k) {
-  if (k >= popcnt(x)) {
-    return 64;
-  }
-
-  const uint64_t kOnesStep4 = 0x1111111111111111ULL;
-  const uint64_t kOnesStep8 = 0x0101010101010101ULL;
-  const uint64_t kMSBsStep8 = 0x80ULL * kOnesStep8;
-
-  uint64_t s = x;
-  s = s - ((s & 0xA * kOnesStep4) >> 1);
-  s = (s & 0x3 * kOnesStep4) + ((s >> 2) & 0x3 * kOnesStep4);
-  s = (s + (s >> 4)) & 0xF * kOnesStep8;
-  uint64_t byteSums = s * kOnesStep8;
-
-  uint64_t kStep8 = k * kOnesStep8;
-  uint64_t geqKStep8 = (((kStep8 | kMSBsStep8) - byteSums) & kMSBsStep8);
-  uint64_t place = popcnt(geqKStep8) * 8;
-  uint64_t byteRank = k - (((byteSums << 8) >> place) & (uint64_t)(0xFF));
-#ifdef __CUDA_ARCH__
-  return place + gpukSelectInByte[((x >> place) & 0xFF) | (byteRank << 8)];
-#else
-  return place + hostkSelectInByte[((x >> place) & 0xFF) | (byteRank << 8)];
-#endif  // __CUDA_ARCH__
-}
-
-// Returns the position of the rank'th 1.  (rank = 0 returns the 1st 1)
-// Returns 64 if there are fewer than rank+1 1s.
-__host__ __device__ static inline uint64_t bitselect(uint64_t val, int rank) {
-#ifdef __SSE4_2_
-  uint64_t i = 1ULL << rank;
-  asm("pdep %[val], %[mask], %[val]" : [val] "+r"(val) : [mask] "r"(i));
-  asm("tzcnt %[bit], %[index]" : [index] "=r"(i) : [bit] "g"(val) : "cc");
-  return i;
-#endif
-  return _select64(val, rank);
-}
-
-__host__ __device__ static inline uint64_t bitselectv(const uint64_t val, int ignore, int rank) {
-  return bitselect(val & ~BITMASK(ignore % 64), rank);
-}
-
-__host__ __device__ static inline int is_runend(const QF *qf, uint64_t index) {
-  return (METADATA_WORD(qf, runends, index) >> ((index % QF_SLOTS_PER_BLOCK) % 64)) & 1ULL;
-}
-
-__host__ __device__ static inline int is_occupied(const QF *qf, uint64_t index) {
-  return (METADATA_WORD(qf, occupieds, index) >> ((index % QF_SLOTS_PER_BLOCK) % 64)) & 1ULL;
-}
-
-#if QF_BITS_PER_SLOT == 8 || QF_BITS_PER_SLOT == 16 || QF_BITS_PER_SLOT == 32 || QF_BITS_PER_SLOT == 64
-
-__host__ __device__ static inline uint64_t get_slot(const QF *qf, uint64_t index) {
-  // ERR: Index passed in is incorrect
-  // printf("slots %lu, index %lu\n", qf->metadata->nslots, index);
-  assert(index < qf->metadata->xnslots);
-  return get_block(qf, index / QF_SLOTS_PER_BLOCK)->slots[index % QF_SLOTS_PER_BLOCK];
-}
-
-__host__ __device__ static inline void set_slot(const QF *qf, uint64_t index, uint64_t value) {
-  assert(index < qf->metadata->xnslots);
-  get_block(qf, index / QF_SLOTS_PER_BLOCK)->slots[index % QF_SLOTS_PER_BLOCK] = value & BITMASK(qf->metadata->bits_per_slot);
-}
-
-#elif QF_BITS_PER_SLOT > 0
-
-/* Little-endian code ....  Big-endian is TODO */
-
-__host__ __device__ static inline uint64_t get_slot(const QF *qf, uint64_t index) {
-  /* Should use __uint128_t to support up to 64-bit remainders, but gcc seems
-   * to generate buggy code.  :/  */
-  // printf("Other get slot: slots %lu, index %lu\n", qf->metadata->nslots, index);
-  assert(index < qf->metadata->xnslots);
-  uint64_t *p = (uint64_t *)&get_block(qf, index / QF_SLOTS_PER_BLOCK)->slots[(index % QF_SLOTS_PER_BLOCK) * QF_BITS_PER_SLOT / 8];
-  return (uint64_t)(((*p) >> (((index % QF_SLOTS_PER_BLOCK) * QF_BITS_PER_SLOT) % 8)) & BITMASK(QF_BITS_PER_SLOT));
-}
-
-__host__ __device__ static inline void set_slot(const QF *qf, uint64_t index, uint64_t value) {
-  /* Should use __uint128_t to support up to 64-bit remainders, but gcc seems
-   * to generate buggy code.  :/  */
-  assert(index < qf->metadata->xnslots);
-  uint64_t *p = (uint64_t *)&get_block(qf, index / QF_SLOTS_PER_BLOCK)->slots[(index % QF_SLOTS_PER_BLOCK) * QF_BITS_PER_SLOT / 8];
-  uint64_t t = *p;
-  uint64_t mask = BITMASK(QF_BITS_PER_SLOT);
-  uint64_t v = value;
-  int shift = ((index % QF_SLOTS_PER_BLOCK) * QF_BITS_PER_SLOT) % 8;
-  mask <<= shift;
-  v <<= shift;
-  t &= ~mask;
-  t |= v;
-  *p = t;
-}
-
-#else
-
-/* Little-endian code ....  Big-endian is TODO */
-
-__host__ __device__ static inline uint64_t get_slot(const QF *qf, uint64_t index) {
-  // rintf("Third get slot?!? slots %lu, index %lu\n", qf->metadata->nslots, index);
-  assert(index < qf->metadata->xnslots);
-  /* Should use __uint128_t to support up to 64-bit remainders, but gcc seems
-   * to generate buggy code.  :/  */
-  uint64_t *p =
-      (uint64_t *)&get_block(qf, index / QF_SLOTS_PER_BLOCK)->slots[(index % QF_SLOTS_PER_BLOCK) * qf->metadata->bits_per_slot / 8];
-  return (uint64_t)(((*p) >> (((index % QF_SLOTS_PER_BLOCK) * qf->metadata->bits_per_slot) % 8)) &
-                    BITMASK(qf->metadata->bits_per_slot));
-}
-
-__host__ __device__ static inline void set_slot(const QF *qf, uint64_t index, uint64_t value) {
-  assert(index < qf->metadata->xnslots);
-  /* Should use __uint128_t to support up to 64-bit remainders, but gcc seems
-   * to generate buggy code.  :/  */
-  uint64_t *p =
-      (uint64_t *)&get_block(qf, index / QF_SLOTS_PER_BLOCK)->slots[(index % QF_SLOTS_PER_BLOCK) * qf->metadata->bits_per_slot / 8];
-  uint64_t t = *p;
-  uint64_t mask = BITMASK(qf->metadata->bits_per_slot);
-  uint64_t v = value;
-  int shift = ((index % QF_SLOTS_PER_BLOCK) * qf->metadata->bits_per_slot) % 8;
-  mask <<= shift;
-  v <<= shift;
-  t &= ~mask;
-  t |= v;
-  *p = t;
-}
-
-#endif
-
-__host__ __device__ static inline uint64_t run_end(const QF *qf, uint64_t hash_bucket_index);
-
-__host__ __device__ static inline uint64_t block_offset(const QF *qf, uint64_t blockidx) {
-  /* If we have extended counters and a 16-bit (or larger) offset
-           field, then we can safely ignore the possibility of overflowing
-           that field. */
-  if (sizeof(qf->blocks[0].offset) > 1 || get_block(qf, blockidx)->offset < BITMASK(8 * sizeof(qf->blocks[0].offset)))
-    return get_block(qf, blockidx)->offset;
-
-  return run_end(qf, QF_SLOTS_PER_BLOCK * blockidx - 1) - QF_SLOTS_PER_BLOCK * blockidx + 1;
-}
-
-__host__ __device__ static inline uint64_t run_end(const QF *qf, uint64_t hash_bucket_index) {
-  uint64_t bucket_block_index = hash_bucket_index / QF_SLOTS_PER_BLOCK;
-  uint64_t bucket_intrablock_offset = hash_bucket_index % QF_SLOTS_PER_BLOCK;
-  uint64_t bucket_blocks_offset = block_offset(qf, bucket_block_index);
-
-  uint64_t bucket_intrablock_rank = bitrank(get_block(qf, bucket_block_index)->occupieds[0], bucket_intrablock_offset);
-
-  if (bucket_intrablock_rank == 0) {
-    if (bucket_blocks_offset <= bucket_intrablock_offset)
-      return hash_bucket_index;
-    else
-      return QF_SLOTS_PER_BLOCK * bucket_block_index + bucket_blocks_offset - 1;
-  }
-
-  uint64_t runend_block_index = bucket_block_index + bucket_blocks_offset / QF_SLOTS_PER_BLOCK;
-  uint64_t runend_ignore_bits = bucket_blocks_offset % QF_SLOTS_PER_BLOCK;
-  uint64_t runend_rank = bucket_intrablock_rank - 1;
-  uint64_t runend_block_offset = bitselectv(get_block(qf, runend_block_index)->runends[0], runend_ignore_bits, runend_rank);
-  if (runend_block_offset == QF_SLOTS_PER_BLOCK) {
-    if (bucket_blocks_offset == 0 && bucket_intrablock_rank == 0) {
-      /* The block begins in empty space, and this bucket is in that region of
-       * empty space */
-      return hash_bucket_index;
-    } else {
-      do {
-        runend_rank -= popcntv(get_block(qf, runend_block_index)->runends[0], runend_ignore_bits);
-        runend_block_index++;
-        runend_ignore_bits = 0;
-        runend_block_offset = bitselectv(get_block(qf, runend_block_index)->runends[0], runend_ignore_bits, runend_rank);
-      } while (runend_block_offset == QF_SLOTS_PER_BLOCK);
-    }
-  }
-
-  uint64_t runend_index = QF_SLOTS_PER_BLOCK * runend_block_index + runend_block_offset;
-  if (runend_index < hash_bucket_index)
-    return hash_bucket_index;
-  else
-    return runend_index;
-}
-
-__host__ __device__ static inline int offset_lower_bound(const QF *qf, uint64_t slot_index) {
-  const qfblock *b = get_block(qf, slot_index / QF_SLOTS_PER_BLOCK);
-  const uint64_t slot_offset = slot_index % QF_SLOTS_PER_BLOCK;
-  const uint64_t boffset = b->offset;
-  const uint64_t occupieds = b->occupieds[0] & BITMASK(slot_offset + 1);
-
-  // printf("slot %llu, slot_offset %02lx, block offset %llu, occupieds: %d ", slot_index, slot_offset, boffset, popcnt(occupieds));
-  assert(QF_SLOTS_PER_BLOCK == 64);
-
-  // if (boffset < slot_offset) {
-  if (boffset <= slot_offset) {
-    const uint64_t runends = (b->runends[0] & BITMASK(slot_offset)) >> boffset;
-    // printf(" runends %d\n", popcnt(runends));
-    // printf("boffset < slot_offset, runends %llu, popcnt(occupieds) %d, popcnt(runends) %d\n", runends, popcnt(occupieds),
-    // popcnt(runends)); printf("returning %d\n", popcnt(occupieds)-popcnt(runends));
-    return popcnt(occupieds) - popcnt(runends);
-  }
-  // printf("\n");
-  // printf("boffset > slot_offset, boffset-slotoffset %llu, popcnt(occupieds) %d\n", boffset-slot_offset, popcnt(occupieds));
-  // printf("returning %d\n", boffset-slot_offset+popcnt(occupieds));
-  return boffset - slot_offset + popcnt(occupieds);
-}
-
-/*
-__host__ __device__ static inline int offset_lower_bound_verbose(const QF *qf, uint64_t slot_index) {
-  const qfblock *b = get_block(qf, slot_index / QF_SLOTS_PER_BLOCK);
-  const uint64_t slot_offset = slot_index % QF_SLOTS_PER_BLOCK;
-  const uint64_t boffset = b->offset;
-  const uint64_t occupieds = b->occupieds[0] & BITMASK(slot_offset + 1);
-
-  printf("slot %llu, slot_offset %02lx, block offset %llu, occupieds: %d ", slot_index, slot_offset, boffset, popcnt(occupieds));
-  assert(QF_SLOTS_PER_BLOCK == 64);
-  if (boffset <= slot_offset) {
-    const uint64_t runends = (b->runends[0] & BITMASK(slot_offset)) >> boffset;
-    printf(" runends %d\n", popcnt(runends));
-    // printf("boffset < slot_offset, runends %llu, popcnt(occupieds) %d, popcnt(runends) %d\n", runends, popcnt(occupieds),
-    // popcnt(runends));
-    printf("returning %d\n", popcnt(occupieds) - popcnt(runends));
-    return popcnt(occupieds) - popcnt(runends);
-  }
-  printf("\n");
-  // printf("boffset > slot_offset, boffset-slotoffset %llu, popcnt(occupieds) %d\n", boffset-slot_offset, popcnt(occupieds));
-  printf("returning %ld\n", boffset - slot_offset + popcnt(occupieds));
-  return boffset - slot_offset + popcnt(occupieds);
-}
-*/
-
-__host__ __device__ static inline int is_empty(const QF *qf, uint64_t slot_index) {
-  return offset_lower_bound(qf, slot_index) == 0;
-}
-
-__host__ __device__ static inline int might_be_empty(const QF *qf, uint64_t slot_index) {
-  return !is_occupied(qf, slot_index) && !is_runend(qf, slot_index);
-}
-
-/*
-__device__ static inline int probably_is_empty(const QF *qf, uint64_t slot_index) {
-  return get_slot(qf, slot_index) == 0 && !is_occupied(qf, slot_index) && !is_runend(qf, slot_index);
-}*/
-
-/*
-__host__ __device__ static inline uint64_t find_first_empty_slot_verbose(QF *qf, uint64_t from) {
-  printf("Starting find first - this will terminate in -1\n");
-  qf_dump_block(qf, from / QF_SLOTS_PER_BLOCK);
-  do {
-    int t = offset_lower_bound_verbose(qf, from);
-    // get block of from
-
-    if (t < 0) {
-      printf("Finding first empty slot. T: %d, from: %llu\n - block %llu", t, from, from / QF_SLOTS_PER_BLOCK);
-      qf_dump(qf);
-    }
-    assert(t >= 0);
-    if (t == 0) break;
-    from = from + t;
-  } while (1);
-  printf("Next empty slot: %llu", from);
-  return from;
-}
-*/
-
-__host__ __device__ static inline uint64_t find_first_empty_slot(QF *qf, uint64_t from) {
-  uint64_t start_from = from;
-
-  do {
-    int t = offset_lower_bound(qf, from);
-    // get block of from
-
-    // if (t < 0){
-
-    // 	//this implies a failure in the code - you are going to
-    // 	find_first_empty_slot_verbose(qf, start_from);
-
-    // }
-
-    assert(t >= 0);
-    if (t == 0) break;
-    from = from + t;
-  } while (1);
-
-  uint64_t bucket_start_from = start_from / NUM_SLOTS_TO_LOCK;
-  uint64_t end_start_from = from / NUM_SLOTS_TO_LOCK;
-
-  // testing without this gate to check if we see speed improvements
-  if (end_start_from > bucket_start_from + 1) {
-    printf("Find first empty ran over a bucket: %lu\n", end_start_from - bucket_start_from);
-  }
-
-  return from;
-}
-
-__host__ __device__ static inline uint64_t shift_into_b(const uint64_t a, const uint64_t b, const int bstart, const int bend,
-                                                        const int amount) {
-  const uint64_t a_component = bstart == 0 ? (a >> (64 - amount)) : 0;
-  const uint64_t b_shifted_mask = BITMASK(bend - bstart) << bstart;
-  const uint64_t b_shifted = ((b_shifted_mask & b) << amount) & b_shifted_mask;
-  const uint64_t b_mask = ~b_shifted_mask;
-  return a_component | b_shifted | (b & b_mask);
-}
-
-// __device__ void* gpu_memmove(void* dst, const void* src, size_t n)
-// {
-// 	//printf("Launching memmove\n");
-// 	//todo: allocate space per thread for this buffer before launching the kernel
-// 	void* temp_buffer = malloc(n);
-// 	//maybe stack allocation?
-// 	//void* temp_buffer = void* char[n];
-// 	// cudaMemcpyAsync(temp_buffer, src, n, cudaMemcpyDeviceToDevice);
-// 	// cudaMemcpyAsync(dst, temp_buffer, n, cudaMemcpyDeviceToDevice);
-// 	// //cudaFree(temp_buffer);
-// 	// return dst;
-//   memcpy(temp_buffer, src, n);
-//   memcpy(dst, temp_buffer, n);
-
-//   free(temp_buffer);
-
-// }
-
-// a variant of memmove that compares the two pointers
-__device__ void gpu_memmove(void *dst, const void *src, size_t n) {
-  // printf("Launching memmove\n");
-  // todo: allocate space per thread for this buffer before launching the kernel
-
-  char *char_dst = (char *)dst;
-  char *char_src = (char *)src;
-
-  // double check this,
-  // think it is just > since dst+n does not get copied
-  if (char_src + n > char_dst) {
-    // copy backwards
-    for (int i = n - 1; i >= 0; i--) {
-      char_dst[i] = char_src[i];
-    }
-
-  } else {
-    // copy regular
-    for (int i = 0; i < n; i++) {
-      char_dst[i] = char_src[i];
-    }
-  }
-
-  // free(temp_buffer);
-}
-
-#if QF_BITS_PER_SLOT == 8 || QF_BITS_PER_SLOT == 16 || QF_BITS_PER_SLOT == 32 || QF_BITS_PER_SLOT == 64
-
-__host__ __device__ static inline void shift_remainders(QF *qf, uint64_t start_index, uint64_t empty_index) {
-  uint64_t start_block = start_index / QF_SLOTS_PER_BLOCK;
-  uint64_t start_offset = start_index % QF_SLOTS_PER_BLOCK;
-  uint64_t empty_block = empty_index / QF_SLOTS_PER_BLOCK;
-  uint64_t empty_offset = empty_index % QF_SLOTS_PER_BLOCK;
-
-  assert(start_index <= empty_index);
-  assert(empty_index < qf->metadata->xnslots);
-
-  while (start_block < empty_block) {
-#ifdef __CUDA_ARCH__
-    gpu_memmove(&get_block(qf, empty_block)->slots[1], &get_block(qf, empty_block)->slots[0],
-                empty_offset * sizeof(qf->blocks[0].slots[0]));
-#else
-    memmove(&get_block(qf, empty_block)->slots[1], &get_block(qf, empty_block)->slots[0],
-            empty_offset * sizeof(qf->blocks[0].slots[0]));
-#endif
-
-    get_block(qf, empty_block)->slots[0] = get_block(qf, empty_block - 1)->slots[QF_SLOTS_PER_BLOCK - 1];
-    empty_block--;
-    empty_offset = QF_SLOTS_PER_BLOCK - 1;
-  }
-#ifdef __CUDA_ARCH__
-  gpu_memmove(&get_block(qf, empty_block)->slots[start_offset + 1], &get_block(qf, empty_block)->slots[start_offset],
-              (empty_offset - start_offset) * sizeof(qf->blocks[0].slots[0]));
-#else
-  memmove(&get_block(qf, empty_block)->slots[start_offset + 1], &get_block(qf, empty_block)->slots[start_offset],
-          (empty_offset - start_offset) * sizeof(qf->blocks[0].slots[0]));
-#endif
-}
-
-#else
-
-#define REMAINDER_WORD(qf, i) \
-  ((uint64_t *)&(get_block(qf, (i) / qf->metadata->bits_per_slot)->slots[8 * ((i) % qf->metadata->bits_per_slot)]))
-
-__host__ __device__ static inline void shift_remainders(QF *qf, const uint64_t start_index, const uint64_t empty_index) {
-  uint64_t last_word = (empty_index + 1) * qf->metadata->bits_per_slot / 64;
-  const uint64_t first_word = start_index * qf->metadata->bits_per_slot / 64;
-  int bend = ((empty_index + 1) * qf->metadata->bits_per_slot) % 64;
-  const int bstart = (start_index * qf->metadata->bits_per_slot) % 64;
-
-  while (last_word != first_word) {
-    *REMAINDER_WORD(qf, last_word) =
-        shift_into_b(*REMAINDER_WORD(qf, last_word - 1), *REMAINDER_WORD(qf, last_word), 0, bend, qf->metadata->bits_per_slot);
-    last_word--;
-    bend = 64;
-  }
-  *REMAINDER_WORD(qf, last_word) = shift_into_b(0, *REMAINDER_WORD(qf, last_word), bstart, bend, qf->metadata->bits_per_slot);
-}
-
-#endif
-
-__host__ __device__ static inline void find_next_n_empty_slots(QF *qf, uint64_t from, uint64_t n, uint64_t *indices) {
-  while (n) {
-    indices[--n] = find_first_empty_slot(qf, from);
-    from = indices[n] + 1;
-  }
-}
-
-__host__ __device__ static inline void shift_slots(QF *qf, int64_t first, uint64_t last, uint64_t distance) {
-  int64_t i;
-  if (distance == 1)
-    shift_remainders(qf, first, last + 1);
-  else
-    for (i = last; i >= first; i--) set_slot(qf, i + distance, get_slot(qf, i));
-}
-
-__host__ __device__ static inline void shift_runends(QF *qf, int64_t first, uint64_t last, uint64_t distance) {
-  assert(last < qf->metadata->xnslots && distance < 64);
-  uint64_t first_word = first / 64;
-  uint64_t bstart = first % 64;
-  uint64_t last_word = (last + distance + 1) / 64;
-  uint64_t bend = (last + distance + 1) % 64;
-
-  if (last_word != first_word) {
-    METADATA_WORD(qf, runends, 64 * last_word) = shift_into_b(METADATA_WORD(qf, runends, 64 * (last_word - 1)),
-                                                              METADATA_WORD(qf, runends, 64 * last_word), 0, bend, distance);
-    bend = 64;
-    last_word--;
-    while (last_word != first_word) {
-      METADATA_WORD(qf, runends, 64 * last_word) = shift_into_b(METADATA_WORD(qf, runends, 64 * (last_word - 1)),
-                                                                METADATA_WORD(qf, runends, 64 * last_word), 0, bend, distance);
-      last_word--;
-    }
-  }
-  METADATA_WORD(qf, runends, 64 * last_word) = shift_into_b(0, METADATA_WORD(qf, runends, 64 * last_word), bstart, bend, distance);
-}
-
-__host__ __device__ static inline bool insert_replace_slots_and_shift_remainders_and_runends_and_offsets(
-    QF *qf, int operation, uint64_t bucket_index, uint64_t overwrite_index, const uint64_t *remainders, uint64_t total_remainders,
-    uint64_t noverwrites) {
-  uint64_t empties[67];
-  uint64_t i;
-  int64_t j;
-  int64_t ninserts = total_remainders - noverwrites;
-  uint64_t insert_index = overwrite_index + noverwrites;
-
-  if (ninserts > 0) {
-    /* First, shift things to create n empty spaces where we need them. */
-    find_next_n_empty_slots(qf, insert_index, ninserts, empties);
-    if (empties[0] >= qf->metadata->xnslots) {
-      return false;
-    }
-    for (j = 0; j < ninserts - 1; j++) shift_slots(qf, empties[j + 1] + 1, empties[j] - 1, j + 1);
-    shift_slots(qf, insert_index, empties[ninserts - 1] - 1, ninserts);
-
-    for (j = 0; j < ninserts - 1; j++) shift_runends(qf, empties[j + 1] + 1, empties[j] - 1, j + 1);
-    shift_runends(qf, insert_index, empties[ninserts - 1] - 1, ninserts);
-
-    for (i = noverwrites; i < total_remainders - 1; i++)
-      METADATA_WORD(qf, runends, overwrite_index + i) &= ~(1ULL << (((overwrite_index + i) % QF_SLOTS_PER_BLOCK) % 64));
-
-    switch (operation) {
-      case 0: /* insert into empty bucket */
-        assert(noverwrites == 0);
-        METADATA_WORD(qf, runends, overwrite_index + total_remainders - 1) |=
-            1ULL << (((overwrite_index + total_remainders - 1) % QF_SLOTS_PER_BLOCK) % 64);
-        break;
-      case 1: /* append to bucket */
-        METADATA_WORD(qf, runends, overwrite_index + noverwrites - 1) &=
-            ~(1ULL << (((overwrite_index + noverwrites - 1) % QF_SLOTS_PER_BLOCK) % 64));
-        METADATA_WORD(qf, runends, overwrite_index + total_remainders - 1) |=
-            1ULL << (((overwrite_index + total_remainders - 1) % QF_SLOTS_PER_BLOCK) % 64);
-        break;
-      case 2: /* insert into bucket */
-        METADATA_WORD(qf, runends, overwrite_index + total_remainders - 1) &=
-            ~(1ULL << (((overwrite_index + total_remainders - 1) % QF_SLOTS_PER_BLOCK) % 64));
-        break;
-      default: printf("Invalid operation %d\n", operation);
-#ifdef __CUDA_ARCH__
-        __threadfence();  // ensure store issued before trap
-        asm("trap;");
-#else
-        abort();
-#endif
-    }
-
-    uint64_t npreceding_empties = 0;
-    for (i = bucket_index / QF_SLOTS_PER_BLOCK + 1; i <= empties[0] / QF_SLOTS_PER_BLOCK; i++) {
-      while ((int64_t)npreceding_empties < ninserts && empties[ninserts - 1 - npreceding_empties] / QF_SLOTS_PER_BLOCK < i)
-        npreceding_empties++;
-
-      if (get_block(qf, i)->offset + ninserts - npreceding_empties < BITMASK(8 * sizeof(qf->blocks[0].offset)))
-        get_block(qf, i)->offset += ninserts - npreceding_empties;
-      else
-        get_block(qf, i)->offset = (uint8_t)BITMASK(8 * sizeof(qf->blocks[0].offset));
-    }
-  }
-
-  for (i = 0; i < total_remainders; i++) set_slot(qf, overwrite_index + i, remainders[i]);
-
-  // modify_metadata(&qf->runtimedata->pc_noccupied_slots, ninserts);
-
-  return true;
-}
-
-__host__ __device__ static inline int remove_replace_slots_and_shift_remainders_and_runends_and_offsets(
-    QF *qf, int operation, uint64_t bucket_index, uint64_t overwrite_index, const uint64_t *remainders, uint64_t total_remainders,
-    uint64_t old_length) {
-  uint64_t i;
-
-  // Update the slots
-  for (i = 0; i < total_remainders; i++) set_slot(qf, overwrite_index + i, remainders[i]);
-
-  // If this is the last thing in its run, then we may need to set a new runend bit
-  if (is_runend(qf, overwrite_index + old_length - 1)) {
-    if (total_remainders > 0) {
-      // If we're not deleting this entry entirely, then it will still the last entry in this run
-      METADATA_WORD(qf, runends, overwrite_index + total_remainders - 1) |= 1ULL << ((overwrite_index + total_remainders - 1) % 64);
-    } else if (overwrite_index > bucket_index && !is_runend(qf, overwrite_index - 1)) {
-      // If we're deleting this entry entirely, but it is not the first entry in this run,
-      // then set the preceding entry to be the runend
-      METADATA_WORD(qf, runends, overwrite_index - 1) |= 1ULL << ((overwrite_index - 1) % 64);
-    }
-  }
-
-  // shift slots back one run at a time
-  uint64_t original_bucket = bucket_index;
-  uint64_t current_bucket = bucket_index;
-  uint64_t current_slot = overwrite_index + total_remainders;
-  uint64_t current_distance = old_length - total_remainders;
-  int ret_current_distance = current_distance;
-
-  while (current_distance > 0) {
-    if (is_runend(qf, current_slot + current_distance - 1)) {
-      do {
-        current_bucket++;
-      } while (current_bucket < current_slot + current_distance && !is_occupied(qf, current_bucket));
-    }
-
-    if (current_bucket <= current_slot) {
-      set_slot(qf, current_slot, get_slot(qf, current_slot + current_distance));
-      if (is_runend(qf, current_slot) != is_runend(qf, current_slot + current_distance))
-        METADATA_WORD(qf, runends, current_slot) ^= 1ULL << (current_slot % 64);
-      current_slot++;
-
-    } else if (current_bucket <= current_slot + current_distance) {
-      uint64_t i;
-      for (i = current_slot; i < current_slot + current_distance; i++) {
-        set_slot(qf, i, 0);
-        METADATA_WORD(qf, runends, i) &= ~(1ULL << (i % 64));
-      }
-
-      current_distance = current_slot + current_distance - current_bucket;
-      current_slot = current_bucket;
-    } else {
-      current_distance = 0;
-    }
-  }
-
-  // reset the occupied bit of the hash bucket index if the hash is the
-  // only item in the run and is removed completely.
-  if (operation && !total_remainders) METADATA_WORD(qf, occupieds, bucket_index) &= ~(1ULL << (bucket_index % 64));
-
-  // update the offset bits.
-  // find the number of occupied slots in the original_bucket block.
-  // Then find the runend slot corresponding to the last run in the
-  // original_bucket block.
-  // Update the offset of the block to which it belongs.
-  uint64_t original_block = original_bucket / QF_SLOTS_PER_BLOCK;
-  if (old_length > total_remainders) {  // we only update offsets if we shift/delete anything
-    while (1) {
-      uint64_t last_occupieds_hash_index = QF_SLOTS_PER_BLOCK * original_block + (QF_SLOTS_PER_BLOCK - 1);
-      uint64_t runend_index = run_end(qf, last_occupieds_hash_index);
-      // runend spans across the block
-      // update the offset of the next block
-      if (runend_index / QF_SLOTS_PER_BLOCK == original_block) {  // if the run ends in the same block
-        if (get_block(qf, original_block + 1)->offset == 0) break;
-        get_block(qf, original_block + 1)->offset = 0;
-      } else {  // if the last run spans across the block
-        if (get_block(qf, original_block + 1)->offset == (runend_index - last_occupieds_hash_index)) break;
-        get_block(qf, original_block + 1)->offset = (runend_index - last_occupieds_hash_index);
-      }
-      original_block++;
-    }
-  }
-
-  // int num_slots_freed = old_length - total_remainders;
-  // modify_metadata(&qf->runtimedata->pc_noccupied_slots, -num_slots_freed);
-  /*qf->metadata->noccupied_slots -= (old_length - total_remainders);*/
-  if (!total_remainders) {
-    // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, -1);
-    /*qf->metadata->ndistinct_elts--;*/
-  }
-
-  return ret_current_distance;
-}
-
-/*****************************************************************************
- * Code that uses the above to implement a QF with keys and inline counters. *
- *****************************************************************************/
-
-/*
-         Counter format:
-         0 xs:    <empty string>
-         1 x:     x
-         2 xs:    xx
-         3 0s:    000
-         >2 xs:   xbc...cx  for x != 0, b < x, c != 0, x
-         >3 0s:   0c...c00  for c != 0
-         */
-__host__ __device__ static inline uint64_t *encode_counter(QF *qf, uint64_t remainder, uint64_t counter, uint64_t *slots) {
-  uint64_t digit = remainder;
-  uint64_t base = (1ULL << qf->metadata->bits_per_slot) - 1;
-  uint64_t *p = slots;
-
-  if (counter == 0) return p;
-
-  *--p = remainder;
-
-  if (counter == 1) return p;
-
-  if (counter == 2) {
-    *--p = remainder;
-    return p;
-  }
-
-  if (counter == 3 && remainder == 0) {
-    *--p = remainder;
-    *--p = remainder;
-    return p;
-  }
-
-  if (counter == 3 && remainder > 0) {
-    *--p = 0;
-    *--p = remainder;
-    return p;
-  }
-
-  if (remainder == 0)
-    *--p = remainder;
-  else
-    base--;
-
-  if (remainder)
-    counter -= 3;
-  else
-    counter -= 4;
-  do {
-    digit = counter % base;
-    digit++;                                      /* Zero not allowed */
-    if (remainder && digit >= remainder) digit++; /* Cannot overflow since digit is mod 2^r-2 */
-    *--p = digit;
-    counter /= base;
-  } while (counter);
-
-  if (remainder && digit >= remainder) *--p = 0;
-
-  *--p = remainder;
-
-  return p;
-}
-
-/* Returns the length of the encoding.
-REQUIRES: index points to first slot of a counter. */
-__host__ __device__ static inline uint64_t decode_counter(const QF *qf, uint64_t index, uint64_t *remainder, uint64_t *count) {
-  uint64_t base;
-  uint64_t rem;
-  uint64_t cnt;
-  uint64_t digit;
-  uint64_t end;
-
-  *remainder = rem = get_slot(qf, index);
-
-  if (is_runend(qf, index)) { /* Entire run is "0" */
-    *count = 1;
-    return index;
-  }
-
-  digit = get_slot(qf, index + 1);
-
-  if (is_runend(qf, index + 1)) {
-    *count = digit == rem ? 2 : 1;
-    return index + (digit == rem ? 1 : 0);
-  }
-
-  if (rem > 0 && digit >= rem) {
-    *count = digit == rem ? 2 : 1;
-    return index + (digit == rem ? 1 : 0);
-  }
-
-  if (rem > 0 && digit == 0 && get_slot(qf, index + 2) == rem) {
-    *count = 3;
-    return index + 2;
-  }
-
-  if (rem == 0 && digit == 0) {
-    if (get_slot(qf, index + 2) == 0) {
-      *count = 3;
-      return index + 2;
-    } else {
-      *count = 2;
-      return index + 1;
-    }
-  }
-
-  cnt = 0;
-  base = (1ULL << qf->metadata->bits_per_slot) - (rem ? 2 : 1);
-
-  end = index + 1;
-  while (digit != rem && !is_runend(qf, end)) {
-    if (digit > rem) digit--;
-    if (digit && rem) digit--;
-    cnt = cnt * base + digit;
-
-    end++;
-    digit = get_slot(qf, end);
-  }
-
-  if (rem) {
-    *count = cnt + 3;
-    return end;
-  }
-
-  if (is_runend(qf, end) || get_slot(qf, end + 1) != 0) {
-    *count = 1;
-    return index;
-  }
-
-  *count = cnt + 4;
-  return end + 1;
-}
-
-/* return the next slot which corresponds to a
- * different element
- * */
-/*
-__device__ static inline uint64_t next_slot(QF *qf, uint64_t current) {
-  uint64_t rem = get_slot(qf, current);
-  current++;
-
-  while (get_slot(qf, current) == rem && current <= qf->metadata->nslots) {
-    current++;
-  }
-  return current;
-}
-*/
-
-__host__ __device__ static inline qf_returns insert1_if_not_exists(QF *qf, __uint64_t hash, uint64_t *value) {
-  uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot);
-  uint64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot;
-  uint64_t hash_bucket_block_offset = hash_bucket_index % QF_SLOTS_PER_BLOCK;
-
-  // printf("In insert1, Index is %llu, block_offset is %llu, remainder is %llu \n", hash_bucket_index, hash_bucket_block_offset,
-  // hash_remainder);
-
-  if (is_empty(qf, hash_bucket_index) /* might_be_empty(qf, hash_bucket_index) && runend_index == hash_bucket_index */) {
-    METADATA_WORD(qf, runends, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64);
-    set_slot(qf, hash_bucket_index, hash_remainder);
-    METADATA_WORD(qf, occupieds, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64);
-  } else {
-    uint64_t runend_index = run_end(qf, hash_bucket_index);
-    int operation = 0; /* Insert into empty bucket */
-    uint64_t insert_index = runend_index + 1;
-    uint64_t new_value = hash_remainder;
-
-    /* printf("RUNSTART: %02lx RUNEND: %02lx\n", runstart_index, runend_index); */
-
-    uint64_t runstart_index = hash_bucket_index == 0 ? 0 : run_end(qf, hash_bucket_index - 1) + 1;
-
-    if (is_occupied(qf, hash_bucket_index)) {
-      /* Find the counter for this remainder if it exists. */
-      uint64_t current_remainder = get_slot(qf, runstart_index);
-
-      // printf("Current remainder above: %llu\n", current_remainder);
-      *value = current_remainder & BITMASK(qf->metadata->value_bits);
-      // printf("Clipped remainder: %llu\n", *value);
-
-      // return here?
-      // maybe qf_returns::QF_ITEM_FOUND
-      return QF_ITEM_FOUND;
-
-    }  // else {
-       // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1);
-    //}
-
-    // Here is where we modify?
-    if (operation != 0) {
-      // extract
-      uint64_t current_remainder = get_slot(qf, runstart_index);
-      printf("Expecting an extraction, Current remainder: %llu\n", (unsigned long long)current_remainder);
-    } else {
-      printf("Expecting regular insert. \n");
-    }
-
-    if (operation >= 0) {
-      uint64_t empty_slot_index = find_first_empty_slot(qf, runend_index + 1);
-      if (empty_slot_index >= qf->metadata->xnslots) {
-        printf("Ran out of space. Total xnslots is %lu, first empty slot is %lu\n", qf->metadata->xnslots, empty_slot_index);
-        return QF_FULL;
-      }
-      shift_remainders(qf, insert_index, empty_slot_index);
-
-      set_slot(qf, insert_index, new_value);
-
-      shift_runends(qf, insert_index, empty_slot_index - 1, 1);
-      switch (operation) {
-        case 0: METADATA_WORD(qf, runends, insert_index) |= 1ULL << ((insert_index % QF_SLOTS_PER_BLOCK) % 64); break;
-        case 1:
-          METADATA_WORD(qf, runends, insert_index - 1) &= ~(1ULL << (((insert_index - 1) % QF_SLOTS_PER_BLOCK) % 64));
-          METADATA_WORD(qf, runends, insert_index) |= 1ULL << ((insert_index % QF_SLOTS_PER_BLOCK) % 64);
-          break;
-        case 2: METADATA_WORD(qf, runends, insert_index) &= ~(1ULL << ((insert_index % QF_SLOTS_PER_BLOCK) % 64)); break;
-        default: printf("Invalid operation %d\n", operation);
-#ifdef __CUDA_ARCH__
-          __threadfence();  // ensure store issued before trap
-          asm("trap;");
-#else
-          abort();
-#endif
-      }
-      /*
-       * Increment the offset for each block between the hash bucket index
-       * and block of the empty slot
-       * */
-      uint64_t i;
-      for (i = hash_bucket_index / QF_SLOTS_PER_BLOCK + 1; i <= empty_slot_index / QF_SLOTS_PER_BLOCK; i++) {
-        if (get_block(qf, i)->offset < BITMASK(8 * sizeof(qf->blocks[0].offset))) get_block(qf, i)->offset++;
-        assert(get_block(qf, i)->offset != 0);
-      }
-    }
-    METADATA_WORD(qf, occupieds, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64);
-  }
-
-  // change here?
-  return QF_ITEM_INSERTED;
-}
-
-__host__ __device__ static inline int insert1(QF *qf, __uint64_t hash, uint8_t runtime_lock) {
-  int ret_distance = 0;
-  uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot);
-  uint64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot;
-  uint64_t hash_bucket_block_offset = hash_bucket_index % QF_SLOTS_PER_BLOCK;
-  /*
-  if (GET_NO_LOCK(runtime_lock) != QF_NO_LOCK) {
-          if (!qf_lock(qf, hash_bucket_index,  true, runtime_lock))
-                  return QF_COULDNT_LOCK;
-  }
-  */
-  // printf("In insert1, Index is %llu, block_offset is %llu, remainder is %llu \n", hash_bucket_index, hash_bucket_block_offset,
-  // hash_remainder);
-
-#ifdef __CUDA_ARCH__
-  atomicAdd((unsigned long long *)&qf->metadata->noccupied_slots, 1ULL);
-#else
-  abort();
-#endif
-
-  if (is_empty(qf, hash_bucket_index) /* might_be_empty(qf, hash_bucket_index) && runend_index == hash_bucket_index */) {
-    METADATA_WORD(qf, runends, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64);
-    set_slot(qf, hash_bucket_index, hash_remainder);
-    METADATA_WORD(qf, occupieds, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64);
-
-    ret_distance = 0;
-    // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1);
-    // modify_metadata(&qf->runtimedata->pc_noccupied_slots, 1);
-    // modify_metadata(&qf->runtimedata->pc_nelts, 1);
-  } else {
-    uint64_t runend_index = run_end(qf, hash_bucket_index);
-    int operation = 0; /* Insert into empty bucket */
-    uint64_t insert_index = runend_index + 1;
-    uint64_t new_value = hash_remainder;
-
-    /* printf("RUNSTART: %02lx RUNEND: %02lx\n", runstart_index, runend_index); */
-
-    uint64_t runstart_index = hash_bucket_index == 0 ? 0 : run_end(qf, hash_bucket_index - 1) + 1;
-
-    if (is_occupied(qf, hash_bucket_index)) {
-      /* Find the counter for this remainder if it exists. */
-      uint64_t current_remainder = get_slot(qf, runstart_index);
-      uint64_t zero_terminator = runstart_index;
-
-      /* The counter for 0 is special. */
-      if (current_remainder == 0) {
-        uint64_t t = runstart_index + 1;
-        while (t < runend_index && get_slot(qf, t) != 0) t++;
-        if (t < runend_index && get_slot(qf, t + 1) == 0)
-          zero_terminator = t + 1; /* Three or more 0s */
-        else if (runstart_index < runend_index && get_slot(qf, runstart_index + 1) == 0)
-          zero_terminator = runstart_index + 1; /* Exactly two 0s */
-        /* Otherwise, exactly one 0 (i.e. zero_terminator == runstart_index) */
-
-        /* May read past end of run, but that's OK because loop below
-                 can handle that */
-        if (hash_remainder != 0) {
-          runstart_index = zero_terminator + 1;
-          current_remainder = get_slot(qf, runstart_index);
-        }
-      }
-
-      /* Skip over counters for other remainders. */
-      while (current_remainder < hash_remainder && runstart_index <= runend_index) {
-        /* If this remainder has an extended counter, skip over it. */
-        if (runstart_index < runend_index && get_slot(qf, runstart_index + 1) < current_remainder) {
-          runstart_index = runstart_index + 2;
-          while (runstart_index < runend_index && get_slot(qf, runstart_index) != current_remainder) runstart_index++;
-          runstart_index++;
-
-          /* This remainder has a simple counter. */
-        } else {
-          runstart_index++;
-        }
-
-        /* This may read past the end of the run, but the while loop
-                 condition will prevent us from using the invalid result in
-                 that case. */
-        current_remainder = get_slot(qf, runstart_index);
-      }
-
-      /* If this is the first time we've inserted the new remainder,
-               and it is larger than any remainder in the run. */
-      if (runstart_index > runend_index) {
-        operation = 1;
-        insert_index = runstart_index;
-        new_value = hash_remainder;
-        // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1);
-
-        /* This is the first time we're inserting this remainder, but
-                 there are larger remainders already in the run. */
-      } else if (current_remainder != hash_remainder) {
-        operation = 2; /* Inserting */
-        insert_index = runstart_index;
-        new_value = hash_remainder;
-        // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1);
-
-        /* Cases below here: we're incrementing the (simple or
-                 extended) counter for this remainder. */
-
-        /* If there's exactly one instance of this remainder. */
-      } else if (runstart_index == runend_index || (hash_remainder > 0 && get_slot(qf, runstart_index + 1) > hash_remainder) ||
-                 (hash_remainder == 0 && zero_terminator == runstart_index)) {
-        operation = 2; /* Insert */
-        insert_index = runstart_index;
-        new_value = hash_remainder;
-
-        /* If there are exactly two instances of this remainder. */
-      } else if ((hash_remainder > 0 && get_slot(qf, runstart_index + 1) == hash_remainder) ||
-                 (hash_remainder == 0 && zero_terminator == runstart_index + 1)) {
-        operation = 2; /* Insert */
-        insert_index = runstart_index + 1;
-        new_value = 0;
-
-        /* Special case for three 0s */
-      } else if (hash_remainder == 0 && zero_terminator == runstart_index + 2) {
-        operation = 2; /* Insert */
-        insert_index = runstart_index + 1;
-        new_value = 1;
-
-        /* There is an extended counter for this remainder. */
-      } else {
-        /* Move to the LSD of the counter. */
-        insert_index = runstart_index + 1;
-        while (get_slot(qf, insert_index + 1) != hash_remainder) insert_index++;
-
-        /* Increment the counter. */
-        uint64_t digit, carry;
-        do {
-          carry = 0;
-          digit = get_slot(qf, insert_index);
-          // Convert a leading 0 (which is special) to a normal encoded digit
-          if (digit == 0) {
-            digit++;
-            if (digit == current_remainder) digit++;
-          }
-
-          // Increment the digit
-          digit = (digit + 1) & BITMASK(qf->metadata->bits_per_slot);
-
-          // Ensure digit meets our encoding requirements
-          if (digit == 0) {
-            digit++;
-            carry = 1;
-          }
-          if (digit == current_remainder) digit = (digit + 1) & BITMASK(qf->metadata->bits_per_slot);
-          if (digit == 0) {
-            digit++;
-            carry = 1;
-          }
-
-          set_slot(qf, insert_index, digit);
-          insert_index--;
-        } while (insert_index > runstart_index && carry);
-
-        /* If the counter needs to be expanded. */
-        if (insert_index == runstart_index && (carry > 0 || (current_remainder != 0 && digit >= current_remainder))) {
-          operation = 2; /* insert */
-          insert_index = runstart_index + 1;
-          if (!carry) /* To prepend a 0 before the counter if the MSD is greater than the rem */
-            new_value = 0;
-          else if (carry) { /* Increment the new value because we don't use 0 to encode counters */
-            new_value = 2;
-            /* If the rem is greater than or equal to the new_value then fail*/
-            if (current_remainder > 0) assert(new_value < current_remainder);
-          }
-        } else {
-          operation = -1;
-        }
-      }
-    }  // else {
-       // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1);
-    //}
-
-    if (operation >= 0) {
-      uint64_t empty_slot_index = find_first_empty_slot(qf, runend_index + 1);
-      if (empty_slot_index >= qf->metadata->xnslots) {
-        printf("Ran out of space. Total xnslots is %lu, first empty slot is %lu\n", qf->metadata->xnslots, empty_slot_index);
-        return QF_NO_SPACE;
-      }
-      shift_remainders(qf, insert_index, empty_slot_index);
-
-      set_slot(qf, insert_index, new_value);
-      ret_distance = insert_index - hash_bucket_index;
-
-      shift_runends(qf, insert_index, empty_slot_index - 1, 1);
-      switch (operation) {
-        case 0: METADATA_WORD(qf, runends, insert_index) |= 1ULL << ((insert_index % QF_SLOTS_PER_BLOCK) % 64); break;
-        case 1:
-          METADATA_WORD(qf, runends, insert_index - 1) &= ~(1ULL << (((insert_index - 1) % QF_SLOTS_PER_BLOCK) % 64));
-          METADATA_WORD(qf, runends, insert_index) |= 1ULL << ((insert_index % QF_SLOTS_PER_BLOCK) % 64);
-          break;
-        case 2: METADATA_WORD(qf, runends, insert_index) &= ~(1ULL << ((insert_index % QF_SLOTS_PER_BLOCK) % 64)); break;
-        default: printf("Invalid operation %d\n", operation);
-#ifdef __CUDA_ARCH__
-          __threadfence();  // ensure store issued before trap
-          asm("trap;");
-#else
-          abort();
-#endif
-      }
-      /*
-       * Increment the offset for each block between the hash bucket index
-       * and block of the empty slot
-       * */
-      uint64_t i;
-      for (i = hash_bucket_index / QF_SLOTS_PER_BLOCK + 1; i <= empty_slot_index / QF_SLOTS_PER_BLOCK; i++) {
-        if (get_block(qf, i)->offset < BITMASK(8 * sizeof(qf->blocks[0].offset))) get_block(qf, i)->offset++;
-        assert(get_block(qf, i)->offset != 0);
-      }
-      // modify_metadata(&qf->runtimedata->pc_noccupied_slots, 1);
-    }
-    // modify_metadata(&qf->runtimedata->pc_nelts, 1);
-    METADATA_WORD(qf, occupieds, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64);
-  }
-  /*
-  if (GET_NO_LOCK(runtime_lock) != QF_NO_LOCK) {
-          qf_unlock(qf, hash_bucket_index, true);
-  }
-  */
-  return ret_distance;
-}
-
-__host__ __device__ static inline int insert(QF *qf, __uint64_t hash, uint64_t count, uint8_t runtime_lock) {
-  int ret_distance = 0;
-  uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot);
-  uint64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot;
-  uint64_t hash_bucket_block_offset = hash_bucket_index % QF_SLOTS_PER_BLOCK;
-  /*uint64_t hash_bucket_lock_offset  = hash_bucket_index % NUM_SLOTS_TO_LOCK;*/
-  /*
-  if (GET_NO_LOCK(runtime_lock) != QF_NO_LOCK) {
-          if (!qf_lock(qf, hash_bucket_index,  false, runtime_lock))
-                  return QF_COULDNT_LOCK;
-  }
-  */
-  uint64_t runend_index = run_end(qf, hash_bucket_index);
-
-  /* Empty slot */
-  if (might_be_empty(qf, hash_bucket_index) && runend_index == hash_bucket_index) {
-    METADATA_WORD(qf, runends, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64);
-    set_slot(qf, hash_bucket_index, hash_remainder);
-    METADATA_WORD(qf, occupieds, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64);
-
-    // ERIC TODO: see if this metadata is needed--probably isn't compatible with GPU
-    // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1);
-    // modify_metadata(&qf->runtimedata->pc_noccupied_slots, 1);
-    // modify_metadata(&qf->runtimedata->pc_nelts, 1);
-    /* This trick will, I hope, keep the fast case fast. */
-    if (count > 1) {
-      insert(qf, hash, count - 1, QF_NO_LOCK);
-    }
-  } else { /* Non-empty slot */
-    uint64_t new_values[67];
-    int64_t runstart_index = hash_bucket_index == 0 ? 0 : run_end(qf, hash_bucket_index - 1) + 1;
-
-    bool ret;
-    if (!is_occupied(qf, hash_bucket_index)) { /* Empty bucket, but its slot is occupied. */
-      uint64_t *p = encode_counter(qf, hash_remainder, count, &new_values[67]);
-      ret = insert_replace_slots_and_shift_remainders_and_runends_and_offsets(qf, 0, hash_bucket_index, runstart_index, p,
-                                                                              &new_values[67] - p, 0);
-      if (!ret) return QF_NO_SPACE;
-      // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1);
-      ret_distance = runstart_index - hash_bucket_index;
-    } else { /* Non-empty bucket */
-
-      uint64_t current_remainder, current_count, current_end;
-
-      /* Find the counter for this remainder, if one exists. */
-      current_end = decode_counter(qf, runstart_index, &current_remainder, &current_count);
-      while (current_remainder < hash_remainder && !is_runend(qf, current_end)) {
-        runstart_index = current_end + 1;
-        current_end = decode_counter(qf, runstart_index, &current_remainder, &current_count);
-      }
-
-      /* If we reached the end of the run w/o finding a counter for this remainder,
-               then append a counter for this remainder to the run. */
-      if (current_remainder < hash_remainder) {
-        uint64_t *p = encode_counter(qf, hash_remainder, count, &new_values[67]);
-        ret = insert_replace_slots_and_shift_remainders_and_runends_and_offsets(qf, 1, /* Append to bucket */ hash_bucket_index,
-                                                                                current_end + 1, p, &new_values[67] - p, 0);
-        if (!ret) return QF_NO_SPACE;
-        // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1);
-        ret_distance = (current_end + 1) - hash_bucket_index;
-        /* Found a counter for this remainder.  Add in the new count. */
-      } else if (current_remainder == hash_remainder) {
-        uint64_t *p = encode_counter(qf, hash_remainder, current_count + count, &new_values[67]);
-        ret = insert_replace_slots_and_shift_remainders_and_runends_and_offsets(
-            qf, is_runend(qf, current_end) ? 1 : 2, hash_bucket_index, runstart_index, p, &new_values[67] - p,
-            current_end - runstart_index + 1);
-        if (!ret) return QF_NO_SPACE;
-        ret_distance = runstart_index - hash_bucket_index;
-        /* No counter for this remainder, but there are larger
-                 remainders, so we're not appending to the bucket. */
-      } else {
-        uint64_t *p = encode_counter(qf, hash_remainder, count, &new_values[67]);
-        ret = insert_replace_slots_and_shift_remainders_and_runends_and_offsets(qf, 2, /* Insert to bucket */
-                                                                                hash_bucket_index, runstart_index, p,
-                                                                                &new_values[67] - p, 0);
-        if (!ret) return QF_NO_SPACE;
-        // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1);
-        ret_distance = runstart_index - hash_bucket_index;
-      }
-    }
-    METADATA_WORD(qf, occupieds, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64);
-
-    // modify_metadata(&qf->runtimedata->pc_nelts, count);
-  }
-  /*
-  if (GET_NO_LOCK(runtime_lock) != QF_NO_LOCK) {
-          qf_unlock(qf, hash_bucket_index,  false);
-  }
-  */
-  return ret_distance;
-}
-
-__host__ __device__ inline static int _remove(QF *qf, __uint64_t hash, uint64_t count, uint8_t runtime_lock) {
-  int ret_numfreedslots = 0;
-  uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot);
-  uint64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot;
-  uint64_t current_remainder, current_count, current_end;
-  uint64_t new_values[67];
-  /*
-  if (GET_NO_LOCK(runtime_lock) != QF_NO_LOCK) {
-          if (!qf_lock(qf, hash_bucket_index,  false, runtime_lock))
-                  return -2;
-  }
-  */
-
-  /* Empty bucket */
-  if (!is_occupied(qf, hash_bucket_index)) return -1;
-
-  uint64_t runstart_index = hash_bucket_index == 0 ? 0 : run_end(qf, hash_bucket_index - 1) + 1;
-  uint64_t original_runstart_index = runstart_index;
-  int only_item_in_the_run = 0;
-
-  /*Find the counter for this remainder, if one exists.*/
-  current_end = decode_counter(qf, runstart_index, &current_remainder, &current_count);
-  while (current_remainder < hash_remainder && !is_runend(qf, current_end)) {
-    runstart_index = current_end + 1;
-    current_end = decode_counter(qf, runstart_index, &current_remainder, &current_count);
-  }
-  /* remainder not found in the given run */
-  if (current_remainder != hash_remainder) return -1;
-
-  if (original_runstart_index == runstart_index && is_runend(qf, current_end)) only_item_in_the_run = 1;
-
-  /* endode the new counter */
-  uint64_t *p = encode_counter(qf, hash_remainder, count > current_count ? 0 : current_count - count, &new_values[67]);
-  ret_numfreedslots = remove_replace_slots_and_shift_remainders_and_runends_and_offsets(
-      qf, only_item_in_the_run, hash_bucket_index, runstart_index, p, &new_values[67] - p, current_end - runstart_index + 1);
-
-  // update the nelements.
-  // modify_metadata(&qf->runtimedata->pc_nelts, -count);
-  /*qf->metadata->nelts -= count;*/
-  /*
-  if (GET_NO_LOCK(runtime_lock) != QF_NO_LOCK) {
-          qf_unlock(qf, hash_bucket_index, false);
-  }
-  */
-  return ret_numfreedslots;
-}
-
-/***********************************************************************
- * Code that uses the above to implement key-value-counter operations. *
- ***********************************************************************/
-
-__host__ uint64_t qf_estimate_memory(int nbits) {
-  uint64_t nslots = 1ULL << nbits;
-
-#ifdef DEBUG
-  uint64_t key_remainder_bits = QF_BITS_PER_REMAINDER;
-  assert(key_remainder_bits >= 2);
-  uint64_t value_bits = QF_BITS_PER_VALUE;
-  uint64_t bits_per_slot;
-#endif
-
-  // uint64_t num_slots, xnslots, nblocks;
-  uint64_t xnslots, nblocks;
-  uint64_t size;
-  uint64_t total_num_bytes;
-
-  assert(popcnt(nslots) == 1); /* nslots must be a power of 2 */
-  // num_slots = nslots;
-  xnslots = nslots + 10 * sqrt((double)nslots);
-  nblocks = (xnslots + QF_SLOTS_PER_BLOCK - 1) / QF_SLOTS_PER_BLOCK;
-
-#ifdef DEBUG
-  bits_per_slot = key_remainder_bits + value_bits;
-  assert(QF_BITS_PER_SLOT == 0 || QF_BITS_PER_SLOT == bits_per_slot);
-  assert(bits_per_slot > 1);
-#endif
-
-#if QF_BITS_PER_SLOT == 8 || QF_BITS_PER_SLOT == 16 || QF_BITS_PER_SLOT == 32 || QF_BITS_PER_SLOT == 64
-  size = nblocks * sizeof(qfblock);
-#else
-  size = nblocks * (sizeof(qfblock) + QF_SLOTS_PER_BLOCK * bits_per_slot / 8);
-#endif
-
-  total_num_bytes = sizeof(qfmetadata) + size;
-
-  return total_num_bytes;
-}
-
-__host__ uint64_t qf_init(QF *qf, uint64_t nslots, uint64_t key_bits, uint64_t value_bits, enum qf_hashmode hash, uint32_t seed,
-                          void *buffer, uint64_t buffer_len) {
-  uint64_t num_slots, xnslots, nblocks;
-  uint64_t key_remainder_bits, bits_per_slot;
-  uint64_t size;
-  uint64_t total_num_bytes;
-
-  assert(popcnt(nslots) == 1); /* nslots must be a power of 2 */
-  num_slots = nslots;
-  xnslots = nslots + 10 * sqrt((double)nslots);
-  nblocks = (xnslots + QF_SLOTS_PER_BLOCK - 1) / QF_SLOTS_PER_BLOCK;
-  key_remainder_bits = key_bits;
-  while (nslots > 1 && key_remainder_bits > 0) {
-    key_remainder_bits--;
-    nslots >>= 1;
-  }
-  assert(key_remainder_bits >= 2);
-
-  bits_per_slot = key_remainder_bits + value_bits;
-  assert(QF_BITS_PER_SLOT == 0 || QF_BITS_PER_SLOT == bits_per_slot);
-  assert(bits_per_slot > 1);
-#if QF_BITS_PER_SLOT == 8 || QF_BITS_PER_SLOT == 16 || QF_BITS_PER_SLOT == 32 || QF_BITS_PER_SLOT == 64
-  size = nblocks * sizeof(qfblock);
-#else
-  size = nblocks * (sizeof(qfblock) + QF_SLOTS_PER_BLOCK * bits_per_slot / 8);
-#endif
-
-  total_num_bytes = sizeof(qfmetadata) + size;
-  if (buffer == NULL || total_num_bytes > buffer_len) return total_num_bytes;
-
-  // memset(buffer, 0, total_num_bytes);
-  qf->metadata = (qfmetadata *)(buffer);
-  qf->blocks = (qfblock *)(qf->metadata + 1);
-
-  qf->metadata->magic_endian_number = MAGIC_NUMBER;
-  qf->metadata->reserved = 0;
-  qf->metadata->hash_mode = hash;
-  qf->metadata->total_size_in_bytes = size;
-  qf->metadata->seed = seed;
-  qf->metadata->nslots = num_slots;
-  qf->metadata->xnslots = xnslots;
-  qf->metadata->key_bits = key_bits;
-  qf->metadata->value_bits = value_bits;
-  qf->metadata->key_remainder_bits = key_remainder_bits;
-  qf->metadata->bits_per_slot = bits_per_slot;
-
-  qf->metadata->range = qf->metadata->nslots;
-  qf->metadata->range <<= qf->metadata->key_remainder_bits;
-  qf->metadata->nblocks = (qf->metadata->xnslots + QF_SLOTS_PER_BLOCK - 1) / QF_SLOTS_PER_BLOCK;
-  qf->metadata->nelts = 0;
-  qf->metadata->ndistinct_elts = 0;
-  qf->metadata->noccupied_slots = 0;
-
-  qf->runtimedata->num_locks = ((qf->metadata->xnslots / NUM_SLOTS_TO_LOCK) + 10) * LOCK_DIST;
-
-  pc_init(&qf->runtimedata->pc_nelts, (int64_t *)&qf->metadata->nelts, 8, 100);
-  pc_init(&qf->runtimedata->pc_ndistinct_elts, (int64_t *)&qf->metadata->ndistinct_elts, 8, 100);
-  pc_init(&qf->runtimedata->pc_noccupied_slots, (int64_t *)&qf->metadata->noccupied_slots, 8, 100);
-  /* initialize container resize */
-  qf->runtimedata->auto_resize = 0;
-  qf->runtimedata->container_resize = qf_resize_malloc;
-  /* initialize all the locks to 0 */
-  qf->runtimedata->metadata_lock = 0;
-  // todo: copy this to GPU
-
-  qf->runtimedata->locks = (uint16_t *)calloc(qf->runtimedata->num_locks, sizeof(uint16_t));
-  if (qf->runtimedata->locks == NULL) {
-    perror("Couldn't allocate memory for runtime locks.");
-    exit(EXIT_FAILURE);
-  }
-#ifdef LOG_WAIT_TIME
-  qf->runtimedata->wait_times = (wait_time_data *)calloc(qf->runtimedata->num_locks + 1, sizeof(wait_time_data));
-  if (qf->runtimedata->wait_times == NULL) {
-    perror("Couldn't allocate memory for runtime wait_times.");
-    exit(EXIT_FAILURE);
-  }
-#endif
-
-  return total_num_bytes;
-}
-
-__host__ uint64_t qf_use(QF *qf, void *buffer, uint64_t buffer_len) {
-  qf->metadata = (qfmetadata *)(buffer);
-  if (qf->metadata->total_size_in_bytes + sizeof(qfmetadata) > buffer_len) {
-    return qf->metadata->total_size_in_bytes + sizeof(qfmetadata);
-  }
-  qf->blocks = (qfblock *)(qf->metadata + 1);
-
-  qf->runtimedata = (qfruntime *)calloc(sizeof(qfruntime), 1);
-  if (qf->runtimedata == NULL) {
-    perror("Couldn't allocate memory for runtime data.");
-    exit(EXIT_FAILURE);
-  }
-  /* initialize all the locks to 0 */
-  qf->runtimedata->metadata_lock = 0;
-  qf->runtimedata->locks = (uint16_t *)calloc(qf->runtimedata->num_locks, sizeof(uint16_t));
-  if (qf->runtimedata->locks == NULL) {
-    perror("Couldn't allocate memory for runtime locks.");
-    exit(EXIT_FAILURE);
-  }
-#ifdef LOG_WAIT_TIME
-  qf->runtimedata->wait_times = (wait_time_data *)calloc(qf->runtimedata->num_locks + 1, sizeof(wait_time_data));
-  if (qf->runtimedata->wait_times == NULL) {
-    perror("Couldn't allocate memory for runtime wait_times.");
-    exit(EXIT_FAILURE);
-  }
-#endif
-
-  return sizeof(qfmetadata) + qf->metadata->total_size_in_bytes;
-}
-
-__host__ void *qf_destroy(QF *qf) {
-  assert(qf != NULL && "QF is NULL");
-  assert(qf->runtimedata != NULL && "runtimedata for QF is NULL");
-  if (qf->runtimedata->locks != NULL) free((void *)qf->runtimedata->locks);
-  if (qf->runtimedata->wait_times != NULL) free(qf->runtimedata->wait_times);
-  if (qf->runtimedata->f_info.filepath != NULL) free(qf->runtimedata->f_info.filepath);
-  free(qf->runtimedata);
-
-  return (void *)qf->metadata;
-}
-
-__host__ bool qf_malloc(QF *qf, uint64_t nslots, uint64_t key_bits, uint64_t value_bits, enum qf_hashmode hash, bool on_device,
-                        uint32_t seed) {
-  uint64_t total_num_bytes = qf_init(qf, nslots, key_bits, value_bits, hash, seed, NULL, 0);
-
-  // buffer malloc bad?
-  void *buffer = malloc(total_num_bytes);
-  memset(buffer, 0, total_num_bytes);
-
-  if (buffer == NULL) {
-    perror("Couldn't allocate memory for the CQF.");
-    exit(EXIT_FAILURE);
-  }
-
-  qf->runtimedata = (qfruntime *)calloc(sizeof(qfruntime), 1);
-
-  if (qf->runtimedata == NULL) {
-    perror("Couldn't allocate memory for runtime data.");
-    exit(EXIT_FAILURE);
-  }
-
-  uint64_t init_size = qf_init(qf, nslots, key_bits, value_bits, hash, seed, buffer, total_num_bytes);
-
-  if (init_size == total_num_bytes)
-    return total_num_bytes;
-  else
-    return -1;
-}
-
-__host__ bool qf_free(QF *qf) {
-  assert(qf->metadata != NULL);
-  void *buffer = qf_destroy(qf);
-  if (buffer != NULL) {
-    free(buffer);
-    return true;
-  }
-
-  return false;
-}
-
-__host__ void qf_free_gpu(QF *qf) {
-  QF hostQF;
-
-  // cudaMallocHost((void **)&hostQF, sizeof(QF));
-
-  cudaMemcpy(&hostQF, qf, sizeof(QF), cudaMemcpyDeviceToHost);
-
-  cudaFree(hostQF.runtimedata);
-  cudaFree(hostQF.metadata);
-  cudaFree(hostQF.blocks);
-
-  cudaFree(qf);
-}
-
-__host__ void qf_copy(QF *dest, const QF *src) {
-  DEBUG_CQF("%s\n", "Source CQF");
-  DEBUG_DUMP(src);
-  memcpy(dest->runtimedata, src->runtimedata, sizeof(qfruntime));
-  memcpy(dest->metadata, src->metadata, sizeof(qfmetadata));
-  memcpy(dest->blocks, src->blocks, src->metadata->total_size_in_bytes);
-  DEBUG_CQF("%s\n", "Destination CQF after copy.");
-  DEBUG_DUMP(dest);
-}
-
-__host__ void qf_reset(QF *qf) {
-  qf->metadata->nelts = 0;
-  qf->metadata->ndistinct_elts = 0;
-  qf->metadata->noccupied_slots = 0;
-
-#ifdef LOG_WAIT_TIME
-  memset(qf->wait_times, 0, (qf->runtimedata->num_locks + 1) * sizeof(wait_time_data));
-#endif
-#if QF_BITS_PER_SLOT == 8 || QF_BITS_PER_SLOT == 16 || QF_BITS_PER_SLOT == 32 || QF_BITS_PER_SLOT == 64
-  memset(qf->blocks, 0, qf->metadata->nblocks * sizeof(qfblock));
-#else
-  memset(qf->blocks, 0, qf->metadata->nblocks * (sizeof(qfblock) + QF_SLOTS_PER_BLOCK * qf->metadata->bits_per_slot / 8));
-#endif
-}
-
-__host__ int64_t qf_resize_malloc(QF *qf, uint64_t nslots) {
-  QF new_qf;
-  if (!qf_malloc(&new_qf, nslots, qf->metadata->key_bits, qf->metadata->value_bits, qf->metadata->hash_mode, false,
-                 qf->metadata->seed))
-    return -1;
-  if (qf->runtimedata->auto_resize) qf_set_auto_resize(&new_qf, true);
-
-  // copy keys from qf into new_qf
-  QFi qfi;
-  qf_iterator_from_position(qf, &qfi, 0);
-  int64_t ret_numkeys = 0;
-  do {
-    uint64_t key, value, count;
-    qfi_get_hash(&qfi, &key, &value, &count);
-    qfi_next(&qfi);
-    int ret = qf_insert(&new_qf, key, value, count, QF_NO_LOCK | QF_KEY_IS_HASH);
-    if (ret < 0) {
-      printf("Failed to insert key: %ld into the new CQF.\n", key);
-      return ret;
-    }
-    ret_numkeys++;
-  } while (!qfi_end(&qfi));
-
-  qf_free(qf);
-  memcpy(qf, &new_qf, sizeof(QF));
-
-  return ret_numkeys;
-}
-
-uint64_t qf_resize(QF *qf, uint64_t nslots, void *buffer, uint64_t buffer_len) {
-  printf("QF attempting resize - This will fail\n");
-  QF new_qf;
-  new_qf.runtimedata = (qfruntime *)calloc(sizeof(qfruntime), 1);
-  if (new_qf.runtimedata == NULL) {
-    perror("Couldn't allocate memory for runtime data.\n");
-    exit(EXIT_FAILURE);
-  }
-
-  uint64_t init_size = qf_init(&new_qf, nslots, qf->metadata->key_bits, qf->metadata->value_bits, qf->metadata->hash_mode,
-                               qf->metadata->seed, buffer, buffer_len);
-
-  if (init_size > buffer_len) return init_size;
-
-  if (qf->runtimedata->auto_resize) qf_set_auto_resize(&new_qf, true);
-
-  // copy keys from qf into new_qf
-  QFi qfi;
-  qf_iterator_from_position(qf, &qfi, 0);
-  do {
-    uint64_t key, value, count;
-    qfi_get_hash(&qfi, &key, &value, &count);
-    qfi_next(&qfi);
-    int ret = qf_insert(&new_qf, key, value, count, QF_NO_LOCK | QF_KEY_IS_HASH);
-    if (ret < 0) {
-      printf("Failed to insert key: %ld into the new CQF.\n", key);
-      abort();  // kill kernel with error
-    }
-  } while (!qfi_end(&qfi));
-
-  qf_free(qf);
-  memcpy(qf, &new_qf, sizeof(QF));
-
-  return init_size;
-}
-
-__host__ void qf_set_auto_resize(QF *qf, bool enabled) {
-  if (enabled)
-    qf->runtimedata->auto_resize = 1;
-  else
-    qf->runtimedata->auto_resize = 0;
-}
-
-__host__ __device__ qf_returns qf_insert_not_exists(QF *qf, uint64_t key, uint64_t value, uint64_t count, uint8_t flags,
-                                                    uint64_t *retvalue) {
-  // We fill up the CQF up to 95% load factor.
-  // This is a very conservative check.
-
-  // TODO: GPU resizing
-  /*
-  if (qf_get_num_occupied_slots(qf) >= qf->metadata->nslots * 0.95) {
-  if (qf->runtimedata->auto_resize) {
-  fprintf(stdout, "Resizing the CQF.\n");
-  if (qf->runtimedata->container_resize(qf, qf->metadata->nslots * 2) < 0)
-  {
-  fprintf(stderr, "Resizing the failed.\n");
-  return QF_NO_SPACE;
-  }
-  } else
-  return QF_NO_SPACE;
-  }
-  */
-  // if (count == 0)
-  //      return 0;
-
-  if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) {
-    if (qf->metadata->hash_mode == QF_HASH_DEFAULT)
-      key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range;
-    else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE)
-      key = hash_64(key, BITMASK(qf->metadata->key_bits));
-  }
-
-  uint64_t hash = (key << qf->metadata->value_bits) | (value & BITMASK(qf->metadata->value_bits));
-  // printf("Inside insert, new hash is recorded as %llu\n", hash);
-  qf_returns ret = QF_FULL;
-
-  if (count == 1) ret = insert1_if_not_exists(qf, hash, retvalue);
-  assert(count == 1);
-  return ret;
-}
-
-__host__ __device__ int qf_insert(QF *qf, uint64_t key, uint64_t value, uint64_t count, uint8_t flags) {
-  // We fill up the CQF up to 95% load factor.
-  // This is a very conservative check.
-
-  // TODO: GPU resizing
-  /*
-  if (qf_get_num_occupied_slots(qf) >= qf->metadata->nslots * 0.95) {
-          if (qf->runtimedata->auto_resize) {
-                  fprintf(stdout, "Resizing the CQF.\n");
-                  if (qf->runtimedata->container_resize(qf, qf->metadata->nslots * 2) < 0)
-                  {
-                          fprintf(stderr, "Resizing the failed.\n");
-                          return QF_NO_SPACE;
-                  }
-          } else
-                  return QF_NO_SPACE;
-  }
-  */
-  if (count == 0) return 0;
-
-  if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) {
-    if (qf->metadata->hash_mode == QF_HASH_DEFAULT)
-      key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range;
-    else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE)
-      key = hash_64(key, BITMASK(qf->metadata->key_bits));
-  }
-
-  uint64_t hash = (key << qf->metadata->value_bits) | (value & BITMASK(qf->metadata->value_bits));
-  // printf("Inside insert, new hash is recorded as %llu\n", hash);
-  int ret = QF_NO_SPACE;
-  if (count == 1) ret = insert1(qf, hash, flags);
-  assert(count == 1);
-  return ret;
-}
-/*------------------------
-GPU Modifications
---------------------------*/
-
-// locking implementation for the 16 bit locks
-// undefined behavior if you try to unlock a not locked lock
-__device__ void lock_16(uint16_t *lock, uint64_t index) {
-  uint16_t zero = 0;
-  uint16_t one = 1;
-
-  while (atomicCAS((uint16_t *)&lock[index * LOCK_DIST], zero, one) != zero)
-    ;
-}
-
-__device__ void unlock_16(uint16_t *lock, uint64_t index) {
-  uint16_t zero = 0;
-  uint16_t one = 1;
-
-  atomicCAS((uint16_t *)&lock[index * LOCK_DIST], one, zero);
-}
-
-// lock_16 but built to be included as a piece of a while loop
-// this is more in line with traditional cuda processing, may increase throughput
-__device__ bool try_lock_16(uint16_t *lock, uint64_t index) {
-  uint16_t zero = 0;
-  uint16_t one = 1;
-
-  if (atomicCAS((uint16_t *)&lock[index * LOCK_DIST], zero, one) == zero) return true;
-  return false;
-}
-
-// TODO: it might expect a short int instead of uint16_t
-// TODO: needs to be 32 bits (whoops)
-__device__ uint16_t get_lock(volatile uint32_t *lock, int index) {
-  // set lock to 1 to claim
-  // returns 0 if success
-  uint32_t zero = 0;
-  uint32_t one = 1;
-  return atomicCAS((uint32_t *)&lock[index], zero, one);
-}
-
-// synchronous lock so that we can acquire multiple locks
-
-// __device__ uint16_t get_lock_wait(uint32_t * locks, int index){
-
-//   uint16_t result = 1;
-
-//   do {
-
-//     result = get_lock(locks, index);
-
-//   } while (result !=0);
-
-//   return result;
-
-// }
-
-/*
-__device__ uint16_t unlock(volatile uint32_t *lock, int index) {
-  // set lock to 0 to release
-  uint32_t zero = 0;
-  uint32_t one = 1;
-  // TODO: might need a __threadfence();
-  lock[index] = 0;
-}
-*/
-
-//__device__ void __bitonic_sort(uint64_t * array, uint64_t low, uint64_t n, uint64_t idx, bool dir);
-
-// consolidate all of the device construction into one convenient func!
-__host__ void qf_malloc_device(QF **qf, int nbits) {
-  // bring in compile #define
-  int rbits = 8;
-  int vbits = 8;
-
-  QF host_qf;
-  QF temp_device_qf;
-
-  QF *temp_dev_ptr;
-
-  uint64_t nslots = 1ULL << nbits;
-  int num_hash_bits = nbits + rbits;
-
-  qf_malloc(&host_qf, nslots, num_hash_bits, vbits, QF_HASH_INVERTIBLE, false, 0);
-  qf_set_auto_resize(&host_qf, false);
-
-  qfruntime *_runtime;
-  qfmetadata *_metadata;
-  qfblock *_blocks;
-
-  uint16_t *dev_locks;
-
-  cudaMalloc((void **)&dev_locks, host_qf.runtimedata->num_locks * sizeof(uint16_t));
-
-  cudaMemset(dev_locks, 0, host_qf.runtimedata->num_locks * sizeof(uint16_t));
-
-  // wipe and replace
-  free(host_qf.runtimedata->locks);
-  host_qf.runtimedata->locks = dev_locks;
-
-  cudaMalloc((void **)&_runtime, sizeof(qfruntime));
-  cudaMalloc((void **)&_metadata, sizeof(qfmetadata));
-  cudaMalloc((void **)&_blocks, qf_get_total_size_in_bytes(&host_qf));
-
-  cudaMemcpy(_runtime, host_qf.runtimedata, sizeof(qfruntime), cudaMemcpyHostToDevice);
-  cudaMemcpy(_metadata, host_qf.metadata, sizeof(qfmetadata), cudaMemcpyHostToDevice);
-  cudaMemcpy(_blocks, host_qf.blocks, qf_get_total_size_in_bytes(&host_qf), cudaMemcpyHostToDevice);
-
-  temp_device_qf.runtimedata = _runtime;
-  temp_device_qf.metadata = _metadata;
-  temp_device_qf.blocks = _blocks;
-
-  // this might be buggy
-  // request to fill the dev ptr with a QF, then copy over, then copy that to qf
-  cudaMalloc((void **)&temp_dev_ptr, sizeof(QF));
-
-  cudaMemcpy(temp_dev_ptr, &temp_device_qf, sizeof(QF), cudaMemcpyHostToDevice);
-
-  *qf = temp_dev_ptr;
-}
-
-__host__ void qf_destroy_device(QF *qf) {
-  QF *host_qf;
-  cudaMallocHost((void **)&host_qf, sizeof(QF));
-
-  cudaMemcpy(host_qf, qf, sizeof(QF), cudaMemcpyDeviceToHost);
-
-  qfruntime *_runtime;
-
-  cudaMallocHost((void **)&_runtime, sizeof(qfruntime));
-
-  cudaMemcpy(_runtime, host_qf->runtimedata, sizeof(qfruntime), cudaMemcpyDeviceToHost);
-
-  // may need to have _runtimedata shunted into another host object
-  // ill synchronize before this to double check
-  assert(_runtime != NULL);
-  if (_runtime->locks != NULL) cudaFree(_runtime->locks);
-
-  if (_runtime->wait_times != NULL) cudaFree(_runtime->wait_times);
-
-  // this one may break
-  if (_runtime->f_info.filepath != NULL) cudaFree(host_qf->runtimedata->f_info.filepath);
-
-  cudaFree(host_qf->runtimedata);
-
-  cudaFree(host_qf->metadata);
-  cudaFree(host_qf->blocks);
-
-  cudaFreeHost(host_qf);
-  cudaFreeHost(_runtime);
-}
-
-// __host__ void init_device_locks(uint16_t ** locks, uint64_t nbits){
-
-// 	uint16_t * temp_locks;
-
-// 	uint64_t nslots = 1ULL << nbits;
-
-// 	uint64_t xnslots = nslots+10*sqrt((double)nslots);
-
-// 	cudaMalloc((void **)&temp_locks, (((xnslots-1)/NUM_SLOTS_TO_LOCK+1)+10)*sizeof(uint16_t));
-
-// 	cudaMemset(temp_locks, 0, (((xnslots-1)/NUM_SLOTS_TO_LOCK+1)+10)*sizeof(uint16_t));
-
-// 	*locks = temp_locks;
-
-// }
-
-// convert a counter with
-__host__ __device__ uint8_t encode_kmer_counter(uint8_t *counter) {
-  uint8_t base = 0;
-
-  // A is 000 0
-  // C is 001 1
-  // T is 010 2
-  // G is 011 3
-  // F is 100 4
-
-  for (uint8_t i = 0; i < 5; i++) {
-    if (counter[i]) {
-      // printf("Front %d: %0x", i, i<<5);
-      base += i << 5;
-    }
-
-    if (counter[i + 5]) {
-      base += i << 2;
-    }
-  }
-
-  return base;
-}
-
-// convert a counter with
-__host__ __device__ uint8_t encode_chars(char fwd, char back) {
-  uint8_t base = 0;
-
-  // encodings of kmers relative to inputs,
-  // if you want to change this modify the const array
-  // kmer_vals in gqf.cu. F is unused and only exists to prevent crashes
-
-  // F is 000 0
-  // A is 001 1
-  // C is 010 2
-  // T is 011 3
-  // G is 100 4
-  // 0/NULL is 101 5
-
-  for (uint8_t i = 0; i < 5; i++) {
-    if (kmer_vals[i] == fwd) {
-      // printf("Front %d: %0x", i, i<<5);
-      base += i << 3;
-    }
-
-    if (kmer_vals[i] == back) {
-      base += i;
-    }
-  }
-
-  return base;
-}
-
-// convert a counter with
-__host__ __device__ void decode_chars(uint8_t stored, char &fwd, char &back) {
-  // NULL is 000 0
-  // A is 001 1
-  // C is 010 2
-  // T is 011 3
-  // G is 100 4
-  // 0 is 101 5
-
-  uint8_t upper = stored >> 3;
-  uint8_t lower = stored & 7;
-
-  fwd = kmer_vals[upper];
-  back = kmer_vals[lower];
-
-  if (fwd == 'F') fwd = '0';
-  if (back == 'F') back = '0';
-}
-
-__host__ __device__ void decode_kmer_counter(uint8_t *counter, uint8_t stored) {
-  uint8_t upper = stored >> 5;
-
-  uint8_t lower = (stored & (15)) >> 2;
-
-  // printf("Upper %x, lower %x\n", upper, lower);
-
-  counter[upper] += 1;
-  counter[lower + 5] += 1;
-}
-
-__host__ __device__ bool is_encodable(uint8_t *counter) {
-  int count = 0;
-
-  for (int i = 0; i < 5; i++) {
-    count += counter[i];
-  }
-
-  if (count > 1) return false;
-
-  return true;
-}
-
-// finalized version of locking kmer insert
-// uses 10 bits 6 bits remainder/val pairings
-__device__ qf_returns insert_kmer(QF *qf, uint64_t hash, char forward, char backward, char &returnedfwd, char &returnedback) {
-  uint8_t encoded = encode_chars(forward, backward);
-  uint8_t query;
-  uint64_t bigquery;
-
-  hash = hash % qf->metadata->range;
-
-  uint64_t hash_bucket_index = hash >> qf->metadata->key_remainder_bits;
-  uint64_t lock_index = hash_bucket_index / NUM_SLOTS_TO_LOCK;
-
-  // encode extensions outside of the lock
-
-  lock_16(qf->runtimedata->locks, lock_index);
-  lock_16(qf->runtimedata->locks, lock_index + 1);
-
-  int found = qf_query(qf, hash, &bigquery, QF_NO_LOCK | QF_KEY_IS_HASH);
-
-  query = bigquery;
-
-  if (found == 0)
-    qf_insert(qf, hash, encoded, 1, QF_NO_LOCK | QF_KEY_IS_HASH);
-  else
-    decode_chars(query, returnedfwd, returnedback);
-
-  __threadfence();
-  unlock_16(qf->runtimedata->locks, lock_index + 1);
-  unlock_16(qf->runtimedata->locks, lock_index);
-
-  if (found == 1) return QF_ITEM_FOUND;
-  return QF_ITEM_INSERTED;
-}
-
-__device__ qf_returns insert_kmer_not_exists(QF *qf, uint64_t hash, char forward, char backward, char &returnedfwd,
-                                             char &returnedback) {
-  uint8_t encoded = encode_chars(forward, backward);
-  uint8_t query;
-  uint64_t bigquery;
-
-  hash = hash % qf->metadata->range;
-
-  uint64_t hash_bucket_index = hash >> qf->metadata->key_remainder_bits;
-  uint64_t lock_index = hash_bucket_index / NUM_SLOTS_TO_LOCK;
-
-  // encode extensions outside of the lock
-
-  lock_16(qf->runtimedata->locks, lock_index);
-  lock_16(qf->runtimedata->locks, lock_index + 1);
-
-  // uint64_t query;
-  // int found = qf_query(qf, hash, &bigquery, QF_NO_LOCK | QF_KEY_IS_HASH);
-  // printf("being inserted/checked: %d\n", encoded);
-  qf_returns ret = qf_insert_not_exists(qf, hash, encoded, 1, QF_NO_LOCK | QF_KEY_IS_HASH, &bigquery);
-
-  __threadfence();
-  unlock_16(qf->runtimedata->locks, lock_index + 1);
-  unlock_16(qf->runtimedata->locks, lock_index);
-
-  // cast down
-  query = bigquery;
-  if (ret == QF_ITEM_FOUND) {
-    decode_chars(query, returnedfwd, returnedback);
-  }
-  // obvious cast for clarity
-  return ret;
-}
-
-// given a kmer we want to look for, and an encoded char, insert it and retreive a copy if it exists
-// returns 1 if not found since they won't interfere with any unique combos
-__device__ uint8_t insert_kmer_with_lock(QF *qf, uint64_t hash, uint8_t val) {
-  uint8_t query;
-
-  // ha hire me pls google
-  uint64_t bigquery;
-
-  // uint64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot;
-  uint64_t hash_bucket_index = hash >> qf->metadata->key_remainder_bits;
-  uint64_t lock_index = hash_bucket_index / NUM_SLOTS_TO_LOCK;
-
-  lock_16(qf->runtimedata->locks, lock_index);
-  lock_16(qf->runtimedata->locks, lock_index + 1);
-
-  // figure out flags here
-  // QF NO lock and QF KEY_IS_HASH
-  int found = qf_query(qf, hash, &bigquery, QF_NO_LOCK | QF_KEY_IS_HASH);
-
-  // implicit casts, data from bigquery should always fit in uint8_t
-  query = bigquery;
-
-  if (found == 0) {
-    qf_insert(qf, hash, val, 1, QF_NO_LOCK | QF_KEY_IS_HASH);
-    query = 1U;
-  }
-
-  __threadfence();
-
-  unlock_16(qf->runtimedata->locks, lock_index + 1);
-  unlock_16(qf->runtimedata->locks, lock_index);
-
-  return query;
-}
-
-// perform a bitwise operatiojn, check if query has been seen at least once already
-// this is indicated by the 2 to last bit being set to 1
-__device__ bool seen_once(uint8_t query) {
-  // looking for bit 0000 0010
-
-  uint8_t lower = (query & (2)) >> 1;
-
-  printf("query val %x\n", lower);
-
-  // implicit cast
-  return lower;
-}
-
-__device__ uint8_t set_seen(uint8_t query) { return (query | 2); }
-
-// __global__ void insert_one_kmer_kernel(QF* qf, uint64_t hash, uint8_t val, uint16_t * locks){
-
-// 	uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x;
-
-// 	if (tid != 0) return;
-
-// 	printf("Returned: %x\n", insert_kmer_with_lock(qf, hash, val, locks));
-
-// }
-/*
-__global__ void insert_multi_kmer_kernel(QF *qf, uint64_t *hashes, uint8_t *firsts, uint8_t *seconds, uint64_t nitems,
-                                         uint64_t *counter) {
-  uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x;
-
-  if (tid >= nitems) return;
-
-  uint8_t one = firsts[tid];
-  uint8_t two = seconds[tid];
-
-  // if this fails the random gen is messed up
-  char fwd;
-  char back;
-
-  if (insert_kmer(qf, hashes[tid], kmer_vals[one], kmer_vals[two - 5], fwd, back)) {
-    atomicAdd((unsigned long long *)counter, (unsigned long long)1);
-  }
-}
-*/
-
-__host__ __device__ int qf_set_count(QF *qf, uint64_t key, uint64_t value, uint64_t count, uint8_t flags) {
-  if (count == 0) return 0;
-
-  uint64_t cur_count = qf_count_key_value(qf, key, value, flags);
-  int64_t delta = count - cur_count;
-
-  int ret;
-  if (delta == 0)
-    ret = 0;
-  else if (delta > 0)
-    ret = qf_insert(qf, key, value, delta, flags);
-  else
-    ret = qf_remove(qf, key, value, labs(delta), flags);
-
-  return ret;
-}
-
-__host__ __device__ int qf_remove(QF *qf, uint64_t key, uint64_t value, uint64_t count, uint8_t flags) {
-  if (count == 0) return true;
-
-  if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) {
-    if (qf->metadata->hash_mode == QF_HASH_DEFAULT)
-      key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range;
-    else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE)
-      key = hash_64(key, BITMASK(qf->metadata->key_bits));
-  }
-  uint64_t hash = (key << qf->metadata->value_bits) | (value & BITMASK(qf->metadata->value_bits));
-  return _remove(qf, hash, count, flags);
-}
-
-__host__ __device__ int qf_delete_key_value(QF *qf, uint64_t key, uint64_t value, uint8_t flags) {
-  uint64_t count = qf_count_key_value(qf, key, value, flags);
-  if (count == 0) return true;
-
-  if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) {
-    if (qf->metadata->hash_mode == QF_HASH_DEFAULT)
-      key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range;
-    else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE)
-      key = hash_64(key, BITMASK(qf->metadata->key_bits));
-  }
-  uint64_t hash = (key << qf->metadata->value_bits) | (value & BITMASK(qf->metadata->value_bits));
-  return _remove(qf, hash, count, flags);
-}
-
-__host__ __device__ uint64_t qf_count_key_value(const QF *qf, uint64_t key, uint64_t value, uint8_t flags) {
-  if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) {
-    if (qf->metadata->hash_mode == QF_HASH_DEFAULT)
-      key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range;
-    else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE)
-      key = hash_64(key, BITMASK(qf->metadata->key_bits));
-  }
-
-  uint64_t hash = (key << qf->metadata->value_bits) | (value & BITMASK(qf->metadata->value_bits));
-  uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot);
-  int64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot;
-
-  if (!is_occupied(qf, hash_bucket_index)) return 0;
-
-  int64_t runstart_index = hash_bucket_index == 0 ? 0 : run_end(qf, hash_bucket_index - 1) + 1;
-  if (runstart_index < hash_bucket_index) runstart_index = hash_bucket_index;
-
-  /* printf("MC RUNSTART: %02lx RUNEND: %02lx\n", runstart_index, runend_index); */
-
-  uint64_t current_remainder, current_count, current_end;
-  do {
-    current_end = decode_counter(qf, runstart_index, &current_remainder, &current_count);
-    if (current_remainder == hash_remainder) return current_count;
-    runstart_index = current_end + 1;
-  } while (!is_runend(qf, current_end));
-
-  return 0;
-}
-
-__host__ __device__ uint64_t qf_query(const QF *qf, uint64_t key, uint64_t *value, uint8_t flags) {
-  if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) {
-    if (qf->metadata->hash_mode == QF_HASH_DEFAULT)
-      key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range;
-    else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE)
-      key = hash_64(key, BITMASK(qf->metadata->key_bits));
-  }
-  uint64_t hash = key;
-  uint64_t hash_remainder = hash & BITMASK(qf->metadata->key_remainder_bits);
-  int64_t hash_bucket_index = hash >> qf->metadata->key_remainder_bits;
-
-  if (!is_occupied(qf, hash_bucket_index)) return 0;
-
-  int64_t runstart_index = hash_bucket_index == 0 ? 0 : run_end(qf, hash_bucket_index - 1) + 1;
-  if (runstart_index < hash_bucket_index) runstart_index = hash_bucket_index;
-
-  /* printf("MC RUNSTART: %02lx RUNEND: %02lx\n", runstart_index, runend_index); */
-
-  uint64_t current_remainder, current_count, current_end;
-  do {
-    current_end = decode_counter(qf, runstart_index, &current_remainder, &current_count);
-    *value = current_remainder & BITMASK(qf->metadata->value_bits);
-    current_remainder = current_remainder >> qf->metadata->value_bits;
-    if (current_remainder == hash_remainder) {
-      return current_count;
-    }
-    runstart_index = current_end + 1;
-  } while (!is_runend(qf, current_end));
-
-  return 0;
-}
-
-__host__ __device__ int64_t qf_get_unique_index(const QF *qf, uint64_t key, uint64_t value, uint8_t flags) {
-  if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) {
-    if (qf->metadata->hash_mode == QF_HASH_DEFAULT)
-      key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range;
-    else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE)
-      key = hash_64(key, BITMASK(qf->metadata->key_bits));
-  }
-  uint64_t hash = (key << qf->metadata->value_bits) | (value & BITMASK(qf->metadata->value_bits));
-  uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot);
-  int64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot;
-
-  if (!is_occupied(qf, hash_bucket_index)) return QF_DOESNT_EXIST;
-
-  int64_t runstart_index = hash_bucket_index == 0 ? 0 : run_end(qf, hash_bucket_index - 1) + 1;
-  if (runstart_index < hash_bucket_index) runstart_index = hash_bucket_index;
-
-  /* printf("MC RUNSTART: %02lx RUNEND: %02lx\n", runstart_index, runend_index); */
-
-  uint64_t current_remainder, current_count, current_end;
-  do {
-    current_end = decode_counter(qf, runstart_index, &current_remainder, &current_count);
-    if (current_remainder == hash_remainder) return runstart_index;
-
-    runstart_index = current_end + 1;
-  } while (!is_runend(qf, current_end));
-
-  return QF_DOESNT_EXIST;
-}
-
-enum qf_hashmode qf_get_hashmode(const QF *qf) { return qf->metadata->hash_mode; }
-uint64_t qf_get_hash_seed(const QF *qf) { return qf->metadata->seed; }
-__uint64_t qf_get_hash_range(const QF *qf) { return qf->metadata->range; }
-
-bool qf_is_auto_resize_enabled(const QF *qf) {
-  if (qf->runtimedata->auto_resize == 1) return true;
-  return false;
-}
-uint64_t qf_get_total_size_in_bytes(const QF *qf) { return qf->metadata->total_size_in_bytes; }
-uint64_t qf_get_nslots(const QF *qf) { return qf->metadata->nslots; }
-uint64_t qf_get_num_occupied_slots(const QF *qf) {
-  pc_sync(&qf->runtimedata->pc_noccupied_slots);
-  return qf->metadata->noccupied_slots;
-}
-
-// need to pull metadata from qf, and nslots from metadata
-__host__ uint64_t host_qf_get_nslots(const QF *qf) {
-  QF *host_qf;
-  CUDA_CHECK(cudaMallocHost((void **)&host_qf, sizeof(QF)));
-  CUDA_CHECK(cudaMemcpy(host_qf, qf, sizeof(QF), cudaMemcpyDeviceToHost));
-  qfmetadata *_metadata;
-  CUDA_CHECK(cudaMallocHost((void **)&_metadata, sizeof(qfmetadata)));
-  CUDA_CHECK(cudaMemcpy(_metadata, host_qf->metadata, sizeof(qfmetadata), cudaMemcpyDeviceToHost));
-  uint64_t toReturn = _metadata->nslots;
-  CUDA_CHECK(cudaFreeHost(_metadata));
-  CUDA_CHECK(cudaFreeHost(host_qf));
-  return toReturn;
-}
-
-__host__ uint64_t host_qf_get_num_occupied_slots(const QF *qf) {
-  QF *host_qf;
-  CUDA_CHECK(cudaMallocHost((void **)&host_qf, sizeof(QF)));
-  CUDA_CHECK(cudaMemcpy(host_qf, qf, sizeof(QF), cudaMemcpyDeviceToHost));
-  qfmetadata *_metadata;
-  CUDA_CHECK(cudaMallocHost((void **)&_metadata, sizeof(qfmetadata)));
-  CUDA_CHECK(cudaMemcpy(_metadata, host_qf->metadata, sizeof(qfmetadata), cudaMemcpyDeviceToHost));
-  uint64_t toReturn = _metadata->noccupied_slots;
-  CUDA_CHECK(cudaFreeHost(_metadata));
-  CUDA_CHECK(cudaFreeHost(host_qf));
-  return toReturn;
-}
-
-uint64_t qf_get_num_key_bits(const QF *qf) { return qf->metadata->key_bits; }
-uint64_t qf_get_num_value_bits(const QF *qf) { return qf->metadata->value_bits; }
-uint64_t qf_get_num_key_remainder_bits(const QF *qf) { return qf->metadata->key_remainder_bits; }
-uint64_t qf_get_bits_per_slot(const QF *qf) { return qf->metadata->bits_per_slot; }
-
-uint64_t qf_get_sum_of_counts(const QF *qf) {
-  pc_sync(&qf->runtimedata->pc_nelts);
-  return qf->metadata->nelts;
-}
-uint64_t qf_get_num_distinct_key_value_pairs(const QF *qf) {
-  pc_sync(&qf->runtimedata->pc_ndistinct_elts);
-  return qf->metadata->ndistinct_elts;
-}
-
-void qf_sync_counters(const QF *qf) {
-  pc_sync(&qf->runtimedata->pc_ndistinct_elts);
-  pc_sync(&qf->runtimedata->pc_nelts);
-  pc_sync(&qf->runtimedata->pc_noccupied_slots);
-}
-
-/* initialize the iterator at the run corresponding
- * to the position index
- */
-int64_t qf_iterator_from_position(const QF *qf, QFi *qfi, uint64_t position) {
-  if (position == 0xffffffffffffffff) {
-    qfi->current = 0xffffffffffffffff;
-    qfi->qf = qf;
-    return QFI_INVALID;
-  }
-  assert(position < qf->metadata->nslots);
-  if (!is_occupied(qf, position)) {
-    uint64_t block_index = position;
-    uint64_t idx = bitselect(get_block(qf, block_index)->occupieds[0], 0);
-    if (idx == 64) {
-      while (idx == 64 && block_index < qf->metadata->nblocks) {
-        block_index++;
-        idx = bitselect(get_block(qf, block_index)->occupieds[0], 0);
-      }
-    }
-    position = block_index * QF_SLOTS_PER_BLOCK + idx;
-  }
-
-  qfi->qf = qf;
-  qfi->num_clusters = 0;
-  qfi->run = position;
-  qfi->current = position == 0 ? 0 : run_end(qfi->qf, position - 1) + 1;
-  if (qfi->current < position) qfi->current = position;
-
-#ifdef LOG_CLUSTER_LENGTH
-  qfi->c_info = (cluster_data *)calloc(qf->metadata->nslots / 32, sizeof(cluster_data));
-  if (qfi->c_info == NULL) {
-    perror("Couldn't allocate memory for c_info.");
-    exit(EXIT_FAILURE);
-  }
-  qfi->cur_start_index = position;
-  qfi->cur_length = 1;
-#endif
-
-  if (qfi->current >= qf->metadata->nslots) return QFI_INVALID;
-  return qfi->current;
-}
-
-int64_t qf_iterator_from_key_value(const QF *qf, QFi *qfi, uint64_t key, uint64_t value, uint8_t flags) {
-  if (key >= qf->metadata->range) {
-    qfi->current = 0xffffffffffffffff;
-    qfi->qf = qf;
-    return QFI_INVALID;
-  }
-
-  qfi->qf = qf;
-  qfi->num_clusters = 0;
-
-  if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) {
-    if (qf->metadata->hash_mode == QF_HASH_DEFAULT)
-      key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range;
-    else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE)
-      key = hash_64(key, BITMASK(qf->metadata->key_bits));
-  }
-  uint64_t hash = (key << qf->metadata->value_bits) | (value & BITMASK(qf->metadata->value_bits));
-
-  uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot);
-  uint64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot;
-  bool flag = false;
-
-  // If a run starts at "position" move the iterator to point it to the
-  // smallest key greater than or equal to "hash".
-  if (is_occupied(qf, hash_bucket_index)) {
-    uint64_t runstart_index = hash_bucket_index == 0 ? 0 : run_end(qf, hash_bucket_index - 1) + 1;
-    if (runstart_index < hash_bucket_index) runstart_index = hash_bucket_index;
-    uint64_t current_remainder, current_count, current_end;
-    do {
-      current_end = decode_counter(qf, runstart_index, &current_remainder, &current_count);
-      if (current_remainder >= hash_remainder) {
-        flag = true;
-        break;
-      }
-      runstart_index = current_end + 1;
-    } while (!is_runend(qf, current_end));
-    // found "hash" or smallest key greater than "hash" in this run.
-    if (flag) {
-      qfi->run = hash_bucket_index;
-      qfi->current = runstart_index;
-    }
-  }
-  // If a run doesn't start at "position" or the largest key in the run
-  // starting at "position" is smaller than "hash" then find the start of the
-  // next run.
-  if (!is_occupied(qf, hash_bucket_index) || !flag) {
-    uint64_t position = hash_bucket_index;
-    assert(position < qf->metadata->nslots);
-    uint64_t block_index = position / QF_SLOTS_PER_BLOCK;
-    uint64_t idx = bitselect(get_block(qf, block_index)->occupieds[0], 0);
-    if (idx == 64) {
-      while (idx == 64 && block_index < qf->metadata->nblocks) {
-        block_index++;
-        idx = bitselect(get_block(qf, block_index)->occupieds[0], 0);
-      }
-    }
-    position = block_index * QF_SLOTS_PER_BLOCK + idx;
-    qfi->run = position;
-    qfi->current = position == 0 ? 0 : run_end(qfi->qf, position - 1) + 1;
-    if (qfi->current < position) qfi->current = position;
-  }
-
-  if (qfi->current >= qf->metadata->nslots) return QFI_INVALID;
-  return qfi->current;
-}
-
-static int qfi_get(const QFi *qfi, uint64_t *key, uint64_t *value, uint64_t *count) {
-  if (qfi_end(qfi)) return QFI_INVALID;
-
-  uint64_t current_remainder, current_count;
-  decode_counter(qfi->qf, qfi->current, &current_remainder, &current_count);
-
-  *value = current_remainder & BITMASK(qfi->qf->metadata->value_bits);
-  current_remainder = current_remainder >> qfi->qf->metadata->value_bits;
-  *key = (qfi->run << qfi->qf->metadata->key_remainder_bits) | current_remainder;
-  *count = current_count;
-
-  return 0;
-}
-
-int qfi_get_key(const QFi *qfi, uint64_t *key, uint64_t *value, uint64_t *count) {
-  *key = *value = *count = 0;
-  int ret = qfi_get(qfi, key, value, count);
-  if (ret == 0) {
-    if (qfi->qf->metadata->hash_mode == QF_HASH_DEFAULT) {
-      *key = 0;
-      *value = 0;
-      *count = 0;
-      return QF_INVALID;
-    } else if (qfi->qf->metadata->hash_mode == QF_HASH_INVERTIBLE)
-      *key = hash_64i(*key, BITMASK(qfi->qf->metadata->key_bits));
-  }
-
-  return ret;
-}
-
-int qfi_get_hash(const QFi *qfi, uint64_t *key, uint64_t *value, uint64_t *count) {
-  *key = *value = *count = 0;
-  return qfi_get(qfi, key, value, count);
-}
-
-int qfi_next(QFi *qfi) {
-  if (qfi_end(qfi))
-    return QFI_INVALID;
-  else {
-    /* move to the end of the current counter*/
-    uint64_t current_remainder, current_count;
-    qfi->current = decode_counter(qfi->qf, qfi->current, &current_remainder, &current_count);
-
-    if (!is_runend(qfi->qf, qfi->current)) {
-      qfi->current++;
-#ifdef LOG_CLUSTER_LENGTH
-      qfi->cur_length++;
-#endif
-      if (qfi_end(qfi)) return QFI_INVALID;
-      return 0;
-    } else {
-#ifdef LOG_CLUSTER_LENGTH
-      /* save to check if the new current is the new cluster. */
-      uint64_t old_current = qfi->current;
-#endif
-      uint64_t block_index = qfi->run / QF_SLOTS_PER_BLOCK;
-      uint64_t rank = bitrank(get_block(qfi->qf, block_index)->occupieds[0], qfi->run % QF_SLOTS_PER_BLOCK);
-      uint64_t next_run = bitselect(get_block(qfi->qf, block_index)->occupieds[0], rank);
-      if (next_run == 64) {
-        rank = 0;
-        while (next_run == 64 && block_index < qfi->qf->metadata->nblocks) {
-          block_index++;
-          next_run = bitselect(get_block(qfi->qf, block_index)->occupieds[0], rank);
-        }
-      }
-      if (block_index == qfi->qf->metadata->nblocks) {
-        /* set the index values to max. */
-        qfi->run = qfi->current = qfi->qf->metadata->xnslots;
-        return QFI_INVALID;
-      }
-      qfi->run = block_index * QF_SLOTS_PER_BLOCK + next_run;
-      qfi->current++;
-      if (qfi->current < qfi->run) qfi->current = qfi->run;
-#ifdef LOG_CLUSTER_LENGTH
-      if (qfi->current > old_current + 1) { /* new cluster. */
-        if (qfi->cur_length > 10) {
-          qfi->c_info[qfi->num_clusters].start_index = qfi->cur_start_index;
-          qfi->c_info[qfi->num_clusters].length = qfi->cur_length;
-          qfi->num_clusters++;
-        }
-        qfi->cur_start_index = qfi->run;
-        qfi->cur_length = 1;
-      } else {
-        qfi->cur_length++;
-      }
-#endif
-      return 0;
-    }
-  }
-}
-
-bool qfi_end(const QFi *qfi) {
-  if (qfi->current >= qfi->qf->metadata->xnslots /*&& is_runend(qfi->qf, qfi->current)*/) return true;
-  return false;
-}
-
-/*
- * Merge qfa and qfb into qfc
- */
-/*
- * iterate over both qf (qfa and qfb)
- * simultaneously
- * for each index i
- * min(get_value(qfa, ia) < get_value(qfb, ib))
- * insert(min, ic)
- * increment either ia or ib, whichever is minimum.
- */
-void qf_merge(const QF *qfa, const QF *qfb, QF *qfc) {
-  QFi qfia, qfib;
-  qf_iterator_from_position(qfa, &qfia, 0);
-  qf_iterator_from_position(qfb, &qfib, 0);
-
-  if (qfa->metadata->hash_mode != qfc->metadata->hash_mode && qfa->metadata->seed != qfc->metadata->seed &&
-      qfb->metadata->hash_mode != qfc->metadata->hash_mode && qfb->metadata->seed != qfc->metadata->seed) {
-    fprintf(stderr, "Output QF and input QFs do not have the same hash mode or seed.\n");
-    exit(1);
-  }
-
-  uint64_t keya, valuea, counta, keyb, valueb, countb;
-  qfi_get_hash(&qfia, &keya, &valuea, &counta);
-  qfi_get_hash(&qfib, &keyb, &valueb, &countb);
-  do {
-    if (keya < keyb) {
-      qf_insert(qfc, keya, valuea, counta, QF_NO_LOCK | QF_KEY_IS_HASH);
-      qfi_next(&qfia);
-      qfi_get_hash(&qfia, &keya, &valuea, &counta);
-    } else {
-      qf_insert(qfc, keyb, valueb, countb, QF_NO_LOCK | QF_KEY_IS_HASH);
-      qfi_next(&qfib);
-      qfi_get_hash(&qfib, &keyb, &valueb, &countb);
-    }
-  } while (!qfi_end(&qfia) && !qfi_end(&qfib));
-
-  if (!qfi_end(&qfia)) {
-    do {
-      qfi_get_hash(&qfia, &keya, &valuea, &counta);
-      qf_insert(qfc, keya, valuea, counta, QF_NO_LOCK | QF_KEY_IS_HASH);
-    } while (!qfi_next(&qfia));
-  }
-  if (!qfi_end(&qfib)) {
-    do {
-      qfi_get_hash(&qfib, &keyb, &valueb, &countb);
-      qf_insert(qfc, keyb, valueb, countb, QF_NO_LOCK | QF_KEY_IS_HASH);
-    } while (!qfi_next(&qfib));
-  }
-}
-
-/*
- * Merge an array of qfs into the resultant QF
- */
-void qf_multi_merge(const QF *qf_arr[], int nqf, QF *qfr) {
-  int i;
-  QFi qfi_arr[nqf];
-  int smallest_idx = 0;
-  uint64_t smallest_key = UINT64_MAX;
-  for (i = 0; i < nqf; i++) {
-    if (qf_arr[i]->metadata->hash_mode != qfr->metadata->hash_mode && qf_arr[i]->metadata->seed != qfr->metadata->seed) {
-      fprintf(stderr, "Output QF and input QFs do not have the same hash mode or seed.\n");
-      exit(1);
-    }
-    qf_iterator_from_position(qf_arr[i], &qfi_arr[i], 0);
-  }
-
-  DEBUG_CQF("Merging %d CQFs\n", nqf);
-  for (i = 0; i < nqf; i++) {
-    DEBUG_CQF("CQF %d\n", i);
-    DEBUG_DUMP(qf_arr[i]);
-  }
-
-  while (nqf > 1) {
-    uint64_t keys[nqf];
-    uint64_t values[nqf];
-    uint64_t counts[nqf];
-    for (i = 0; i < nqf; i++) qfi_get_hash(&qfi_arr[i], &keys[i], &values[i], &counts[i]);
-
-    do {
-      smallest_key = UINT64_MAX;
-      for (i = 0; i < nqf; i++) {
-        if (keys[i] < smallest_key) {
-          smallest_key = keys[i];
-          smallest_idx = i;
-        }
-      }
-      qf_insert(qfr, keys[smallest_idx], values[smallest_idx], counts[smallest_idx], QF_NO_LOCK | QF_KEY_IS_HASH);
-      qfi_next(&qfi_arr[smallest_idx]);
-      qfi_get_hash(&qfi_arr[smallest_idx], &keys[smallest_idx], &values[smallest_idx], &counts[smallest_idx]);
-    } while (!qfi_end(&qfi_arr[smallest_idx]));
-
-    /* remove the qf that is exhausted from the array */
-    if (smallest_idx < nqf - 1)
-      memmove(&qfi_arr[smallest_idx], &qfi_arr[smallest_idx + 1], (nqf - smallest_idx - 1) * sizeof(qfi_arr[0]));
-    nqf--;
-  }
-  if (!qfi_end(&qfi_arr[0])) {
-    uint64_t iters = 0;
-    do {
-      uint64_t key, value, count;
-      qfi_get_hash(&qfi_arr[0], &key, &value, &count);
-      qf_insert(qfr, key, value, count, QF_NO_LOCK | QF_KEY_IS_HASH);
-      qfi_next(&qfi_arr[0]);
-      iters++;
-    } while (!qfi_end(&qfi_arr[0]));
-    DEBUG_CQF("Num of iterations: %lu\n", iters);
-  }
-
-  DEBUG_CQF("%s", "Final CQF after merging.\n");
-  DEBUG_DUMP(qfr);
-
-  return;
-}
-
-/* find cosine similarity between two QFs. */
-uint64_t qf_inner_product(const QF *qfa, const QF *qfb) {
-  uint64_t acc = 0;
-  QFi qfi;
-  const QF *qf_mem, *qf_disk;
-
-  if (qfa->metadata->hash_mode != qfb->metadata->hash_mode && qfa->metadata->seed != qfb->metadata->seed) {
-    fprintf(stderr, "Input QFs do not have the same hash mode or seed.\n");
-    exit(1);
-  }
-
-  // create the iterator on the larger QF.
-  if (qfa->metadata->total_size_in_bytes > qfb->metadata->total_size_in_bytes) {
-    qf_mem = qfb;
-    qf_disk = qfa;
-  } else {
-    qf_mem = qfa;
-    qf_disk = qfb;
-  }
-
-  qf_iterator_from_position(qf_disk, &qfi, 0);
-  do {
-    uint64_t key = 0, value = 0, count = 0;
-    uint64_t count_mem;
-    qfi_get_hash(&qfi, &key, &value, &count);
-    if ((count_mem = qf_count_key_value(qf_mem, key, 0, QF_KEY_IS_HASH)) > 0) {
-      acc += count * count_mem;
-    }
-  } while (!qfi_next(&qfi));
-
-  return acc;
-}
-
-/* find cosine similarity between two QFs. */
-void qf_intersect(const QF *qfa, const QF *qfb, QF *qfr) {
-  QFi qfi;
-  const QF *qf_mem, *qf_disk;
-
-  if (qfa->metadata->hash_mode != qfr->metadata->hash_mode && qfa->metadata->seed != qfr->metadata->seed &&
-      qfb->metadata->hash_mode != qfr->metadata->hash_mode && qfb->metadata->seed != qfr->metadata->seed) {
-    fprintf(stderr, "Output QF and input QFs do not have the same hash mode or seed.\n");
-    exit(1);
-  }
-
-  // create the iterator on the larger QF.
-  if (qfa->metadata->total_size_in_bytes > qfb->metadata->total_size_in_bytes) {
-    qf_mem = qfb;
-    qf_disk = qfa;
-  } else {
-    qf_mem = qfa;
-    qf_disk = qfb;
-  }
-
-  qf_iterator_from_position(qf_disk, &qfi, 0);
-  do {
-    uint64_t key = 0, value = 0, count = 0;
-    qfi_get_hash(&qfi, &key, &value, &count);
-    if (qf_count_key_value(qf_mem, key, 0, QF_KEY_IS_HASH) > 0) qf_insert(qfr, key, value, count, QF_NO_LOCK | QF_KEY_IS_HASH);
-  } while (!qfi_next(&qfi));
-}
-
-}  // namespace quotient_filter
diff --git a/src/kcount/kcount-gpu/gqf.hpp b/src/kcount/kcount-gpu/gqf.hpp
index 6917385..e69de29 100644
--- a/src/kcount/kcount-gpu/gqf.hpp
+++ b/src/kcount/kcount-gpu/gqf.hpp
@@ -1,384 +0,0 @@
-#pragma once
-
-/*
- * ============================================================================
- *
- *        Authors:  Prashant Pandey <ppandey@cs.stonybrook.edu>
- *                  Rob Johnson <robj@vmware.com>
- *
- * ============================================================================
- */
-
-#include <cuda.h>
-#include <inttypes.h>
-#include <stdbool.h>
-
-namespace quotient_filter {
-
-typedef struct quotient_filter quotient_filter;
-typedef quotient_filter QF;
-
-/* CQFs support three hashing modes:
-
-         - DEFAULT uses a hash that may introduce false positives, but
-this can be useful when inserting large keys that need to be
-hashed down to a small fingerprint.  With this type of hash,
-you can iterate over the hash values of all the keys in the
-CQF, but you cannot iterate over the keys themselves.
-
-         - INVERTIBLE has no false positives, but the size of the hash
-output must be the same as the size of the hash input,
-e.g. 17-bit keys hashed to 17-bit outputs.  So this mode is
-generally only useful when storing small keys in the CQF.  With
-this hashing mode, you can use iterators to enumerate both all
-the hashes in the CQF, or all the keys.
-
-         - NONE, for when you've done the hashing yourself.  WARNING: the
-           CQF can exhibit very bad performance if you insert a skewed
-                 distribution of intputs.
-*/
-
-enum qf_hashmode { QF_HASH_DEFAULT, QF_HASH_INVERTIBLE, QF_HASH_NONE };
-
-enum qf_returns { QF_ITEM_INSERTED, QF_ITEM_FOUND, QF_FULL };
-
-/* The CQF supports concurrent insertions and queries.  Only the
-         portion of the CQF being examined or modified is locked, so it
-         supports high throughput even with many threads.
-
-         The CQF operations support 3 locking modes:
-
-         - NO_LOCK: for single-threaded applications or applications
-that do their own concurrency management.
-
-         - WAIT_FOR_LOCK: Spin until you get the lock, then do the query
-or update.
-
-         - TRY_ONCE_LOCK: If you can't grab the lock on the first try,
-return with an error code.
-*/
-#define QF_NO_LOCK (0x01)
-#define QF_TRY_ONCE_LOCK (0x02)
-#define QF_WAIT_FOR_LOCK (0x04)
-
-/* It is sometimes useful to insert a key that has already been
-         hashed. */
-#define QF_KEY_IS_HASH (0x08)
-
-/******************************************
-         The CQF defines low-level constructor and destructor operations
-         that are designed to enable the application to manage the memory
-         used by the CQF.
-*******************************************/
-
-/*
- * Create an empty CQF in "buffer".  If there is not enough space at
- * buffer then it will return the total size needed in bytes to
- * initialize the CQF.  This function takes ownership of buffer.
- */
-uint64_t qf_init(QF *qf, uint64_t nslots, uint64_t key_bits, uint64_t value_bits, enum qf_hashmode hash, uint32_t seed,
-                 void *buffer, uint64_t buffer_len);
-
-/* Create a CQF in "buffer". Note that this does not initialize the
- contents of bufferss Use this function if you have read a CQF, e.g.
- off of disk or network, and want to begin using that stream of
- bytes as a CQF. The CQF takes ownership of buffer.  */
-uint64_t qf_use(QF *qf, void *buffer, uint64_t buffer_len);
-
-/* Destroy this CQF.  Returns a pointer to the memory that the CQF was
-         using (i.e. passed into qf_init or qf_use) so that the application
-         can release that memory. */
-void *qf_destroy(QF *qf);
-
-/* Allocate a new CQF using "nslots" at "buffer" and copy elements from "qf"
- * into it.
- * If there is not enough space at buffer then it will return the total size
- * needed in bytes to initialize the new CQF.
- * */
-uint64_t qf_resize(QF *qf, uint64_t nslots, void *buffer, uint64_t buffer_len);
-
-/***********************************
-The following convenience functions create and destroy CQFs by
-        using malloc/free to obtain and release the memory for the CQF.
-************************************/
-
-/* Initialize the CQF and allocate memory for the CQF. */
-__host__ bool qf_malloc(QF *qf, uint64_t nslots, uint64_t key_bits, uint64_t value_bits, enum qf_hashmode hash, bool on_device,
-                        uint32_t seed);
-
-__host__ bool qf_free(QF *qf);
-
-__host__ void qf_free_gpu(QF *qf);
-
-/* Resize the QF to the specified number of slots.  Uses malloc() to
- * obtain the new memory, and calls free() on the old memory.
- * Return value:
- *    >= 0: number of keys copied during resizing.
- * */
-int64_t qf_resize_malloc(QF *qf, uint64_t nslots);
-
-/* Turn on automatic resizing.  Resizing is performed by calling
-         qf_resize_malloc, so the CQF must meet the requirements of that
-         function. */
-__host__ void qf_set_auto_resize(QF *qf, bool enabled);
-
-/***********************************
-Functions for modifying the CQF.
-***********************************/
-
-#define QF_NO_SPACE (-1)
-#define QF_COULDNT_LOCK (-2)
-#define QF_DOESNT_EXIST (-3)
-
-/* Increment the counter for this key/value pair by count.
- * Return value:
- *    >= 0: distance from the home slot to the slot in which the key is
- *          inserted (or 0 if count == 0).
- *    == QF_NO_SPACE: the CQF has reached capacity.
- *    == QF_COULDNT_LOCK: TRY_ONCE_LOCK has failed to acquire the lock.
- */
-__host__ __device__ int qf_insert(QF *qf, uint64_t key, uint64_t value, uint64_t count, uint8_t flags);
-
-__host__ void bulk_insert_bucketing_premalloc(QF *qf, uint64_t *keys, uint64_t value, uint64_t count, uint64_t nvals,
-                                              uint64_t slots_per_lock, uint64_t num_locks, uint8_t flags);
-
-__host__ void bulk_insert_bucketing(QF *qf, uint64_t *keys, uint64_t value, uint64_t count, uint64_t nvals, uint64_t slots_per_lock,
-                                    uint64_t num_locks, uint8_t flags);
-
-// functions for controlling buffers
-__host__ void bulk_insert_bucketing_buffer_provided(QF *qf, uint64_t *keys, uint64_t value, uint64_t count, uint64_t nvals,
-                                                    uint64_t slots_per_lock, uint64_t num_locks, uint8_t flags, uint64_t **buffers,
-                                                    uint64_t *buffer_backing, volatile uint64_t *buffer_sizes);
-
-__host__ void bulk_insert_bucketing_buffer_provided_timed(QF *qf, uint64_t *keys, uint64_t value, uint64_t count, uint64_t nvals,
-                                                          uint64_t slots_per_lock, uint64_t num_locks, uint8_t flags,
-                                                          uint64_t **buffers, uint64_t *buffer_backing,
-                                                          volatile uint64_t *buffer_sizes);
-
-__host__ void bulk_insert_one_hash(QF *qf, uint64_t *keys, uint64_t value, uint64_t count, uint64_t nvals, uint64_t slots_per_lock,
-                                   uint64_t num_locks, uint8_t flags, uint64_t **buffers, uint64_t *buffer_backing,
-                                   volatile uint64_t *buffer_sizes);
-
-__host__ void bulk_insert_no_atomics(QF *qf, uint64_t *keys, uint64_t value, uint64_t count, uint64_t nvals,
-                                     uint64_t slots_per_lock, uint64_t num_locks, uint8_t flags, uint64_t **buffers,
-                                     volatile uint64_t *buffer_sizes);
-
-__host__ void free_buffers_premalloced(QF *qf, uint64_t **buffers, uint64_t *buffer_backing, volatile uint64_t *buffer_sizes,
-                                       uint64_t num_buffers);
-
-__host__ uint64_t bulk_get_wrapper(QF *qf, uint64_t *vals, uint64_t nvals);
-/* Set the counter for this key/value pair to count.
- Return value: Same as qf_insert.
- Returns 0 if new count is equal to old count.
-*/
-__host__ __device__ int qf_set_count(QF *qf, uint64_t key, uint64_t value, uint64_t count, uint8_t flags);
-
-/* Remove up to count instances of this key/value combination.
- * If the CQF contains <= count instances, then they will all be
- * removed, which is not an error.
- * Return value:
- *    >=  0: number of slots freed.
- *    == QF_DOESNT_EXIST: Specified item did not exist.
- *    == QF_COULDNT_LOCK: TRY_ONCE_LOCK has failed to acquire the lock.
- */
-__host__ __device__ int qf_remove(QF *qf, uint64_t key, uint64_t value, uint64_t count, uint8_t flags);
-
-/* Remove all instances of this key/value pair. */
-__host__ __device__ int qf_delete_key_value(QF *qf, uint64_t key, uint64_t value, uint8_t flags);
-
-/* Remove all instances of this key. */
-/* NOT IMPLEMENTED YET. */
-// void qf_delete_key(QF *qf, uint64_t key);
-
-/* Replace the association (key, oldvalue, count) with the association
-         (key, newvalue, count). If there is already an association (key,
-         newvalue, count'), then the two associations will be merged and
-         their counters will be summed, resulting in association (key,
-         newvalue, count' + count). */
-/* NOT IMPLEMENTED YET. */
-// void qf_replace(QF *qf, uint64_t key, uint64_t oldvalue, uint64_t newvalue);
-
-/*******************************************
-GPU functions
-******************************/
-/* batch inserts using GPU*/
-__host__ void qf_gpu_launch(QF *qf, uint64_t *vals, uint64_t nvals, uint64_t key_count, uint64_t nhashbits, uint64_t nslots);
-
-__global__ void insert_one_kmer_kernel(QF *qf, uint64_t hash, uint8_t val, uint16_t *locks);
-
-/****************************************
-Query functions
-****************************************/
-
-/* Lookup the value associated with key.  Returns the count of that
-         key/value pair in the QF.  If it returns 0, then, the key is not
-         present in the QF. Only returns the first value associated with key
-         in the QF.  If you want to see others, use an iterator.
-         May return QF_COULDNT_LOCK if called with QF_TRY_LOCK.  */
-__host__ __device__ uint64_t qf_query(const QF *qf, uint64_t key, uint64_t *value, uint8_t flags);
-
-/* Return the number of times key has been inserted, with any value,
-         into qf. */
-/* NOT IMPLEMENTED YET. */
-// uint64_t qf_count_key(const QF *qf, uint64_t key);
-
-/* Return the number of times key has been inserted, with the given
-         value, into qf.
-         May return QF_COULDNT_LOCK if called with QF_TRY_LOCK.  */
-__host__ __device__ uint64_t qf_count_key_value(const QF *qf, uint64_t key, uint64_t value, uint8_t flags);
-
-/* Returns a unique index corresponding to the key in the CQF.  Note
-         that this can change if further modifications are made to the
-         CQF.
-
-         If the key is not found then returns QF_DOESNT_EXIST.
-         May return QF_COULDNT_LOCK if called with QF_TRY_LOCK.
- */
-__host__ __device__ int64_t qf_get_unique_index(const QF *qf, uint64_t key, uint64_t value, uint8_t flags);
-
-/****************************************
-Metadata accessors.
-****************************************/
-
-/* Hashing info */
-enum qf_hashmode qf_get_hashmode(const QF *qf);
-uint64_t qf_get_hash_seed(const QF *qf);
-__uint64_t qf_get_hash_range(const QF *qf);
-
-/* Space usage info. */
-bool qf_is_auto_resize_enabled(const QF *qf);
-uint64_t qf_get_total_size_in_bytes(const QF *qf);
-uint64_t qf_get_nslots(const QF *qf);
-uint64_t qf_get_num_occupied_slots(const QF *qf);
-
-__host__ uint64_t host_qf_get_nslots(const QF *qf);
-__host__ uint64_t host_qf_get_num_occupied_slots(const QF *qf);
-
-/* Bit-sizes info. */
-uint64_t qf_get_num_key_bits(const QF *qf);
-uint64_t qf_get_num_value_bits(const QF *qf);
-uint64_t qf_get_num_key_remainder_bits(const QF *qf);
-uint64_t qf_get_bits_per_slot(const QF *qf);
-
-/* Number of (distinct) key-value pairs. */
-uint64_t qf_get_sum_of_counts(const QF *qf);
-uint64_t qf_get_num_distinct_key_value_pairs(const QF *qf);
-
-void qf_sync_counters(const QF *qf);
-
-/****************************************
-        Iterators
-*****************************************/
-
-typedef struct quotient_filter_iterator quotient_filter_iterator;
-typedef quotient_filter_iterator QFi;
-
-#define QF_INVALID (-4)
-#define QFI_INVALID (-5)
-
-/* Initialize an iterator starting at the given position.
- * Return value:
- *  >= 0: iterator is initialized and positioned at the returned slot.
- *   = QFI_INVALID: iterator has reached end.
- */
-int64_t qf_iterator_from_position(const QF *qf, QFi *qfi, uint64_t position);
-
-/* Initialize an iterator and position it at the smallest index
- * containing a key-value pair whose hash is greater than or equal
- * to the specified key-value pair.
- * Return value:
- *  >= 0: iterator is initialized and position at the returned slot.
- *   = QFI_INVALID: iterator has reached end.
- */
-int64_t qf_iterator_from_key_value(const QF *qf, QFi *qfi, uint64_t key, uint64_t value, uint8_t flags);
-
-/* Requires that the hash mode of the CQF is INVERTIBLE or NONE.
- * If the hash mode is DEFAULT then returns QF_INVALID.
- * Return value:
- *   = 0: Iterator is still valid.
- *   = QFI_INVALID: iterator has reached end.
- *   = QF_INVALID: hash mode is QF_DEFAULT_HASH
- */
-int qfi_get_key(const QFi *qfi, uint64_t *key, uint64_t *value, uint64_t *count);
-
-/* Return value:
- *   = 0: Iterator is still valid.
- *   = QFI_INVALID: iterator has reached end.
- */
-int qfi_get_hash(const QFi *qfi, uint64_t *hash, uint64_t *value, uint64_t *count);
-
-/* Advance to next entry.
- * Return value:
- *   = 0: Iterator is still valid.
- *   = QFI_INVALID: iterator has reached end.
- */
-int qfi_next(QFi *qfi);
-
-/* Check to see if the if the end of the QF */
-bool qfi_end(const QFi *qfi);
-
-/************************************
-Miscellaneous convenience functions.
-*************************************/
-
-/* Reset the CQF to an empty filter. */
-void qf_reset(QF *qf);
-
-/* The caller should call qf_init on the dest QF using the same
- * parameters as the src QF before calling this function. Note: src
- * and dest must be exactly the same, including number of slots.  */
-void qf_copy(QF *dest, const QF *src);
-
-/* merge two QFs into the third one. Note: merges with any existing
-         values in qfc.  */
-void qf_merge(const QF *qfa, const QF *qfb, QF *qfc);
-
-/* merge multiple QFs into the final QF one. */
-void qf_multi_merge(const QF *qf_arr[], int nqf, QF *qfr);
-
-/* find cosine similarity between two QFs. */
-uint64_t qf_inner_product(const QF *qfa, const QF *qfb);
-
-/* square of the L_2 norm of a QF (i.e. sum of squares of counts of
-         all items in the CQF). */
-uint64_t qf_magnitude(const QF *qf);
-
-/***********************************
-        Debugging functions.
-************************************/
-
-__host__ __device__ void qf_dump(const QF *);
-__host__ __device__ void qf_dump_metadata(const QF *qf);
-
-// TEMPORARILY EXPOSED FOR DEBUGGING
-
-// FUNCTIONS FOR MHM2
-
-// construct a filter, takes in the address of a pointer
-__host__ void qf_malloc_device(QF **qf, int nbits);
-
-// device_funcs for interacting with the filter
-__device__ qf_returns insert_kmer(QF *qf, uint64_t hash, char forward, char backward, char &returnedfwd, char &returnedback);
-
-__device__ qf_returns insert_kmer_not_exists(QF *qf, uint64_t hash, char forward, char backward, char &returnedfwd,
-                                             char &returnedback);
-
-// destroys a filter
-__host__ void qf_destroy_device(QF *qf);
-
-__host__ __device__ uint8_t encode_kmer_counter(uint8_t *counter);
-__host__ __device__ void decode_kmer_counter(uint8_t *counter, uint8_t stored);
-
-__host__ __device__ bool is_encodable(uint8_t *counter);
-//__global__ void insert_multi_kmer_kernel_first(QF* qf, uint16_t * locks, uint64_t * hashes, uint8_t * firsts, uint8_t * seconds,
-// uint64_t nitems);
-__global__ void insert_multi_kmer_kernel(QF *qf, uint64_t *hashes, uint8_t *firsts, uint8_t *seconds, uint64_t nitems,
-                                         uint64_t *counter);
-
-__host__ uint64_t qf_estimate_memory(int nbits);
-
-// get mem usage from dev side QF
-
-//__host__ uint64_t get_current_usage(QF* qf);
-
-}  // namespace quotient_filter
diff --git a/src/kcount/kcount-gpu/gqf_int.hpp b/src/kcount/kcount-gpu/gqf_int.hpp
index a83962b..e69de29 100644
--- a/src/kcount/kcount-gpu/gqf_int.hpp
+++ b/src/kcount/kcount-gpu/gqf_int.hpp
@@ -1,138 +0,0 @@
-#pragma once
-
-/*
- * ============================================================================
- *
- *        Authors:  Prashant Pandey <ppandey@cs.stonybrook.edu>
- *                  Rob Johnson <robj@vmware.com>
- *
- * ============================================================================
- */
-
-#include <inttypes.h>
-#include <stdbool.h>
-
-#include "gqf.hpp"
-#include "partitioned_counter.hpp"
-
-namespace quotient_filter {
-
-#define MAGIC_NUMBER 1018874902021329732
-
-/* Can be
-   0 (choose size at run-time),
-   8, 16, 32, or 64 (for optimized versions),
-   or other integer <= 56 (for compile-time-optimized bit-shifting-based versions)
-*/
-
-// move #define to gqf.cu
-#define QF_BITS_PER_REMAINDER 10
-#define QF_BITS_PER_VALUE 6
-
-#define QF_BITS_PER_SLOT 16
-
-/* Must be >= 6.  6 seems fastest. */
-#define QF_BLOCK_OFFSET_BITS (6)
-
-#define QF_SLOTS_PER_BLOCK (1ULL << QF_BLOCK_OFFSET_BITS)
-#define QF_METADATA_WORDS_PER_BLOCK ((QF_SLOTS_PER_BLOCK + 63) / 64)
-
-typedef struct {
-  /* Code works with uint16_t, uint32_t, etc, but uint8_t seems just as fast as
-   * anything else */
-  uint8_t offset;
-  uint64_t occupieds[QF_METADATA_WORDS_PER_BLOCK];
-  uint64_t runends[QF_METADATA_WORDS_PER_BLOCK];
-#if QF_BITS_PER_SLOT == 8
-  uint8_t slots[QF_SLOTS_PER_BLOCK];
-#elif QF_BITS_PER_SLOT == 16
-  uint16_t slots[QF_SLOTS_PER_BLOCK];
-#elif QF_BITS_PER_SLOT == 32
-  uint32_t slots[QF_SLOTS_PER_BLOCK];
-#elif QF_BITS_PER_SLOT == 64
-  uint64_t slots[QF_SLOTS_PER_BLOCK];
-#elif QF_BITS_PER_SLOT != 0
-  uint8_t slots[QF_SLOTS_PER_BLOCK * QF_BITS_PER_SLOT / 8];
-#else
-  uint8_t slots[];
-#endif
-} __attribute__((__packed__)) qfblock;
-
-//	struct __attribute__ ((__packed__)) qfblock;
-//	typedef struct qfblock qfblock;
-
-typedef struct file_info {
-  int fd;
-  char *filepath;
-} file_info;
-
-// The below struct is used to instrument the code.
-// It is not used in normal operations of the CQF.
-typedef struct {
-  uint64_t total_time_single;
-  uint64_t total_time_spinning;
-  uint64_t locks_taken;
-  uint64_t locks_acquired_single_attempt;
-} wait_time_data;
-
-typedef struct quotient_filter_runtime_data {
-  file_info f_info;
-  uint32_t auto_resize;
-  int64_t (*container_resize)(QF *qf, uint64_t nslots);
-  pc_t pc_nelts;
-  pc_t pc_ndistinct_elts;
-  pc_t pc_noccupied_slots;
-  uint64_t num_locks;
-  volatile int metadata_lock;
-  uint16_t *locks;
-  wait_time_data *wait_times;
-} quotient_filter_runtime_data;
-
-typedef quotient_filter_runtime_data qfruntime;
-
-typedef struct quotient_filter_metadata {
-  uint64_t magic_endian_number;
-  enum qf_hashmode hash_mode;
-  uint32_t reserved;
-  uint64_t total_size_in_bytes;
-  uint32_t seed;
-  uint64_t nslots;
-  uint64_t xnslots;
-  uint64_t key_bits;
-  uint64_t value_bits;
-  uint64_t key_remainder_bits;
-  uint64_t bits_per_slot;
-  uint64_t range;
-  uint64_t nblocks;
-  uint64_t nelts;
-  uint64_t ndistinct_elts;
-  uint64_t noccupied_slots;
-} quotient_filter_metadata;
-
-typedef quotient_filter_metadata qfmetadata;
-
-typedef struct quotient_filter {
-  qfruntime *runtimedata;
-  qfmetadata *metadata;
-  qfblock *blocks;
-} quotient_filter;
-
-typedef quotient_filter QF;
-
-// The below struct is used to instrument the code.
-// It is not used in normal operations of the CQF.
-typedef struct cluster_data {
-  uint64_t start_index;
-  uint16_t length;
-} cluster_data;
-typedef struct quotient_filter_iterator {
-  const QF *qf;
-  uint64_t run;
-  uint64_t current;
-  uint64_t cur_start_index;
-  uint16_t cur_length;
-  uint32_t num_clusters;
-  cluster_data *c_info;
-} quotient_filter_iterator;
-
-}  // namespace quotient_filter
diff --git a/src/kcount/kcount-gpu/hashutil.cpp b/src/kcount/kcount-gpu/hashutil.cpp
index 962bb58..e69de29 100644
--- a/src/kcount/kcount-gpu/hashutil.cpp
+++ b/src/kcount/kcount-gpu/hashutil.cpp
@@ -1,179 +0,0 @@
-/*
- * ============================================================================
- *
- *        Authors:  Prashant Pandey <ppandey@cs.stonybrook.edu>
- *                  Rob Johnson <robj@vmware.com>
- *
- * ============================================================================
- */
-
-#include "hashutil.hpp"
-
-//-----------------------------------------------------------------------------
-// MurmurHash2, 64-bit versions, by Austin Appleby
-
-// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment
-// and endian-ness issues if used across multiple platforms.
-
-// 64-bit hash for 64-bit platforms
-
-__host__ __device__ uint64_t MurmurHash64A(const void *key, int len, unsigned int seed) {
-  const uint64_t m = 0xc6a4a7935bd1e995;
-  const int r = 47;
-
-  uint64_t h = seed ^ (len * m);
-
-  const uint64_t *data = (const uint64_t *)key;
-  const uint64_t *end = data + (len / 8);
-
-  while (data != end) {
-    uint64_t k = *data++;
-
-    k *= m;
-    k ^= k >> r;
-    k *= m;
-
-    h ^= k;
-    h *= m;
-  }
-
-  const unsigned char *data2 = (const unsigned char *)data;
-
-  switch (len & 7) {
-    case 7: h ^= (uint64_t)data2[6] << 48;
-    case 6: h ^= (uint64_t)data2[5] << 40;
-    case 5: h ^= (uint64_t)data2[4] << 32;
-    case 4: h ^= (uint64_t)data2[3] << 24;
-    case 3: h ^= (uint64_t)data2[2] << 16;
-    case 2: h ^= (uint64_t)data2[1] << 8;
-    case 1: h ^= (uint64_t)data2[0]; h *= m;
-  };
-
-  h ^= h >> r;
-  h *= m;
-  h ^= h >> r;
-
-  return h;
-}
-
-// 64-bit hash for 32-bit platforms
-
-__host__ __device__ uint64_t MurmurHash64B(const void *key, int len, unsigned int seed) {
-  const unsigned int m = 0x5bd1e995;
-  const int r = 24;
-
-  unsigned int h1 = seed ^ len;
-  unsigned int h2 = 0;
-
-  const unsigned int *data = (const unsigned int *)key;
-
-  while (len >= 8) {
-    unsigned int k1 = *data++;
-    k1 *= m;
-    k1 ^= k1 >> r;
-    k1 *= m;
-    h1 *= m;
-    h1 ^= k1;
-    len -= 4;
-
-    unsigned int k2 = *data++;
-    k2 *= m;
-    k2 ^= k2 >> r;
-    k2 *= m;
-    h2 *= m;
-    h2 ^= k2;
-    len -= 4;
-  }
-
-  if (len >= 4) {
-    unsigned int k1 = *data++;
-    k1 *= m;
-    k1 ^= k1 >> r;
-    k1 *= m;
-    h1 *= m;
-    h1 ^= k1;
-    len -= 4;
-  }
-
-  switch (len) {
-    case 3: h2 ^= ((unsigned char *)data)[2] << 16;
-    case 2: h2 ^= ((unsigned char *)data)[1] << 8;
-    case 1: h2 ^= ((unsigned char *)data)[0]; h2 *= m;
-  };
-
-  h1 ^= h2 >> 18;
-  h1 *= m;
-  h2 ^= h1 >> 22;
-  h2 *= m;
-  h1 ^= h2 >> 17;
-  h1 *= m;
-  h2 ^= h1 >> 19;
-  h2 *= m;
-
-  uint64_t h = h1;
-
-  h = (h << 32) | h2;
-
-  return h;
-}
-
-/*
- *   For any 1<k<=64, let mask=(1<<k)-1. hash_64() is a bijection on [0,1<<k),
- *   which means
- *     hash_64(x, mask)==hash_64(y, mask) if and only if x==y. hash_64i() is
- *     the inversion of
- *       hash_64(): hash_64i(hash_64(x, mask), mask) == hash_64(hash_64i(x,
- *       mask), mask) == x.
- */
-
-// Thomas Wang's integer hash functions. See
-// <https://gist.github.com/lh3/59882d6b96166dfc3d8d> for a snapshot.
-
-__host__ __device__ uint64_t hash_64(uint64_t key, uint64_t mask) {
-  key = (~key + (key << 21)) & mask;  // key = (key << 21) - key - 1;
-  key = key ^ key >> 24;
-  key = ((key + (key << 3)) + (key << 8)) & mask;  // key * 265
-  key = key ^ key >> 14;
-  key = ((key + (key << 2)) + (key << 4)) & mask;  // key * 21
-  key = key ^ key >> 28;
-  key = (key + (key << 31)) & mask;
-  return key;
-}
-
-// The inversion of hash_64(). Modified from
-// <https://naml.us/blog/tag/invertible>
-__host__ __device__ uint64_t hash_64i(uint64_t key, uint64_t mask) {
-  uint64_t tmp;
-
-  // Invert key = key + (key << 31)
-  tmp = (key - (key << 31));
-  key = (key - (tmp << 31)) & mask;
-
-  // Invert key = key ^ (key >> 28)
-  tmp = key ^ key >> 28;
-  key = key ^ tmp >> 28;
-
-  // Invert key *= 21
-  key = (key * 14933078535860113213ull) & mask;
-
-  // Invert key = key ^ (key >> 14)
-  tmp = key ^ key >> 14;
-  tmp = key ^ tmp >> 14;
-  tmp = key ^ tmp >> 14;
-  key = key ^ tmp >> 14;
-
-  // Invert key *= 265
-  key = (key * 15244667743933553977ull) & mask;
-
-  // Invert key = key ^ (key >> 24)
-  tmp = key ^ key >> 24;
-  key = key ^ tmp >> 24;
-
-  // Invert key = (~key) + (key << 21)
-  tmp = ~key;
-  tmp = ~(key - (tmp << 21));
-  tmp = ~(key - (tmp << 21));
-  key = ~(key - (tmp << 21)) & mask;
-
-  return key;
-}
diff --git a/src/kcount/kcount-gpu/hashutil.hpp b/src/kcount/kcount-gpu/hashutil.hpp
index b09fb65..e69de29 100644
--- a/src/kcount/kcount-gpu/hashutil.hpp
+++ b/src/kcount/kcount-gpu/hashutil.hpp
@@ -1,25 +0,0 @@
-#pragma once
-
-/*
- * ============================================================================
- *
- *        Authors:  Prashant Pandey <ppandey@cs.stonybrook.edu>
- *                  Rob Johnson <robj@vmware.com>
- *
- * ============================================================================
- */
-
-#ifndef _HASHUTIL_CUH_
-#define _HASHUTIL_CUH_
-
-#include <sys/types.h>
-#include <stdlib.h>
-#include <stdint.h>
-
-__host__ __device__ uint64_t MurmurHash64B(const void* key, int len, unsigned int seed);
-__host__ __device__ uint64_t MurmurHash64A(const void* key, int len, unsigned int seed);
-
-__host__ __device__ uint64_t hash_64(uint64_t key, uint64_t mask);
-__host__ __device__ uint64_t hash_64i(uint64_t key, uint64_t mask);
-
-#endif  // #ifndef _HASHUTIL_H_
diff --git a/src/kcount/kcount-gpu/parse_and_pack.cpp b/src/kcount/kcount-gpu/parse_and_pack.cpp
index a3ecf17..e69de29 100644
--- a/src/kcount/kcount-gpu/parse_and_pack.cpp
+++ b/src/kcount/kcount-gpu/parse_and_pack.cpp
@@ -1,325 +0,0 @@
-/*
- HipMer v 2.0, Copyright (c) 2020, The Regents of the University of California,
- through Lawrence Berkeley National Laboratory (subject to receipt of any required
- approvals from the U.S. Dept. of Energy).  All rights reserved."
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- (1) Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- (2) Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation and/or
- other materials provided with the distribution.
-
- (3) Neither the name of the University of California, Lawrence Berkeley National
- Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to
- endorse or promote products derived from this software without specific prior
- written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
- SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
- DAMAGE.
-
- You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades
- to the features, functionality or performance of the source code ("Enhancements") to
- anyone; however, if you choose to make your Enhancements available either publicly,
- or directly to Lawrence Berkeley National Laboratory, without imposing a separate
- written license agreement for such Enhancements, then you hereby grant the following
- license: a  non-exclusive, royalty-free perpetual license to install, use, modify,
- prepare derivative works, incorporate into other computer software, distribute, and
- sublicense such enhancements or derivative works thereof, in binary and source code
- form.
-*/
-
-#include <iostream>
-#include <fstream>
-#include <sstream>
-#include <chrono>
-#include <tuple>
-#include <cuda_runtime_api.h>
-#include <cuda.h>
-
-#include "upcxx_utils/colors.h"
-#include "gpu-utils/gpu_common.hpp"
-#include "gpu-utils/gpu_utils.hpp"
-#include "parse_and_pack.hpp"
-
-using namespace std;
-using namespace gpu_common;
-
-__constant__ uint64_t GPU_0_MASK[32] = {
-    0x0000000000000000, 0xC000000000000000, 0xF000000000000000, 0xFC00000000000000, 0xFF00000000000000, 0xFFC0000000000000,
-    0xFFF0000000000000, 0xFFFC000000000000, 0xFFFF000000000000, 0xFFFFC00000000000, 0xFFFFF00000000000, 0xFFFFFC0000000000,
-    0xFFFFFF0000000000, 0xFFFFFFC000000000, 0xFFFFFFF000000000, 0xFFFFFFFC00000000, 0xFFFFFFFF00000000, 0xFFFFFFFFC0000000,
-    0xFFFFFFFFF0000000, 0xFFFFFFFFFC000000, 0xFFFFFFFFFF000000, 0xFFFFFFFFFFC00000, 0xFFFFFFFFFFF00000, 0xFFFFFFFFFFFC0000,
-    0xFFFFFFFFFFFF0000, 0xFFFFFFFFFFFFC000, 0xFFFFFFFFFFFFF000, 0xFFFFFFFFFFFFFC00, 0xFFFFFFFFFFFFFF00, 0xFFFFFFFFFFFFFFC0,
-    0xFFFFFFFFFFFFFFF0, 0xFFFFFFFFFFFFFFFC};
-
-struct kcount_gpu::ParseAndPackDriverState {
-  cudaEvent_t event;
-  int rank_me;
-};
-
-inline __device__ uint64_t quick_hash(uint64_t v) {
-  v = v * 3935559000370003845 + 2691343689449507681;
-  v ^= v >> 21;
-  v ^= v << 37;
-  v ^= v >> 4;
-  v *= 4768777513237032717;
-  v ^= v << 20;
-  v ^= v >> 41;
-  v ^= v << 5;
-  return v;
-}
-
-__device__ uint64_t gpu_minimizer_hash_fast(int m, int kmer_len, int num_longs, uint64_t *longs, uint64_t *rc_longs) {
-  const int chunk_step = 32 - ((m + 3) / 4) * 4;  // chunk_step is a multiple of 4
-
-  int base;
-  int num_candidates = kmer_len - m + 1;
-  const int max_candidates = MAX_BUILD_KMER;
-  uint64_t rc_candidates[max_candidates];
-
-  // calculate and temporarily store all revcomp minimizer candidates on the stack
-  for (base = 0; base <= kmer_len - m; base += chunk_step) {
-    int shift = base % 32;
-    int l = base / 32;
-    uint64_t tmp = rc_longs[l];
-    if (shift) {
-      tmp = (tmp << (shift * 2));
-      if (l < num_longs - 1) tmp |= rc_longs[l + 1] >> (64 - shift * 2);
-    }
-    for (int j = 0; j < chunk_step; j++) {
-      if (base + j + m > kmer_len) break;
-      rc_candidates[base + j] = ((tmp << (j * 2)) & GPU_0_MASK[m]);
-    }
-  }
-
-  uint64_t minimizer = 0;
-  // calculate and compare minimizers from revcomp
-  for (base = 0; base <= kmer_len - m; base += chunk_step) {
-    int shift = base % 32;
-    int l = base / 32;
-    uint64_t tmp = longs[l];
-    if (shift) {
-      tmp = (tmp << (shift * 2));
-      if (l < num_longs - 1) tmp |= longs[l + 1] >> (64 - shift * 2);
-    }
-    for (int j = 0; j < chunk_step; j++) {
-      if (base + j + m > kmer_len) break;
-      uint64_t fwd_candidate = ((tmp << (j * 2)) & GPU_0_MASK[m]);
-      auto &rc_candidate = rc_candidates[num_candidates - base - j - 1];
-      uint64_t &least_candidate = (fwd_candidate < rc_candidate) ? fwd_candidate : rc_candidate;
-      if (least_candidate > minimizer) minimizer = least_candidate;
-    }
-  }
-  return quick_hash(minimizer);
-}
-
-__global__ void parse_and_pack(char *seqs, int minimizer_len, int kmer_len, int num_longs, int seqs_len, int *kmer_targets,
-                               int num_ranks) {
-  int num_kmers = seqs_len - kmer_len + 1;
-  const int MAX_LONGS = (MAX_BUILD_KMER + 31) / 32;
-  uint64_t kmer[MAX_LONGS];
-  unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (threadid < num_kmers) {
-    if (pack_seq_to_kmer(&(seqs[threadid]), kmer_len, num_longs, kmer)) {
-      uint64_t kmer_rc[MAX_LONGS];
-      revcomp(kmer, kmer_rc, kmer_len, num_longs);
-      kmer_targets[threadid] = gpu_minimizer_hash_fast(minimizer_len, kmer_len, num_longs, kmer, kmer_rc) % num_ranks;
-    } else {
-      // indicate invalid with -1
-      kmer_targets[threadid] = -1;
-    }
-  }
-}
-
-inline __device__ bool is_valid_base(char base) { return (base != '_' && base != 'N'); }
-
-__global__ void build_supermers(char *seqs, int *kmer_targets, int num_kmers, int kmer_len, int seqs_len,
-                                kcount_gpu::SupermerInfo *supermers, unsigned int *num_supermers, unsigned int *num_valid_kmers,
-                                int rank_me) {
-  // builds a single supermer starting at a given kmer, but only if the kmer is a valid start to a supermer
-  int my_valid_kmers = 0;
-  unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;
-  if (threadid == 0 && kmer_targets[threadid] != -1) my_valid_kmers++;
-  if (threadid > 0 && threadid < num_kmers) {
-    int target = kmer_targets[threadid];
-    if (target != -1) {
-      my_valid_kmers++;
-      bool prev_target_ok = false;
-      if (threadid == 1) {
-        prev_target_ok = true;
-      } else {
-        if (kmer_targets[threadid - 1] != target) {
-          // prev kmer was a different or invalid target
-          prev_target_ok = true;
-        } else {
-          // prev kmer was the same target, but was not a valid start to a supermer
-          if (!is_valid_base(seqs[threadid - 2]) || !is_valid_base(seqs[threadid - 1 + kmer_len])) prev_target_ok = true;
-        }
-      }
-      // make sure this is the first kmer for this target
-      if (prev_target_ok && is_valid_base(seqs[threadid - 1]) && is_valid_base(seqs[threadid + kmer_len])) {
-        int supermer_start_i = threadid - 1;
-        int supermer_len = kmer_len + 2;
-        // build the supermer
-        for (int i = threadid + 1; i < num_kmers - 1; i++) {
-          auto next_target = kmer_targets[i];
-          int end_pos = supermer_start_i + supermer_len;  // i + kmer_len;
-          if (next_target == target && end_pos < seqs_len && is_valid_base(seqs[end_pos]))
-            supermer_len++;
-          else
-            break;
-        }
-        // get a slot for the supermer
-        int slot = atomicAdd(num_supermers, 1);
-        supermers[slot].target = target;
-        supermers[slot].offset = supermer_start_i;
-        supermers[slot].len = supermer_len;
-      }
-    }
-  }
-  reduce(my_valid_kmers, num_kmers, num_valid_kmers);
-}
-
-inline __device__ uint8_t get_packed_val(char base) {
-  switch (base) {
-    case 'a': return 1;
-    case 'c': return 2;
-    case 'g': return 3;
-    case 't': return 4;
-    case 'A': return 5;
-    case 'C': return 6;
-    case 'G': return 7;
-    case 'T': return 8;
-    case 'N':
-    case 'n': return 9;
-    case '_':
-    case 0: return 0;
-    default: printf("Invalid value encountered when packing: %d\n", (int)base);
-  };
-  return 0;
-}
-
-__global__ void pack_seqs(char *dev_seqs, char *dev_packed_seqs, int seqs_len) {
-  unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;
-  int packed_seqs_len = seqs_len / 2 + seqs_len % 2;
-  if (threadid < packed_seqs_len) {
-    int seqs_i = threadid * 2;
-    char packed = (get_packed_val(dev_seqs[seqs_i]) << 4);
-    packed |= get_packed_val(dev_seqs[seqs_i + 1]);
-    dev_packed_seqs[threadid] = packed;
-  }
-}
-
-inline int halve_up(int x) { return x / 2 + x % 2; }
-
-kcount_gpu::ParseAndPackGPUDriver::ParseAndPackGPUDriver(int upcxx_rank_me, int upcxx_rank_n, int qual_offset, int kmer_len,
-                                                         int num_kmer_longs, int minimizer_len, double &init_time)
-    : upcxx_rank_me(upcxx_rank_me)
-    , upcxx_rank_n(upcxx_rank_n)
-    , kmer_len(kmer_len)
-    , qual_offset(qual_offset)
-    , num_kmer_longs(num_kmer_longs)
-    , minimizer_len(minimizer_len)
-    , t_func(0)
-    , t_kernel(0) {
-  QuickTimer init_timer;
-  init_timer.start();
-  gpu_utils::set_gpu_device(upcxx_rank_me);
-  max_kmers = KCOUNT_SEQ_BLOCK_SIZE - kmer_len + 1;
-
-  cudaErrchk(cudaMalloc((void **)&dev_seqs, KCOUNT_SEQ_BLOCK_SIZE));
-  cudaErrchk(cudaMalloc((void **)&dev_kmer_targets, max_kmers * sizeof(int)));
-
-  cudaErrchk(cudaMalloc((void **)&dev_supermers, max_kmers * sizeof(SupermerInfo)));
-  cudaErrchk(cudaMalloc((void **)&dev_packed_seqs, halve_up(KCOUNT_SEQ_BLOCK_SIZE)));
-  cudaErrchk(cudaMalloc((void **)&dev_num_supermers, sizeof(int)));
-  cudaErrchk(cudaMalloc((void **)&dev_num_valid_kmers, sizeof(int)));
-
-  // total storage required is approx KCOUNT_SEQ_BLOCK_SIZE * (1 + num_kmers_longs * sizeof(uint64_t) + sizeof(int) + 1)
-  dstate = new ParseAndPackDriverState();
-  dstate->rank_me = upcxx_rank_me;
-  init_timer.stop();
-  init_time = init_timer.get_elapsed();
-}
-
-kcount_gpu::ParseAndPackGPUDriver::~ParseAndPackGPUDriver() {
-  cudaFree(dev_seqs);
-  cudaFree(dev_kmer_targets);
-
-  cudaFree(dev_supermers);
-  cudaFree(dev_packed_seqs);
-  cudaFree(dev_num_supermers);
-  cudaFree(dev_num_valid_kmers);
-
-  delete dstate;
-}
-
-bool kcount_gpu::ParseAndPackGPUDriver::process_seq_block(const string &seqs, unsigned int &num_valid_kmers) {
-  QuickTimer func_timer, kernel_timer;
-
-  if (seqs.length() >= KCOUNT_SEQ_BLOCK_SIZE) return false;
-  if (seqs.length() == 0) return false;
-  if (seqs.length() < (unsigned int)kmer_len) return false;
-
-  func_timer.start();
-  gpu_utils::set_gpu_device(dstate->rank_me);
-  cudaErrchk(cudaEventCreateWithFlags(&dstate->event, cudaEventDisableTiming | cudaEventBlockingSync));
-
-  int num_kmers = seqs.length() - kmer_len + 1;
-  cudaErrchk(cudaMemcpy(dev_seqs, &seqs[0], seqs.length(), cudaMemcpyHostToDevice));
-
-  int gridsize, threadblocksize;
-  get_kernel_config(seqs.length(), parse_and_pack, gridsize, threadblocksize);
-  kernel_timer.start();
-  parse_and_pack<<<gridsize, threadblocksize>>>(dev_seqs, minimizer_len, kmer_len, num_kmer_longs, seqs.length(), dev_kmer_targets,
-                                                upcxx_rank_n);
-
-  cudaErrchk(cudaMemset(dev_num_supermers, 0, sizeof(int)));
-  cudaErrchk(cudaMemset(dev_num_valid_kmers, 0, sizeof(int)));
-  get_kernel_config(num_kmers, build_supermers, gridsize, threadblocksize);
-  build_supermers<<<gridsize, threadblocksize>>>(dev_seqs, dev_kmer_targets, num_kmers, kmer_len, seqs.length(), dev_supermers,
-                                                 dev_num_supermers, dev_num_valid_kmers, upcxx_rank_me);
-  cudaErrchk(cudaMemcpy(&num_valid_kmers, dev_num_valid_kmers, sizeof(unsigned int), cudaMemcpyDeviceToHost));
-  unsigned int num_supermers;
-  cudaErrchk(cudaMemcpy(&num_supermers, dev_num_supermers, sizeof(unsigned int), cudaMemcpyDeviceToHost));
-  supermers.resize(num_supermers);
-  cudaErrchk(cudaMemcpy(&(supermers[0]), dev_supermers, num_supermers * sizeof(SupermerInfo), cudaMemcpyDeviceToHost));
-  cudaErrchk(cudaEventSynchronize(dstate->event));
-  cudaErrchk(cudaEventDestroy(dstate->event));
-  kernel_timer.stop();
-  t_kernel += kernel_timer.get_elapsed();
-  func_timer.stop();
-  t_func += func_timer.get_elapsed();
-  return true;
-}
-
-void kcount_gpu::ParseAndPackGPUDriver::pack_seq_block(const string &seqs) {
-  gpu_utils::set_gpu_device(dstate->rank_me);
-  int packed_seqs_len = halve_up(seqs.length());
-  cudaErrchk(cudaMemcpy(dev_seqs, &seqs[0], seqs.length(), cudaMemcpyHostToDevice));
-  cudaErrchk(cudaMemset(dev_packed_seqs, 0, packed_seqs_len));
-  int gridsize, threadblocksize;
-  get_kernel_config(packed_seqs_len, pack_seqs, gridsize, threadblocksize);
-  GPUTimer t;
-  t.start();
-  pack_seqs<<<gridsize, threadblocksize>>>(dev_seqs, dev_packed_seqs, seqs.length());
-  // this GPUTimer forces a wait for the GPU kernel to complete
-  t.stop();
-  t_kernel += t.get_elapsed();
-  packed_seqs.resize(packed_seqs_len);
-  cudaErrchk(cudaMemcpy(&(packed_seqs[0]), dev_packed_seqs, packed_seqs_len, cudaMemcpyDeviceToHost));
-}
-
-tuple<double, double> kcount_gpu::ParseAndPackGPUDriver::get_elapsed_times() { return {t_func, t_kernel}; }
diff --git a/src/kcount/kcount-gpu/parse_and_pack.hpp b/src/kcount/kcount-gpu/parse_and_pack.hpp
index 7f21c2f..e69de29 100644
--- a/src/kcount/kcount-gpu/parse_and_pack.hpp
+++ b/src/kcount/kcount-gpu/parse_and_pack.hpp
@@ -1,89 +0,0 @@
-/*
- HipMer v 2.0, Copyright (c) 2020, The Regents of the University of California,
- through Lawrence Berkeley National Laboratory (subject to receipt of any required
- approvals from the U.S. Dept. of Energy).  All rights reserved."
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- (1) Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- (2) Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation and/or
- other materials provided with the distribution.
-
- (3) Neither the name of the University of California, Lawrence Berkeley National
- Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to
- endorse or promote products derived from this software without specific prior
- written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
- SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
- TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
- DAMAGE.
-
- You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades
- to the features, functionality or performance of the source code ("Enhancements") to
- anyone; however, if you choose to make your Enhancements available either publicly,
- or directly to Lawrence Berkeley National Laboratory, without imposing a separate
- written license agreement for such Enhancements, then you hereby grant the following
- license: a  non-exclusive, royalty-free perpetual license to install, use, modify,
- prepare derivative works, incorporate into other computer software, distribute, and
- sublicense such enhancements or derivative works thereof, in binary and source code
- form.
-*/
-
-#pragma once
-
-#include <vector>
-
-namespace kcount_gpu {
-
-struct ParseAndPackDriverState;
-
-struct SupermerInfo {
-  int target;
-  int offset;
-  uint16_t len;
-};
-
-class ParseAndPackGPUDriver {
-  // this opaque data type stores CUDA specific variables
-  ParseAndPackDriverState *dstate = nullptr;
-
-  int upcxx_rank_me;
-  int upcxx_rank_n;
-  int max_kmers;
-  int kmer_len;
-  int qual_offset;
-  int num_kmer_longs;
-  int minimizer_len;
-  double t_func = 0, t_malloc = 0, t_cp = 0, t_kernel = 0;
-  char *dev_seqs;
-  int *dev_kmer_targets;
-
-  SupermerInfo *dev_supermers;
-  char *dev_packed_seqs;
-  unsigned int *dev_num_supermers;
-  unsigned int *dev_num_valid_kmers;
-
- public:
-  std::vector<SupermerInfo> supermers;
-  std::string packed_seqs;
-
-  ParseAndPackGPUDriver(int upcxx_rank_me, int upcxx_rank_n, int qual_offset, int kmer_len, int num_kmer_longs, int minimizer_len,
-                        double &init_time);
-  ~ParseAndPackGPUDriver();
-  bool process_seq_block(const std::string &seqs, unsigned int &num_valid_kmers);
-  void pack_seq_block(const std::string &seqs);
-  std::tuple<double, double> get_elapsed_times();
-};
-
-}  // namespace kcount_gpu
diff --git a/src/kcount/kcount-gpu/partitioned_counter.cpp b/src/kcount/kcount-gpu/partitioned_counter.cpp
index a219f00..e69de29 100644
--- a/src/kcount/kcount-gpu/partitioned_counter.cpp
+++ b/src/kcount/kcount-gpu/partitioned_counter.cpp
@@ -1,68 +0,0 @@
-/*
- * ============================================================================
- *
- *         Author:  Prashant Pandey (), ppandey@cs.stonybrook.edu
- *   Organization:  Stony Brook University
- *
- * ============================================================================
- */
-
-//#define _GNU_SOURCE
-#include <cuda.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sched.h>
-#include <sys/sysinfo.h>
-#include <linux/unistd.h>
-#include <sys/syscall.h>
-#include <errno.h>
-
-#include "partitioned_counter.hpp"
-
-#define min(a, b) ((a) < (b) ? (a) : (b))
-
-int pc_init(pc_t *pc, int64_t *global_counter, uint32_t num_counters, int32_t threshold) {
-  int num_cpus = (int)sysconf(_SC_NPROCESSORS_ONLN);
-  if (num_cpus < 0) {
-    perror("sysconf");
-    return PC_ERROR;
-  }
-  pc->num_counters = num_counters == 0 ? num_cpus : min((unsigned)num_cpus, num_counters);
-
-  pc->local_counters = (lctr_t *)calloc(pc->num_counters, sizeof(*pc->local_counters));
-  if (pc->local_counters == NULL) {
-    perror("Couldn't allocate memory for local counters.");
-    return PC_ERROR;
-  }
-  /*printf("Padding check: 0: %p 1: %p\n", (void*)&pc->local_counters[0],*/
-  /*(void*)&pc->local_counters[1]);*/
-  pc->global_counter = global_counter;
-  pc->threshold = threshold;
-
-  return 0;
-}
-
-void pc_destructor(pc_t *pc) {
-  pc_sync(pc);
-  lctr_t *lc = pc->local_counters;
-  pc->local_counters = NULL;
-  free(lc);
-}
-
-void pc_add(pc_t *pc, int64_t count) {
-  int cpuid = sched_getcpu();
-  uint32_t counter_id = cpuid % pc->num_counters;
-  int64_t cur_count = __atomic_add_fetch(&pc->local_counters[counter_id].counter, count, __ATOMIC_SEQ_CST);
-  if (cur_count > pc->threshold || cur_count < -pc->threshold) {
-    int64_t new_count = __atomic_exchange_n(&pc->local_counters[counter_id].counter, 0, __ATOMIC_SEQ_CST);
-    __atomic_fetch_add(pc->global_counter, new_count, __ATOMIC_SEQ_CST);
-  }
-}
-
-void pc_sync(pc_t *pc) {
-  for (uint32_t i = 0; i < pc->num_counters; i++) {
-    int64_t c = __atomic_exchange_n(&pc->local_counters[i].counter, 0, __ATOMIC_SEQ_CST);
-    __atomic_fetch_add(pc->global_counter, c, __ATOMIC_SEQ_CST);
-  }
-}
diff --git a/src/kcount/kcount-gpu/partitioned_counter.hpp b/src/kcount/kcount-gpu/partitioned_counter.hpp
index b6b50e8..e69de29 100644
--- a/src/kcount/kcount-gpu/partitioned_counter.hpp
+++ b/src/kcount/kcount-gpu/partitioned_counter.hpp
@@ -1,56 +0,0 @@
-#pragma once
-
-/*
- * ============================================================================
- *
- *         Author:  Prashant Pandey (), ppandey@cs.stonybrook.edu
- *   Organization:  Stony Brook University
- *
- * ============================================================================
- */
-
-#ifndef _PARTITIONED_COUNTER_CUH_
-#define _PARTITIONED_COUNTER_CUH_
-
-#include <cuda.h>
-#include <inttypes.h>
-#include <stdbool.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct local_counter {
-  int64_t counter;
-  int64_t padding[7];
-} local_counter;
-
-typedef struct local_counter lctr_t;
-
-typedef struct partitioned_counter {
-  lctr_t *local_counters;
-  int64_t *global_counter;
-  uint32_t num_counters;
-  int32_t threshold;
-} partitioned_counter;
-
-typedef struct partitioned_counter pc_t;
-
-#define PC_ERROR -1
-
-/* on success returns 0.
- * If allocation fails returns PC_ERROR
- */
-int pc_init(pc_t *pc, int64_t *global_counter, uint32_t num_counters, int32_t threshold);
-
-void pc_destructor(pc_t *pc);
-
-void pc_add(pc_t *pc, int64_t count);
-
-void pc_sync(pc_t *pc);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* _PARTITIONED_COUNTER_CUH_ */
diff --git a/src/kcount/kcount_gpu.cpp b/src/kcount/kcount_gpu.cpp
index 4b31e30..718982a 100644
--- a/src/kcount/kcount_gpu.cpp
+++ b/src/kcount/kcount_gpu.cpp
@@ -101,7 +101,7 @@ static void process_block(SeqBlockInserter<MAX_K> *seq_block_inserter, dist_obje
   state->num_block_calls++;
   future<bool> fut = execute_in_thread_pool(
       [&state, &num_valid_kmers] { return state->pnp_gpu_driver->process_seq_block(state->seq_block, num_valid_kmers); });
-  while (!fut.is_ready()) {
+  while (!fut.ready()) {
     state->num_pnp_gpu_waits++;
     progress();
   }

From dd25703b7fdc08bc4907ba32f0bad89c14e19b00 Mon Sep 17 00:00:00 2001
From: Jan Ciesko <jan.ciesko@gmail.com>
Date: Wed, 20 Mar 2024 10:17:17 -0700
Subject: [PATCH 09/13] Update operator definition syntax to make clang happy

---
 src/kmer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/kmer.cpp b/src/kmer.cpp
index 1a9f7b2..721c794 100644
--- a/src/kmer.cpp
+++ b/src/kmer.cpp
@@ -633,7 +633,7 @@ ostream &operator<<(ostream &out, const Kmer<MAX_K> &k) {
   return out << k.to_string();
 }
 
-#define KMER_K(KMER_LEN) template ostream &operator<<<KMER_LEN>(ostream &out, const Kmer<KMER_LEN> &k);
+#define KMER_K(KMER_LEN) template ostream &operator<< <KMER_LEN>(ostream &out, const Kmer<KMER_LEN> &k);
 
 KMER_K(32);
 #if MAX_BUILD_KMER >= 64

From de1d7e4dd93e50517816603f4e29b5cb96daacbb Mon Sep 17 00:00:00 2001
From: Jan Ciesko <jan.ciesko@gmail.com>
Date: Wed, 20 Mar 2024 10:40:13 -0700
Subject: [PATCH 10/13] Add HIP to CMakeLists.txt files

---
 CMakeLists.txt                       | 24 ++++-----
 src/CMakeLists.txt                   | 80 ++++++++++++++--------------
 src/gpu-utils/CMakeLists.txt         | 10 ++--
 src/kcount/kcount-gpu/CMakeLists.txt | 18 +++----
 test/CMakeLists.txt                  |  2 +-
 5 files changed, 67 insertions(+), 67 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 74355c5..53fe9cd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,23 +85,23 @@ if(NOT CMAKE_BUILD_TYPE)
 endif()
 
 include(CheckLanguage)
-check_language(CUDA)
-if(CMAKE_CUDA_COMPILER)
-  option(ENABLE_CUDA "Enable CUDA" ON)
-  if(ENABLE_CUDA)
-    set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
-    set(CMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS OFF) # only build device link objects
+check_language(HIP)
+if(CMAKE_HIP_COMPILER)
+  option(ENABLE_HIP "Enable HIP" ON)
+  if(ENABLE_HIP)
+    set(CMAKE_HIP_SEPARABLE_COMPILATION ON)
+    set(CMAKE_HIP_RESOLVE_DEVICE_SYMBOLS OFF) # only build device link objects
                                                # for GPU targets
-    enable_language(CUDA)
+    enable_language(HIP)
   endif()
 else()
-  message(STATUS "No CUDA environment detected")
-  set(ENABLE_CUDA
+  message(STATUS "No HIP environment detected")
+  set(ENABLE_HIP
       OFF
-      CACHE BOOL "Enable CUDA" FORCE)
+      CACHE BOOL "Enable HIP" FORCE)
 endif()
-if(ENABLE_CUDA)
-  message(STATUS "Building for GPU with CUDA")
+if(ENABLE_HIP)
+  message(STATUS "Building for GPU with HIP")
 else()
   message(STATUS "Building for CPU only")
 endif()
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5e979b6..859805d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -74,95 +74,95 @@ message(
   "Adding ZLIB for -I${ZLIB_INCLUDE_DIRS} and -L${ZLIB_BASEPATH}/lib64 -L${ZLIB_BASEPATH}/lib)"
 )
 
-if(ENABLE_CUDA)
-  message(STATUS "Building for GPUs with CUDA")
+if(ENABLE_HIP)
+  message(STATUS "Building for GPUs with HIP")
 
-  set(CMAKE_CUDA_STANDARD
+  set(CMAKE_HIP_STANDARD
       14
       CACHE STRING "")
-  set(CMAKE_CUDA_STANDARD_REQUIRED
+  set(CMAKE_HIP_STANDARD_REQUIRED
       ON
       CACHE BOOL "")
-  set(CMAKE_CUDA_EXTENSIONS
+  set(CMAKE_HIP_EXTENSIONS
       OFF
       CACHE BOOL "")
 
-  find_package(CUDA REQUIRED)
+  find_package(HIP REQUIRED)
 
   if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.12)
-    cmake_policy(SET CMP0074 NEW) # use the CUDA_ROOT variable
+    cmake_policy(SET CMP0074 NEW) # use the HIP_ROOT variable
   endif()
 
-  enable_language(CUDA)
-  set(MHM2_MIN_CUDA_ARCH 70)
+  enable_language(HIP)
+  set(MHM2_MIN_HIP_ARCH 70)
 
   if(${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.18)
-    if(NOT CMAKE_CUDA_ARCHITECTURES)
-      include(FindCUDA/select_compute_arch)
-      cuda_detect_installed_gpus(INSTALLED_GPU_CCS_1)
+    if(NOT CMAKE_HIP_ARCHITECTURES)
+      include(FindHIP/select_compute_arch)
+      hip_detect_installed_gpus(INSTALLED_GPU_CCS_1)
       string(STRIP "${INSTALLED_GPU_CCS_1}" INSTALLED_GPU_CCS_2)
       string(REPLACE " " ";" INSTALLED_GPU_CCS_3 "${INSTALLED_GPU_CCS_2}")
-      string(REPLACE "." "" CUDA_ARCH_LIST "${INSTALLED_GPU_CCS_3}")
-      set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_LIST})
+      string(REPLACE "." "" HIP_ARCH_LIST "${INSTALLED_GPU_CCS_3}")
+      set(CMAKE_HIP_ARCHITECTURES ${HIP_ARCH_LIST})
       message(
-        STATUS "Autodetect CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}"
+        STATUS "Autodetect CMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}"
       )
     endif()
-    message(STATUS "Using CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}")
+    message(STATUS "Using CMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}")
   else()
-    find_package(CUDA REQUIRED)
-    cuda_select_nvcc_arch_flags(ARCH_FLAGS_LIST)
+    find_package(HIP REQUIRED)
+    hip_select_nvcc_arch_flags(ARCH_FLAGS_LIST)
     if(ARCH_FLAGS_LIST)
       string(REPLACE ";" " " ARCH_FLAGS "${ARCH_FLAGS_LIST}")
       message("Found ARCH_FLAGS_LIST: ${ARCH_FLAGS_LIST}")
-      message("Found CUDA_GPU_DETECT_OUTPUT: ${CUDA_GPU_DETECT_OUTPUT}")
-      if(NOT CMAKE_CUDA_ARCHITECTURES)
-        string(REPLACE "." "" CMAKE_CUDA_ARCHITECTURES
-                       "${CUDA_GPU_DETECT_OUTPUT}")
+      message("Found HIP_GPU_DETECT_OUTPUT: ${HIP_GPU_DETECT_OUTPUT}")
+      if(NOT CMAKE_HIP_ARCHITECTURES)
+        string(REPLACE "." "" CMAKE_HIP_ARCHITECTURES
+                       "${HIP_GPU_DETECT_OUTPUT}")
       endif()
     else()
-      foreach(_CMAKE_CUDA_COMPUTE_CAPABILITY 70 80)
+      foreach(_CMAKE_HIP_COMPUTE_CAPABILITY 70 80)
         set(ARCH_FLAGS
-            "${ARCH_FLAGS} -gencode arch=compute_${_CMAKE_CUDA_COMPUTE_CAPABILITY},code=sm_${_CMAKE_CUDA_COMPUTE_CAPABILITY}"
+            "${ARCH_FLAGS} -gencode arch=compute_${_CMAKE_HIP_COMPUTE_CAPABILITY},code=sm_${_CMAKE_HIP_COMPUTE_CAPABILITY}"
         )
       endforeach()
-      if(NOT CMAKE_CUDA_COMPUTE_CAPABILITY)
-        set(CMAKE_CUDA_COMPUTE_CAPABILITY
+      if(NOT CMAKE_HIP_COMPUTE_CAPABILITY)
+        set(CMAKE_HIP_COMPUTE_CAPABILITY
             70
             CACHE STRING "")
       endif()
       message(
         WARNING
-          "Could not auto-detect the GPU arch flags building for all gpu compute capabilities 3 - 7 and PTX for ${CMAKE_CUDA_COMPUTE_CAPABILITY}"
+          "Could not auto-detect the GPU arch flags building for all gpu compute capabilities 3 - 7 and PTX for ${CMAKE_HIP_COMPUTE_CAPABILITY}"
       )
       set(ARCH_FLAGS
-          "${ARCH_FLAGS} -gencode arch=compute_${CMAKE_CUDA_COMPUTE_CAPABILITY},code=sm_${CMAKE_CUDA_COMPUTE_CAPABILITY}"
+          "${ARCH_FLAGS} -gencode arch=compute_${CMAKE_HIP_COMPUTE_CAPABILITY},code=sm_${CMAKE_HIP_COMPUTE_CAPABILITY}"
       )
       set(ARCH_FLAGS
-          "${ARCH_FLAGS} -gencode arch=compute_${CMAKE_CUDA_COMPUTE_CAPABILITY},code=compute_${CMAKE_CUDA_COMPUTE_CAPABILITY}"
+          "${ARCH_FLAGS} -gencode arch=compute_${CMAKE_HIP_COMPUTE_CAPABILITY},code=compute_${CMAKE_HIP_COMPUTE_CAPABILITY}"
       )
     endif()
     message(STATUS "ARCH_FLAGS=${ARCH_FLAGS}")
   endif()
 
-  foreach(test_cuda_arch ${CMAKE_CUDA_ARCHITECTURES})
-    if(${test_cuda_arch} LESS ${MHM2_MIN_CUDA_ARCH})
+  foreach(test_hip_arch ${CMAKE_HIP_ARCHITECTURES})
+    if(${test_hip_arch} LESS ${MHM2_MIN_HIP_ARCH})
       message(
         FATAL_ERROR
-          "CUDA architecture ${test_cuda_arch} is incompatible with the minimum ${MHM2_MIN_CUDA_ARCH}.  Try compiling without cuda: -DENABLE_CUDA=OFF"
+          "HIP architecture ${test_hip_arch} is incompatible with the minimum ${MHM2_MIN_HIP_ARCH}.  Try compiling without hip: -DENABLE_HIP=OFF"
       )
     endif()
   endforeach()
 
   set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-  set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
-  set(CMAKE_CUDA_PTX_COMPILATION ON)
+  set(CMAKE_HIP_SEPARABLE_COMPILATION ON)
+  set(CMAKE_HIP_PTX_COMPILATION ON)
 
-  set(CMAKE_CUDA_FLAGS
-      "${CMAKE_CUDA_FLAGS} -Xcompiler=-Wall ${ARCH_FLAGS} -Wno-deprecated-gpu-targets"
+  set(CMAKE_HIP_FLAGS
+      "${CMAKE_HIP_FLAGS} ${ARCH_FLAGS} -Wno-deprecated-gpu-targets"
   )
 
-  message(STATUS "CMAKE_CUDA_FLAGS=${CMAKE_CUDA_FLAGS}")
+  message(STATUS "CMAKE_HIP_FLAGS=${CMAKE_HIP_FLAGS}")
 
 else()
   message(STATUS "Building for CPUs")
@@ -215,7 +215,7 @@ foreach(
   list(APPEND MHM2_TARGET_OBJECTS $<TARGET_OBJECTS:${tgt}>)
 endforeach()
 
-if(ENABLE_CUDA)
+if(ENABLE_HIP)
   set(tgt devices_gpu)
   add_library(${tgt} OBJECT ${tgt}.cpp)
   if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.12)
@@ -250,8 +250,8 @@ set(MHM2_LINK_LIBRARIES
 
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 
-if(ENABLE_CUDA)
-  set_property(TARGET mhm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS OFF)
+if(ENABLE_HIP)
+  set_property(TARGET mhm2 PROPERTY HIP_RESOLVE_DEVICE_SYMBOLS OFF)
   include_directories("gpu-utils")
   add_subdirectory(gpu-utils)
   set(MHM2_LINK_LIBRARIES ${MHM2_LINK_LIBRARIES} GPU_UTILS_LIBRARY_static)
diff --git a/src/gpu-utils/CMakeLists.txt b/src/gpu-utils/CMakeLists.txt
index faaef64..4700e06 100644
--- a/src/gpu-utils/CMakeLists.txt
+++ b/src/gpu-utils/CMakeLists.txt
@@ -1,5 +1,5 @@
-if(NOT ENABLE_CUDA)
-  message(FATAL_ERROR "Trying to build GPU-UTILS but CUDA is not enabled")
+if(NOT ENABLE_HIP)
+  message(FATAL_ERROR "Trying to build GPU-UTILS but HIP is not enabled")
 endif()
 
 add_library(GPU_UTILS_LIBRARY_obj OBJECT gpu_utils.cpp gpu_common.cpp)
@@ -8,7 +8,7 @@ if(${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.13)
 endif()
 
 set_source_files_properties(gpu_utils.cpp gpu_common.cpp
-                            PROPERTIES LANGUAGE CUDA LINKER_LANGUAGE CUDA)
+                            PROPERTIES LANGUAGE HIP LINKER_LANGUAGE HIP)
 option(GPU_UTILS_SHARED "GPU-utils shared library" OFF)
 option(GPU_UTILS_STATIC "GPU-utils static library" ON)
 
@@ -16,7 +16,7 @@ if(GPU_UTILS_SHARED)
   add_library(GPU_UTILS_LIBRARY_shared SHARED
               $<TARGET_OBJECTS:GPU_UTILS_LIBRARY_obj>)
   set_property(TARGET GPU_UTILS_LIBRARY_shared
-               PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+               PROPERTY HIP_RESOLVE_DEVICE_SYMBOLS ON)
   target_link_libraries(GPU_UTILS_LIBRARY_shared INTERFACE)
   install(TARGETS GPU_UTILS_LIBRARY_shared LIBRARY DESTINATION lib)
 endif()
@@ -25,6 +25,6 @@ if(GPU_UTILS_STATIC)
               $<TARGET_OBJECTS:GPU_UTILS_LIBRARY_obj>)
   target_link_libraries(GPU_UTILS_LIBRARY_static INTERFACE)
   set_property(TARGET GPU_UTILS_LIBRARY_static
-               PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
+               PROPERTY HIP_RESOLVE_DEVICE_SYMBOLS ON)
   install(TARGETS GPU_UTILS_LIBRARY_static ARCHIVE DESTINATION lib)
 endif()
diff --git a/src/kcount/kcount-gpu/CMakeLists.txt b/src/kcount/kcount-gpu/CMakeLists.txt
index 0afbeae..dd84757 100644
--- a/src/kcount/kcount-gpu/CMakeLists.txt
+++ b/src/kcount/kcount-gpu/CMakeLists.txt
@@ -9,16 +9,16 @@ if(${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.13)
     -suppress-stack-size-warning>)
 endif()
 
-set_source_files_properties(parse_and_pack.cpp PROPERTIES LANGUAGE CUDA
-                                                          LINKER_LANGUAGE CUDA)
-set_source_files_properties(gpu_hash_table.cpp PROPERTIES LANGUAGE CUDA
-                                                          LINKER_LANGUAGE CUDA)
-set_source_files_properties(gqf.cpp PROPERTIES LANGUAGE CUDA LINKER_LANGUAGE
-                                                             CUDA)
-set_source_files_properties(hashutil.cpp PROPERTIES LANGUAGE CUDA
-                                                    LINKER_LANGUAGE CUDA)
+set_source_files_properties(parse_and_pack.cpp PROPERTIES LANGUAGE HIP
+                                                          LINKER_LANGUAGE HIP)
+set_source_files_properties(gpu_hash_table.cpp PROPERTIES LANGUAGE HIP
+                                                          LINKER_LANGUAGE HIP)
+set_source_files_properties(gqf.cpp PROPERTIES LANGUAGE HIP LINKER_LANGUAGE
+                                                             HIP)
+set_source_files_properties(hashutil.cpp PROPERTIES LANGUAGE HIP
+                                                    LINKER_LANGUAGE HIP)
 set_source_files_properties(partitioned_counter.cpp
-                            PROPERTIES LANGUAGE CUDA LINKER_LANGUAGE CUDA)
+                            PROPERTIES LANGUAGE HIP LINKER_LANGUAGE HIP)
 
 option(KCOUNT_GPU_SHARED "kcount-GPU shared library" OFF)
 option(KCOUNT_GPU_STATIC "kcount-GPU static library" ON)
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index a95e4ad..88a4d5c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -11,7 +11,7 @@ add_subdirectory(googletest)
 
 if(ENABLE_CUDA)
   add_definitions(-DENABLE_GPUS)
-  message(STATUS "Building tests for GPUs with CUDA")
+  message(STATUS "Building tests for GPUs with HIP")
 endif()
 
 set(BINARY mhm2_test)

From e023280c967caa85c195f11da004ccd0e15e8495 Mon Sep 17 00:00:00 2001
From: Jan Ciesko <jan.ciesko@gmail.com>
Date: Wed, 20 Mar 2024 12:01:52 -0700
Subject: [PATCH 11/13] Hipification:  round two

---
 src/gpu-utils/gpu_common.cpp | 22 +++++++++++-----------
 src/gpu-utils/gpu_common.hpp | 15 ++++++++-------
 src/gpu-utils/gpu_utils.cpp  | 30 +++++++++++++++---------------
 src/mhm2.py                  |  2 +-
 4 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/src/gpu-utils/gpu_common.cpp b/src/gpu-utils/gpu_common.cpp
index dc70a8a..df21bb3 100644
--- a/src/gpu-utils/gpu_common.cpp
+++ b/src/gpu-utils/gpu_common.cpp
@@ -48,9 +48,9 @@
 
 namespace gpu_common {
 
-void gpu_die(cudaError_t code, const char *file, int line, bool abort) {
-  if (code != cudaSuccess) {
-    std::cerr << KLRED << "<" << file << ":" << line << "> ERROR:" << KNORM << cudaGetErrorString(code) << "\n";
+void gpu_die(hipError_t code, const char *file, int line, bool abort) {
+  if (code != hipSuccess) {
+    std::cerr << KLRED << "<" << file << ":" << line << "> ERROR:" << KNORM << hipGetErrorString(code) << "\n";
     std::abort();
     // do not throw exceptions -- does not work properly within progress() throw std::runtime_error(outstr);
   }
@@ -71,23 +71,23 @@ void QuickTimer::inc(double s) { secs += s; }
 double QuickTimer::get_elapsed() { return secs; }
 
 GPUTimer::GPUTimer() {
-  cudaErrchk(cudaEventCreate(&start_event));
-  cudaErrchk(cudaEventCreate(&stop_event));
+  cudaErrchk(hipEventCreate(&start_event));
+  cudaErrchk(hipEventCreate(&stop_event));
   elapsed_t_ms = 0;
 }
 
 GPUTimer::~GPUTimer() {
-  cudaErrchk(cudaEventDestroy(start_event));
-  cudaErrchk(cudaEventDestroy(stop_event));
+  cudaErrchk(hipEventDestroy(start_event));
+  cudaErrchk(hipEventDestroy(stop_event));
 }
 
-void GPUTimer::start() { cudaErrchk(cudaEventRecord(start_event, 0)); }
+void GPUTimer::start() { cudaErrchk(hipEventRecord(start_event, 0)); }
 
 void GPUTimer::stop() {
-  cudaErrchk(cudaEventRecord(stop_event, 0));
-  cudaErrchk(cudaEventSynchronize(stop_event));
+  cudaErrchk(hipEventRecord(stop_event, 0));
+  cudaErrchk(hipEventSynchronize(stop_event));
   float ms;
-  cudaErrchk(cudaEventElapsedTime(&ms, start_event, stop_event));
+  cudaErrchk(hipEventElapsedTime(&ms, start_event, stop_event));
   elapsed_t_ms += ms;
 }
 
diff --git a/src/gpu-utils/gpu_common.hpp b/src/gpu-utils/gpu_common.hpp
index b4b094c..2ebe6f3 100644
--- a/src/gpu-utils/gpu_common.hpp
+++ b/src/gpu-utils/gpu_common.hpp
@@ -1,3 +1,4 @@
+#include "hip/hip_runtime.h"
 /*
  HipMer v 2.0, Copyright (c) 2020, The Regents of the University of California,
  through Lawrence Berkeley National Laboratory (subject to receipt of any required
@@ -44,8 +45,8 @@
 
 #include <iostream>
 #include <chrono>
-#include <cuda_runtime_api.h>
-#include <cuda.h>
+//#include <hip/hip_runtime_api.h>
+//#include <hip/hip_runtime.h>
 
 // Functions that are common to all cuda code; not to be used by upcxx code
 
@@ -71,7 +72,7 @@ static __constant__ uint64_t GPU_TWINS[256] = {
     0xC8, 0x88, 0x48, 0x08, 0xF4, 0xB4, 0x74, 0x34, 0xE4, 0xA4, 0x64, 0x24, 0xD4, 0x94, 0x54, 0x14, 0xC4, 0x84, 0x44, 0x04,
     0xF0, 0xB0, 0x70, 0x30, 0xE0, 0xA0, 0x60, 0x20, 0xD0, 0x90, 0x50, 0x10, 0xC0, 0x80, 0x40, 0x00};
 
-void gpu_die(cudaError_t code, const char *file, int line, bool abort = true);
+void gpu_die(hipError_t code, const char *file, int line, bool abort = true);
 
 using timepoint_t = std::chrono::time_point<std::chrono::high_resolution_clock>;
 
@@ -88,7 +89,7 @@ class QuickTimer {
 };
 
 class GPUTimer {
-  cudaEvent_t start_event, stop_event;
+  hipEvent_t start_event, stop_event;
   float elapsed_t_ms = 0;
 
  public:
@@ -101,8 +102,8 @@ class GPUTimer {
 
 inline __device__ int warpReduceSum(int val, int n) {
   unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x;
-  unsigned mask = __ballot_sync(0xffffffff, threadid < n);
-  for (int offset = warpSize / 2; offset > 0; offset /= 2) val += __shfl_down_sync(mask, val, offset);
+  unsigned mask = __ballot(threadid < n); /*JC*/
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) val += __shfl_down(val, offset); /* HIP: __shfl_down(var, delta[, width]) has no mask argument */
   return val;
 }
 
@@ -134,7 +135,7 @@ template <class T>
 inline void get_kernel_config(unsigned max_val, T func, int &gridsize, int &threadblocksize) {
   int mingridsize = 0;
   threadblocksize = 0;  // 1024
-  cudaErrchk(cudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, func, 0, 0));
+  cudaErrchk(hipOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, func, 0, 0));
   gridsize = (max_val + threadblocksize - 1) / threadblocksize;
 }
 
diff --git a/src/gpu-utils/gpu_utils.cpp b/src/gpu-utils/gpu_utils.cpp
index 402997a..8cb35a2 100644
--- a/src/gpu-utils/gpu_utils.cpp
+++ b/src/gpu-utils/gpu_utils.cpp
@@ -45,8 +45,8 @@
 #include <chrono>
 #include <array>
 #include <iomanip>
-#include <cuda_runtime_api.h>
-#include <cuda.h>
+//#include <hip/hip_runtime_api.h>
+//#include <hip/hip_runtime.h>
 
 #include "gpu_utils.hpp"
 #include "upcxx_utils/colors.h"
@@ -59,8 +59,8 @@ static int _rank_me = -1;
 
 static int get_gpu_device_count() {
   if (!_device_count) {
-    auto res = cudaGetDeviceCount(&_device_count);
-    if (res != cudaSuccess) return 0;
+    auto res = hipGetDeviceCount(&_device_count);
+    if (res != hipSuccess) return 0;
   }
   return _device_count;
 }
@@ -71,27 +71,27 @@ void gpu_utils::set_gpu_device(int rank_me) {
     exit(1);
   }
   int num_devs = get_gpu_device_count();
-  cudaErrchk(cudaSetDevice(rank_me % num_devs));
+  cudaErrchk(hipSetDevice(rank_me % num_devs));
 }
 
 size_t gpu_utils::get_gpu_tot_mem() {
   set_gpu_device(_rank_me);
-  cudaDeviceProp prop;
-  cudaErrchk(cudaGetDeviceProperties(&prop, 0));
+  hipDeviceProp_t prop;
+  cudaErrchk(hipGetDeviceProperties(&prop, 0));
   return prop.totalGlobalMem;
 }
 
 size_t gpu_utils::get_gpu_avail_mem() {
   set_gpu_device(_rank_me);
   size_t free_mem, tot_mem;
-  cudaErrchk(cudaMemGetInfo(&free_mem, &tot_mem));
+  cudaErrchk(hipMemGetInfo(&free_mem, &tot_mem));
   return free_mem;
 }
 
 string gpu_utils::get_gpu_device_name() {
   set_gpu_device(_rank_me);
-  cudaDeviceProp prop;
-  cudaErrchk(cudaGetDeviceProperties(&prop, 0));
+  hipDeviceProp_t prop;
+  cudaErrchk(hipGetDeviceProperties(&prop, 0));
   return prop.name;
 }
 
@@ -107,8 +107,8 @@ vector<string> gpu_utils::get_gpu_uuids() {
   vector<string> uuids;
   int num_devs = get_gpu_device_count();
   for (int i = 0; i < num_devs; ++i) {
-    cudaDeviceProp prop;
-    cudaErrchk(cudaGetDeviceProperties(&prop, i));
+    hipDeviceProp_t prop;
+    cudaErrchk(hipGetDeviceProperties(&prop, i));
 #if (CUDA_VERSION >= 10000)
     uuids.push_back(get_uuid_str(prop.uuid.bytes));
 #else
@@ -136,18 +136,18 @@ void gpu_utils::initialize_gpu(double& time_to_initialize, int rank_me) {
   if (!gpus_present()) return;
   _rank_me = rank_me;
   set_gpu_device(_rank_me);
-  cudaErrchk(cudaDeviceReset());
+  cudaErrchk(hipDeviceReset());
   elapsed = chrono::high_resolution_clock::now() - t;
   time_to_initialize = elapsed.count();
 }
 
 string gpu_utils::get_gpu_device_descriptions() {
-  cudaDeviceProp prop;
+  hipDeviceProp_t prop;
   int num_devs = get_gpu_device_count();
   ostringstream os;
   os << "Number of GPU devices visible: " << num_devs << "\n";
   for (int i = 0; i < num_devs; ++i) {
-    cudaErrchk(cudaGetDeviceProperties(&prop, i));
+    cudaErrchk(hipGetDeviceProperties(&prop, i));
 
     os << "GPU Device number: " << i << "\n";
     os << "  Device name: " << prop.name << "\n";
diff --git a/src/mhm2.py b/src/mhm2.py
index 5b929c7..8386c3e 100755
--- a/src/mhm2.py
+++ b/src/mhm2.py
@@ -475,7 +475,7 @@ def main():
         halfnoderanks += ',' + str(n*cores) + ',' + str(n*cores+cores/2)
 
     # set extra GASNET environments from build and/or options to mhm2.py
-    runtime_vars = """@MHM2PY_RUNTIME_ENV@"""
+    runtime_vars = """"""
     if runtime_vars == '@MHM2PY_RUNTIME' + '_ENV@':
         runtime_vars = ''
     runtime_output_vars = ''

From 6ada10f4d140b611b80f289953d89681e6cd5c4b Mon Sep 17 00:00:00 2001
From: Jan Ciesko <jan.ciesko@gmail.com>
Date: Wed, 20 Mar 2024 12:12:24 -0700
Subject: [PATCH 12/13] Add hip specific cmake build flags

---
 src/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 859805d..f970d64 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -159,7 +159,7 @@ if(ENABLE_HIP)
   set(CMAKE_HIP_PTX_COMPILATION ON)
 
   set(CMAKE_HIP_FLAGS
-      "${CMAKE_HIP_FLAGS} ${ARCH_FLAGS} -Wno-deprecated-gpu-targets"
+      "${CMAKE_HIP_FLAGS} ${ARCH_FLAGS} -Wno-deprecated-gpu-targets -fgpu-rdc --hip-link"
   )
 
   message(STATUS "CMAKE_HIP_FLAGS=${CMAKE_HIP_FLAGS}")

From 51af4f83c5e37268a7f5d54c1bbb8fe091262387 Mon Sep 17 00:00:00 2001
From: Amy Powell <ajpowel@sandia.gov>
Date: Wed, 20 Mar 2024 14:30:06 -0700
Subject: [PATCH 13/13] update is_ready() to ready();

---
 .../include/upcxx_utils/flat_aggr_store.hpp        |  2 +-
 .../include/upcxx_utils/three_tier_aggr_store.hpp  |  2 +-
 upcxx-utils/src/limit_outstanding.cpp              | 14 +++++++-------
 upcxx-utils/src/ofstream.cpp                       |  2 +-
 upcxx-utils/src/promise_collectives.cpp            |  4 ++--
 upcxx-utils/src/reduce_prefix.cpp                  |  8 ++++----
 6 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp b/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp
index b96e2ad..dc11e3a 100644
--- a/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp
+++ b/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp
@@ -466,7 +466,7 @@ class FlatAggrStore {
       do {
        
         fut = limit_outstanding_futures(fut);
-      } while (!fut.is_ready());
+      } while (!fut.ready());
     }
 
     CountType max_vals[2], sum_vals[2];
diff --git a/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp b/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp
index c4f8b43..a60f89f 100644
--- a/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp
+++ b/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp
@@ -1073,7 +1073,7 @@ class ThreeTierAggrStore : public FlatAggrStore<T, Data...> {
         do {
           
           fut = limit_outstanding_futures(fut);
-        } while (!fut.is_ready());
+        } while (!fut.ready());
       }
     }
     auto fut_done = flush_outstanding_futures_async();
diff --git a/upcxx-utils/src/limit_outstanding.cpp b/upcxx-utils/src/limit_outstanding.cpp
index e65d063..ed08bf9 100644
--- a/upcxx-utils/src/limit_outstanding.cpp
+++ b/upcxx-utils/src/limit_outstanding.cpp
@@ -23,7 +23,7 @@ upcxx::future<> upcxx_utils::collapse_outstanding_futures(int limit, LimitedFutu
     while (outstanding_queue.size() > limit) {
       auto fut = outstanding_queue.front();
       outstanding_queue.pop_front();
-      if (!fut.is_ready()) returned_future = upcxx::when_all(fut, returned_future);
+      if (!fut.ready()) returned_future = upcxx::when_all(fut, returned_future);
     }
     DBG("limit=", limit, " outstanding=", outstanding_queue.size(), " max_check=", max_check, "\n");
     if (limit == 0) {
@@ -31,24 +31,24 @@ upcxx::future<> upcxx_utils::collapse_outstanding_futures(int limit, LimitedFutu
     } else {
       assert(outstanding_queue.size() <= limit);
       int i = 0;
-      while (i < max_check && !returned_future.is_ready() && i < outstanding_queue.size()) {
+      while (i < max_check && !returned_future.ready() && i < outstanding_queue.size()) {
         // find a ready future in the queue to swap with
         auto &test_fut = outstanding_queue[i++];
-        if (test_fut.is_ready()) {
+        if (test_fut.ready()) {
           std::swap(test_fut, returned_future);
-          assert(returned_future.is_ready());
+          assert(returned_future.ready());
           break;
         }
       }
     }
   }
-  DBG("limit=", limit, " outstanding=", outstanding_queue.size(), " max_check=", max_check, ", ret=", returned_future.is_ready(),
+  DBG("limit=", limit, " outstanding=", outstanding_queue.size(), " max_check=", max_check, ", ret=", returned_future.ready(),
       "\n");
   return returned_future;
 }
 
 void upcxx_utils::add_outstanding_future(upcxx::future<> fut, LimitedFutureQueue &outstanding_queue) {
-  if (!fut.is_ready()) outstanding_queue.push_back(fut);
+  if (!fut.ready()) outstanding_queue.push_back(fut);
 }
 
 upcxx::future<> upcxx_utils::limit_outstanding_futures(int limit, LimitedFutureQueue &outstanding_queue) {
@@ -62,7 +62,7 @@ upcxx::future<> upcxx_utils::limit_outstanding_futures(upcxx::future<> fut, int
     if (outstanding_queue.empty()) return fut;
     return upcxx::when_all(collapse_outstanding_futures(limit, outstanding_queue), fut);
   }
-  if (fut.is_ready()) {
+  if (fut.ready()) {
     if (outstanding_queue.size() <= limit) return fut;
   } else {
     outstanding_queue.push_back(fut);
diff --git a/upcxx-utils/src/ofstream.cpp b/upcxx-utils/src/ofstream.cpp
index 784820f..cd311ac 100644
--- a/upcxx-utils/src/ofstream.cpp
+++ b/upcxx-utils/src/ofstream.cpp
@@ -841,7 +841,7 @@ dist_ofstream::~dist_ofstream() {
   if (!is_closed) close();
   assert(is_closed);
   stringstream().swap(ss);
-  DBG_VERBOSE("close_fut=", close_fut.is_ready(), "\n");
+  DBG_VERBOSE("close_fut=", close_fut.ready(), "\n");
 }
 
 void dist_ofstream::close() {
diff --git a/upcxx-utils/src/promise_collectives.cpp b/upcxx-utils/src/promise_collectives.cpp
index 9f059cb..fd9ee7d 100644
--- a/upcxx-utils/src/promise_collectives.cpp
+++ b/upcxx-utils/src/promise_collectives.cpp
@@ -115,14 +115,14 @@ upcxx_utils::PromiseBarrier::~PromiseBarrier() {
   DBG_VERBOSE("Destroy this=", this, " move=", moved, "\n");
   if (moved) return;  // invalidated
   assert(upcxx::master_persona().active_with_caller());
-  assert(dist_workflow->initiated_prom.get_future().is_ready());
+  assert(dist_workflow->initiated_prom.get_future().ready());
   get_future().wait();
 }
 
 void upcxx_utils::PromiseBarrier::fulfill() const {
   DBG_VERBOSE("fulfill this=", this, "\n");
   assert(upcxx::master_persona().active_with_caller());
-  assert(!dist_workflow->initiated_prom.get_future().is_ready());
+  assert(!dist_workflow->initiated_prom.get_future().ready());
   dist_workflow->initiated_prom.fulfill_anonymous(1);
 }
 
diff --git a/upcxx-utils/src/reduce_prefix.cpp b/upcxx-utils/src/reduce_prefix.cpp
index 0ca447e..ecd0aa6 100644
--- a/upcxx-utils/src/reduce_prefix.cpp
+++ b/upcxx-utils/src/reduce_prefix.cpp
@@ -124,8 +124,8 @@ future<> binary_tree_steps::get_future() const {
 // up phase is done
 
 bool binary_tree_steps::up_ready() const {
-  return dst_is_partial_left_me.get_future().is_ready() && scratch_is_partial_right.get_future().is_ready() &&
-         scratch_is_partial_to_parent.get_future().is_ready() && sent_partial_to_parent.get_future().is_ready();
+  return dst_is_partial_left_me.get_future().ready() && scratch_is_partial_right.get_future().ready() &&
+         scratch_is_partial_to_parent.get_future().ready() && sent_partial_to_parent.get_future().ready();
 }
 
 future<> binary_tree_steps::get_up_future() const {
@@ -135,8 +135,8 @@ future<> binary_tree_steps::get_up_future() const {
 // down phase is done
 
 bool binary_tree_steps::down_ready() const {
-  return scratch_is_partial_from_parent.get_future().is_ready() && sent_left_child.get_future().is_ready() &&
-         sent_right_child.get_future().is_ready();
+  return scratch_is_partial_from_parent.get_future().ready() && sent_left_child.get_future().ready() &&
+         sent_right_child.get_future().ready();
 }
 
 future<> binary_tree_steps::get_down_future() const {