From cc127b2c052c65bf9713820700a5dc0f0b8c2326 Mon Sep 17 00:00:00 2001 From: Jan Ciesko Date: Mon, 18 Dec 2023 15:51:24 -0700 Subject: [PATCH 01/13] Use future::is_ready (drops depracated) --- .../include/upcxx_utils/flat_aggr_store.hpp | 4 +- install/include/upcxx_utils/gather.hpp | 10 +- install/include/upcxx_utils/reduce_prefix.hpp | 16 +- install/include/upcxx_utils/thread_pool.hpp | 2 +- .../upcxx_utils/three_tier_aggr_store.hpp | 4 +- .../upcxx_utils/two_tier_aggr_store.hpp | 10 +- src/fastq.cpp | 6 +- src/kcount/kcount_gpu.cpp | 2 +- .../include/upcxx_utils/flat_aggr_store.hpp | 2 +- upcxx-utils/include/upcxx_utils/gather.hpp | 10 +- .../include/upcxx_utils/reduce_prefix.hpp | 16 +- .../include/upcxx_utils/thread_pool.hpp | 2 +- .../upcxx_utils/three_tier_aggr_store.hpp | 2 +- .../upcxx_utils/two_tier_aggr_store.hpp | 1962 +++++++++++++++++ upcxx-utils/src/limit_outstanding.cpp | 14 +- upcxx-utils/src/ofstream.cpp | 2 +- upcxx-utils/src/promise_collectives.cpp | 4 +- upcxx-utils/src/reduce_prefix.cpp | 8 +- upcxx-utils/src/timers.cpp | 698 ++++++ upcxx-utils/test/test_ofstream.cpp | 2 +- upcxx-utils/test/test_promise_collectives.cpp | 44 +- 21 files changed, 2740 insertions(+), 80 deletions(-) create mode 100644 upcxx-utils/include/upcxx_utils/two_tier_aggr_store.hpp create mode 100644 upcxx-utils/src/timers.cpp diff --git a/install/include/upcxx_utils/flat_aggr_store.hpp b/install/include/upcxx_utils/flat_aggr_store.hpp index 16bb3d7..a6b9f5d 100644 --- a/install/include/upcxx_utils/flat_aggr_store.hpp +++ b/install/include/upcxx_utils/flat_aggr_store.hpp @@ -469,7 +469,7 @@ class FlatAggrStore { do { rpc_counts->progress_timer.progress(); // call progress after firing a rpc fut = limit_outstanding_futures(fut); - } while (!fut.ready()); + } while (!fut.is_ready()); } CountType max_vals[2], sum_vals[2]; @@ -478,7 +478,7 @@ class FlatAggrStore { DBG("flush_updates() waiting for counts\n"); auto fut_done = flush_outstanding_futures_async(); - while 
(!fut_done.ready()) { + while (!fut_done.is_ready()) { rpc_counts->progress_timer.discharge(); } diff --git a/install/include/upcxx_utils/gather.hpp b/install/include/upcxx_utils/gather.hpp index 521e99e..812cfc7 100644 --- a/install/include/upcxx_utils/gather.hpp +++ b/install/include/upcxx_utils/gather.hpp @@ -225,7 +225,7 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu if (is_sending) { DBG_VERBOSE("is sending level=", level, "\n"); assert(!is_receiving && "sending is not also receiving"); - assert(!workflow.prom_buffer_filled.get_future().ready() && "sending buffer has not been filled before workflow is prepared"); + assert(!workflow.prom_buffer_filled.get_future().is_ready() && "sending buffer has not been filled before workflow is prepared"); if (!have_received) { assert(send_buf); assert(send_count != 0); @@ -280,11 +280,11 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu DBG_VERBOSE("Sent rpc to dest_rrank=", dest_rrank, ", dest_rank=", dest_rank, " rrank=", rrank, " level=", level, " count=", sending_size, " ", get_size_str(sending_size * sizeof(T)), "\n"); // make buffer available for next level (may never be needed but steps workflow to completion) - assert(!workflow.prom_buffer_filled.get_future().ready() && "sending buffer that was just used has not been filled yet"); + assert(!workflow.prom_buffer_filled.get_future().is_ready() && "sending buffer that was just used has not been filled yet"); workflow.prom_buffer_filled.fulfill_anonymous(1); }); if (!have_received) - assert(fut.ready() && + assert(fut.is_ready() && "first sending level always immediately executes ensuring send_buf is ready for reuse on exit of binomial_gather"); have_sent = true; // not done until I have sent my message @@ -295,7 +295,7 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu if (!is_sending & !is_receiving) { // some ranks on some levels have nothing to do but make the 
buffer available DBG_VERBOSE("idle level=", level, "\n"); - assert(!workflow.prom_buffer_filled.get_future().ready()); + assert(!workflow.prom_buffer_filled.get_future().is_ready()); workflow.prom_buffer_filled.fulfill_anonymous(1); if (!have_received && !have_sent) workflow.prom_buffer.fulfill_result(ShBuffer()); } @@ -309,7 +309,7 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu LevelWorkflow& next = dist_workflows->level(level + 1); DBG_VERBOSE("Setting buffer for next level: ", level + 1, " size=", sh_buf->size(), " cap=", sh_buf->capacity(), "\n"); - if (!next.prom_buffer.get_future().ready()) { + if (!next.prom_buffer.get_future().is_ready()) { // fulfill buffer for next level on this rank for it to send or recv next.prom_buffer.fulfill_result(std::move(sh_buf)); } else { diff --git a/install/include/upcxx_utils/reduce_prefix.hpp b/install/include/upcxx_utils/reduce_prefix.hpp index f7188be..04a33ae 100644 --- a/install/include/upcxx_utils/reduce_prefix.hpp +++ b/install/include/upcxx_utils/reduce_prefix.hpp @@ -425,7 +425,7 @@ upcxx::future<> reduce_prefix_binary_tree_up(ShDistData sh_dist_dat // scratch has partial_to_parent from right (j+1 ... rr) if there is a right // if there is a left child, dst already has applied from left (ll ... j) // calculate partial_to_parent as (ll ... rr) in scratch. - assert(proms.scratch_is_partial_right.get_future().ready()); + assert(proms.scratch_is_partial_right.get_future().is_ready()); if (my_node.right < my_node.n) { assert(!sh_scratch->empty()); @@ -433,7 +433,7 @@ upcxx::future<> reduce_prefix_binary_tree_up(ShDistData sh_dist_dat T *partial_right = sh_scratch->data(); T *partial_left_right = sh_scratch->data(); - assert(proms.dst_is_partial_left_me.get_future().ready()); + assert(proms.dst_is_partial_left_me.get_future().is_ready()); const T *partial_left_me = my_node.left < my_node.me ? 
dst : src; const T *send_to_parent = partial_left_me; @@ -512,11 +512,11 @@ upcxx::future<> reduce_prefix_binary_tree_down(ShDistData sh_dist_d } // check that upstage is completed - assert(proms.ready_for_up.get_future().ready()); - assert(proms.dst_is_partial_left_me.get_future().ready()); - assert(proms.scratch_is_partial_right.get_future().ready()); - assert(proms.scratch_is_partial_to_parent.get_future().ready()); - assert(proms.sent_partial_to_parent.get_future().ready()); + assert(proms.ready_for_up.get_future().is_ready()); + assert(proms.dst_is_partial_left_me.get_future().is_ready()); + assert(proms.scratch_is_partial_right.get_future().is_ready()); + assert(proms.scratch_is_partial_to_parent.get_future().is_ready()); + assert(proms.sent_partial_to_parent.get_future().is_ready()); // step 4 down // receive from parent @@ -555,7 +555,7 @@ upcxx::future<> reduce_prefix_binary_tree_down(ShDistData sh_dist_d rpcs_sent = rpcs_sent.then( [sh_dist_data, dst = dst, count = count, sh_scratch = sh_scratch, child, my_node = my_node, &proms, &team]() { assert(proms.up_ready()); - assert(proms.scratch_is_partial_from_parent.get_future().ready()); + assert(proms.scratch_is_partial_from_parent.get_future().is_ready()); const T *send_data; if (child < my_node.me) { // relay just a copy from my parent (0 ... ll-1) diff --git a/install/include/upcxx_utils/thread_pool.hpp b/install/include/upcxx_utils/thread_pool.hpp index 8b27992..864c06f 100644 --- a/install/include/upcxx_utils/thread_pool.hpp +++ b/install/include/upcxx_utils/thread_pool.hpp @@ -94,7 +94,7 @@ class ThreadPool { template static upcxx::future<> &enqueue_in_single_pool_serially(upcxx::future<> &serial_fut, Func &&func, Args &&... args) { assert(upcxx::master_persona().active_with_caller() && "Called from master persona"); - DBG_VERBOSE("enqueue_in_single_pool_serially: ", &serial_fut, " ", (serial_fut.ready() ? 
"ready" : "NOT READY"), "\n"); + DBG_VERBOSE("enqueue_in_single_pool_serially: ", &serial_fut, " ", (serial_fut.is_ready() ? "ready" : "NOT READY"), "\n"); using return_t = typename std::invoke_result::type; static_assert(std::is_void::value, "void is the required return type for enqueue_in_serial_pool"); diff --git a/install/include/upcxx_utils/three_tier_aggr_store.hpp b/install/include/upcxx_utils/three_tier_aggr_store.hpp index c48c473..232ed78 100644 --- a/install/include/upcxx_utils/three_tier_aggr_store.hpp +++ b/install/include/upcxx_utils/three_tier_aggr_store.hpp @@ -1080,11 +1080,11 @@ class ThreeTierAggrStore : public FlatAggrStore { do { tt_rpc_counts->progress_timer.progress(); // call progress after firing a rpc fut = limit_outstanding_futures(fut); - } while (!fut.ready()); + } while (!fut.is_ready()); } } auto fut_done = flush_outstanding_futures_async(); - while (!fut_done.ready()) { + while (!fut_done.is_ready()) { tt_rpc_counts->progress_timer.discharge(); } diff --git a/install/include/upcxx_utils/two_tier_aggr_store.hpp b/install/include/upcxx_utils/two_tier_aggr_store.hpp index 25be1b8..663a39d 100644 --- a/install/include/upcxx_utils/two_tier_aggr_store.hpp +++ b/install/include/upcxx_utils/two_tier_aggr_store.hpp @@ -503,7 +503,7 @@ class FixedMemoryRPC { dest_stores.reserve(num_stores); for (int i = 0; i < num_stores; i++) { auto fut = global_dispatcher.pop(); - if (!fut.ready()) { + if (!fut.is_ready()) { DIE("Detected a global block that is not ready! 
i=", i, " available_size=", global_dispatcher.available_size(), "\n"); } dest_stores.push_back(fut.wait()); @@ -1121,7 +1121,7 @@ class TwoTierAggrStore { assert(store_block.first); assert(store_block.first.where() == rank_me()); auto lblock_fut = inter_fixed_mem->pop_global(true); - assert(lblock_fut.ready()); + assert(lblock_fut.is_ready()); auto lblock = lblock_fut.result(); assert(lblock.first); assert(lblock.first.where() == rank_me()); @@ -1320,7 +1320,7 @@ class TwoTierAggrStore { } else { assert(!gblock.first); auto fut = replace_intra_store(gblock, intra_fixed_mem); - if (!fut.ready()) DBG(__func__, " will wait\n"); + if (!fut.is_ready()) DBG(__func__, " will wait\n"); fut.wait(); } assert(gblock.first); @@ -1403,7 +1403,7 @@ class TwoTierAggrStore { assert(gblock.second > 0); auto fut = update_remote_inter_nb(target_rank, gblock); DBG(__func__, " my_progress\n"); - if (!fut.ready()) { + if (!fut.is_ready()) { DBG(__func__, " still waiting on inter dest store\n"); } fut.wait(); @@ -1420,7 +1420,7 @@ class TwoTierAggrStore { gblock = {}; // invalidate it auto fut = replace_inter_store(gblock, inter_fixed_memory_store); send_inter_rpc(split_rank::get_rank_from_node(node), sendBlock); // send to dedicated rank on remote node - if (!fut.ready()) DBG("intra dest store is not immediately ready\n"); + if (!fut.is_ready()) DBG("intra dest store is not immediately ready\n"); return fut; } diff --git a/src/fastq.cpp b/src/fastq.cpp index 3697abc..231b21a 100644 --- a/src/fastq.cpp +++ b/src/fastq.cpp @@ -484,7 +484,7 @@ void FastqReader::seek() { } FastqReader::~FastqReader() { - if (!open_fut.ready()) { + if (!open_fut.is_ready()) { WARN("Destructor called before opening completed\n"); open_fut.wait(); } @@ -502,7 +502,7 @@ string FastqReader::get_fname() { return fname; } size_t FastqReader::my_file_size() { return end_read - start_read + (fqr2 ? 
fqr2->my_file_size() : 0); } size_t FastqReader::get_next_fq_record(string &id, string &seq, string &quals, bool wait_open) { - if (wait_open && !open_fut.ready()) { + if (wait_open && !open_fut.is_ready()) { WARN("Attempt to read ", fname, " before it is ready. wait on open_fut first to avoid this warning!\n"); open_fut.wait(); } @@ -554,7 +554,7 @@ int FastqReader::get_max_read_len() { return std::max(max_read_len, fqr2 ? fqr2- void FastqReader::reset() { - if (!open_fut.ready()) { + if (!open_fut.is_ready()) { open_fut.wait(); } if (!f) { diff --git a/src/kcount/kcount_gpu.cpp b/src/kcount/kcount_gpu.cpp index d603938..c1e5da0 100644 --- a/src/kcount/kcount_gpu.cpp +++ b/src/kcount/kcount_gpu.cpp @@ -101,7 +101,7 @@ static void process_block(SeqBlockInserter *seq_block_inserter, dist_obje state->num_block_calls++; future fut = execute_in_thread_pool( [&state, &num_valid_kmers] { return state->pnp_gpu_driver->process_seq_block(state->seq_block, num_valid_kmers); }); - while (!fut.ready()) { + while (!fut.is_ready()) { state->num_pnp_gpu_waits++; progress(); } diff --git a/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp b/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp index dc11e3a..b96e2ad 100644 --- a/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp +++ b/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp @@ -466,7 +466,7 @@ class FlatAggrStore { do { fut = limit_outstanding_futures(fut); - } while (!fut.ready()); + } while (!fut.is_ready()); } CountType max_vals[2], sum_vals[2]; diff --git a/upcxx-utils/include/upcxx_utils/gather.hpp b/upcxx-utils/include/upcxx_utils/gather.hpp index 521e99e..812cfc7 100644 --- a/upcxx-utils/include/upcxx_utils/gather.hpp +++ b/upcxx-utils/include/upcxx_utils/gather.hpp @@ -225,7 +225,7 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu if (is_sending) { DBG_VERBOSE("is sending level=", level, "\n"); assert(!is_receiving && "sending is not also receiving"); - 
assert(!workflow.prom_buffer_filled.get_future().ready() && "sending buffer has not been filled before workflow is prepared"); + assert(!workflow.prom_buffer_filled.get_future().is_ready() && "sending buffer has not been filled before workflow is prepared"); if (!have_received) { assert(send_buf); assert(send_count != 0); @@ -280,11 +280,11 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu DBG_VERBOSE("Sent rpc to dest_rrank=", dest_rrank, ", dest_rank=", dest_rank, " rrank=", rrank, " level=", level, " count=", sending_size, " ", get_size_str(sending_size * sizeof(T)), "\n"); // make buffer available for next level (may never be needed but steps workflow to completion) - assert(!workflow.prom_buffer_filled.get_future().ready() && "sending buffer that was just used has not been filled yet"); + assert(!workflow.prom_buffer_filled.get_future().is_ready() && "sending buffer that was just used has not been filled yet"); workflow.prom_buffer_filled.fulfill_anonymous(1); }); if (!have_received) - assert(fut.ready() && + assert(fut.is_ready() && "first sending level always immediately executes ensuring send_buf is ready for reuse on exit of binomial_gather"); have_sent = true; // not done until I have sent my message @@ -295,7 +295,7 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu if (!is_sending & !is_receiving) { // some ranks on some levels have nothing to do but make the buffer available DBG_VERBOSE("idle level=", level, "\n"); - assert(!workflow.prom_buffer_filled.get_future().ready()); + assert(!workflow.prom_buffer_filled.get_future().is_ready()); workflow.prom_buffer_filled.fulfill_anonymous(1); if (!have_received && !have_sent) workflow.prom_buffer.fulfill_result(ShBuffer()); } @@ -309,7 +309,7 @@ upcxx::future<> binomial_gather(const T* send_buf, size_t send_count, T* dest_bu LevelWorkflow& next = dist_workflows->level(level + 1); DBG_VERBOSE("Setting buffer for next level: ", level + 1, " 
size=", sh_buf->size(), " cap=", sh_buf->capacity(), "\n"); - if (!next.prom_buffer.get_future().ready()) { + if (!next.prom_buffer.get_future().is_ready()) { // fulfill buffer for next level on this rank for it to send or recv next.prom_buffer.fulfill_result(std::move(sh_buf)); } else { diff --git a/upcxx-utils/include/upcxx_utils/reduce_prefix.hpp b/upcxx-utils/include/upcxx_utils/reduce_prefix.hpp index f7188be..04a33ae 100644 --- a/upcxx-utils/include/upcxx_utils/reduce_prefix.hpp +++ b/upcxx-utils/include/upcxx_utils/reduce_prefix.hpp @@ -425,7 +425,7 @@ upcxx::future<> reduce_prefix_binary_tree_up(ShDistData sh_dist_dat // scratch has partial_to_parent from right (j+1 ... rr) if there is a right // if there is a left child, dst already has applied from left (ll ... j) // calculate partial_to_parent as (ll ... rr) in scratch. - assert(proms.scratch_is_partial_right.get_future().ready()); + assert(proms.scratch_is_partial_right.get_future().is_ready()); if (my_node.right < my_node.n) { assert(!sh_scratch->empty()); @@ -433,7 +433,7 @@ upcxx::future<> reduce_prefix_binary_tree_up(ShDistData sh_dist_dat T *partial_right = sh_scratch->data(); T *partial_left_right = sh_scratch->data(); - assert(proms.dst_is_partial_left_me.get_future().ready()); + assert(proms.dst_is_partial_left_me.get_future().is_ready()); const T *partial_left_me = my_node.left < my_node.me ? 
dst : src; const T *send_to_parent = partial_left_me; @@ -512,11 +512,11 @@ upcxx::future<> reduce_prefix_binary_tree_down(ShDistData sh_dist_d } // check that upstage is completed - assert(proms.ready_for_up.get_future().ready()); - assert(proms.dst_is_partial_left_me.get_future().ready()); - assert(proms.scratch_is_partial_right.get_future().ready()); - assert(proms.scratch_is_partial_to_parent.get_future().ready()); - assert(proms.sent_partial_to_parent.get_future().ready()); + assert(proms.ready_for_up.get_future().is_ready()); + assert(proms.dst_is_partial_left_me.get_future().is_ready()); + assert(proms.scratch_is_partial_right.get_future().is_ready()); + assert(proms.scratch_is_partial_to_parent.get_future().is_ready()); + assert(proms.sent_partial_to_parent.get_future().is_ready()); // step 4 down // receive from parent @@ -555,7 +555,7 @@ upcxx::future<> reduce_prefix_binary_tree_down(ShDistData sh_dist_d rpcs_sent = rpcs_sent.then( [sh_dist_data, dst = dst, count = count, sh_scratch = sh_scratch, child, my_node = my_node, &proms, &team]() { assert(proms.up_ready()); - assert(proms.scratch_is_partial_from_parent.get_future().ready()); + assert(proms.scratch_is_partial_from_parent.get_future().is_ready()); const T *send_data; if (child < my_node.me) { // relay just a copy from my parent (0 ... ll-1) diff --git a/upcxx-utils/include/upcxx_utils/thread_pool.hpp b/upcxx-utils/include/upcxx_utils/thread_pool.hpp index cc92248..c04ec48 100644 --- a/upcxx-utils/include/upcxx_utils/thread_pool.hpp +++ b/upcxx-utils/include/upcxx_utils/thread_pool.hpp @@ -94,7 +94,7 @@ class ThreadPool { template static upcxx::future<> &enqueue_in_single_pool_serially(upcxx::future<> &serial_fut, Func &&func, Args &&... args) { assert(upcxx::master_persona().active_with_caller() && "Called from master persona"); - DBG_VERBOSE("enqueue_in_single_pool_serially: ", &serial_fut, " ", (serial_fut.ready() ? 
"ready" : "NOT READY"), "\n"); + DBG_VERBOSE("enqueue_in_single_pool_serially: ", &serial_fut, " ", (serial_fut.is_ready() ? "ready" : "NOT READY"), "\n"); using return_t = typename std::invoke_result::type; static_assert(std::is_void::value, "void is the required return type for enqueue_in_serial_pool"); diff --git a/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp b/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp index a60f89f..c4f8b43 100644 --- a/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp +++ b/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp @@ -1073,7 +1073,7 @@ class ThreeTierAggrStore : public FlatAggrStore { do { fut = limit_outstanding_futures(fut); - } while (!fut.ready()); + } while (!fut.is_ready()); } } auto fut_done = flush_outstanding_futures_async(); diff --git a/upcxx-utils/include/upcxx_utils/two_tier_aggr_store.hpp b/upcxx-utils/include/upcxx_utils/two_tier_aggr_store.hpp new file mode 100644 index 0000000..663a39d --- /dev/null +++ b/upcxx-utils/include/upcxx_utils/two_tier_aggr_store.hpp @@ -0,0 +1,1962 @@ +#pragma once + +#include +#include +#include +#include + +#include "log.hpp" +#include "split_rank.hpp" +#include "timers.hpp" +#include "version.h" + +using std::list; +using std::make_shared; +using std::ostream; +using std::ostringstream; +using std::pair; +using std::shared_ptr; +using std::string; +using std::vector; + +using upcxx::barrier; +using upcxx::dist_object; +using upcxx::global_ptr; +using upcxx::intrank_t; +using upcxx::make_future; +using upcxx::make_view; +using upcxx::op_fast_add; +using upcxx::op_fast_max; +using upcxx::progress; +using upcxx::rank_me; +using upcxx::rank_n; +using upcxx::reduce_all; +using upcxx::reduce_one; +using upcxx::rget; +using upcxx::rpc; +using upcxx::to_future; +using upcxx::view; +using upcxx::when_all; + +// this class aggregates updates into local buffers and then periodically does an rpc to dispatch them + +#ifdef DEBUG +#define 
DEBUG_MINIMAL_STORE +#endif + +#ifndef MAX_RPCS_IN_FLIGHT +#define MAX_RPCS_IN_FLIGHT 4096 +#endif + +#ifndef MIN_INFLIGHT_BYTES +#define MIN_INFLIGHT_BYTES (1024L * 1024L) /* always use at least 1MB in flight */ +#endif + +namespace upcxx_utils { + +template +class BlockDispatcher { + // does not allocate or deallocate memory, just handles pointer to it + // available are backed blocks that are empty and ready to be consumed + // promises is a strictly FIFO queue of blocks that will be fulfilled by a backed block + // pushing / poping from this queue is a signal for asynchronos processing at a later time when resources permit + // all methods are non-blocking, returning futures + // push() will fulfill outstanding promises before returning a block to the available heap + // only pop() can grow promises beyond a fixed size + // all blocks must be returned to available, with no promises before clear() can be called + public: + using ptr_t = T; + using block_t = pair; + using future_block_t = upcxx::future; + using promise_block_t = ActiveInstantiationTimer >; + + using blocks_t = vector; + using reservation_t = shared_ptr; + using promise_reservation_t = ActiveInstantiationTimer >; + + using promise_blocks_t = list; + using promise_reservations_t = list; + + private: + block_t backing; + size_t count_per_block, num_blocks, reservation_size; + blocks_t available, reservation; + promise_blocks_t promised_blocks; // may grow indefinitely, but elements are small + ActiveCountTimer promise_block_count_timer; + size_t promised_blocks_count; + promise_reservations_t promised_reservations; // may grow indefinitely, but elements are small + ActiveCountTimer promise_reservation_count_timer; + size_t promised_reservations_count; + + protected: + // drains the current reservation and returns the contents + reservation_t claim_reservation(bool require_full = true) { + // no need to get a lock as all methods already have a lock + if (require_full && reservation.size() != 
reservation_size) + DIE("claim_reservation is not full but this is required. ", reservation.size(), "\n"); + reservation_t reserved(new blocks_t()); + reserved->reserve(reservation_size); + reserved->swap(reservation); // claim and empty the current reservation + assert(reservation.size() == 0); + // repopulate the reservation from the available set + while (!available.empty() && reservation.size() < reservation_size) { + reservation.push_back(available.back()); + available.pop_back(); + } + assert(reserved); + return reserved; + } + + public: + string description; + BlockDispatcher(const string description) + : backing({}) + , count_per_block(0) + , num_blocks(0) + , reservation_size(0) + , available() + , reservation() + , promised_blocks() + , promise_block_count_timer(description + "-promised_blocks") + , promised_blocks_count(0) + , promised_reservations() + , promise_reservation_count_timer(description + "-promised_reservations") + , promised_reservations_count(0) + , description(description) {} + BlockDispatcher(const BlockDispatcher &) = delete; + BlockDispatcher(BlockDispatcher &&) = default; + virtual ~BlockDispatcher() { clear(); } + + // a valid dispatcher has an allocation and blocks + bool valid() const { + bool is_valid = (backing.first && backing.second > 0 && num_blocks > 0 && count_per_block > 1 && + num_blocks * count_per_block <= backing.second); + return is_valid; + } + + // checks if a block is backed by the allocation + bool is_backed(block_t &block) const { + return valid() && count_per_block > 0 && + (block.first && (backing.first <= block.first) && (backing.first + backing.second >= block.first + count_per_block)); + } + + // accepts a large block that will be divided into regular num blocks of count elements for dispatching + // all blocks are handed to push(), so they land in the reservation first and then the available heap, each with a zero count + void set(block_t allocation, size_t num, size_t count, size_t thread_offset = 0, size_t reservation_count = 0) { + if (valid()) DIE("set called 
on an already valid dispatcher. clear() MUST be called first\n"); + if (!available.empty()) DIE("set called with non-empty available heap\n"); + if (!promised_blocks.empty()) DIE("set called with non-empty promised_blocks queue\n"); + if (!promised_reservations.empty()) DIE("set called with non-empty promised_reservations queue\n"); + if (num != 0 && (!allocation.first || allocation.second <= 0)) DIE("set called with null backing\n"); + if (num * (count + thread_offset) != allocation.second) + DIE("set called with an incorrectly sized allocated_backing: ", allocation.second, ", blocks ", num, ", count per ", count, + " and ", thread_offset, " thread_offset\n"); + if (reservation_count > num / 2) DIE("reservation_count=", reservation_count, " can not fit within num=", num, "\n"); + clear(); + num_blocks = num; + count_per_block = count; + reservation_size = reservation_count; + + backing = allocation; + available.reserve(num_blocks); + reservation.reserve(reservation_size); + promised_blocks_count = 0; + promised_reservations_count = 0; + for (size_t i = 0; i < num_blocks; i++) { + block_t tmp(backing.first + i * (count_per_block + thread_offset) + thread_offset, 0); + assert(is_backed(tmp)); + push(tmp); + } + if (available.size() + reservation.size() != num_blocks) + DIE("Invalid set() - available size is not num_blocks: ", available.size(), " available + ", reservation.size(), + " reserved vs ", num_blocks, "\n"); + if (!promised_blocks.empty()) DIE("Invalid set() - promised_blocks should be empty: ", promised_blocks.size(), "\n"); + if (!promised_reservations.empty()) + DIE("Invalid set() - promised_reservations should be empty: ", promised_reservations.size(), "\n"); + assert(num_blocks == 0 || valid()); + } + + // clears this dispatcher. 
Aborts if promises queue is not empty or available is not full + void clear() { + if (!valid()) { + assert(available.empty()); + assert(reservation.empty()); + assert(promised_blocks.empty()); + assert(promised_reservations.empty()); + assert(num_blocks == 0); + assert(promised_blocks_count == 0); + assert(promised_reservations_count == 0); + } + + if (num_blocks) { + promise_block_count_timer.print_reduce_timings(); + if (reservation_size) { + promise_reservation_count_timer.print_reduce_timings(); + } + } + + if (!promised_blocks.empty()) DIE("clear() called with entries in the promised_blocks queue\n"); + if (!promised_reservations.empty()) DIE("clear() called with entries in the promised_reservations queue\n"); + if (reservation.size() + available.size() != num_blocks) + DIE("clear() called witnout all blocks returned to available: ", available.size(), " + reserved ", reservation.size(), + " expected ", num_blocks, "\n"); + + backing = {}; + available.resize(0); + reservation.resize(0); + num_blocks = 0; + count_per_block = 0; + promised_blocks_count = 0; + promised_reservations_count = 0; + assert(!valid()); + barrier(); + } + + inline size_t get_count_per_block() const { return count_per_block; } + inline size_t get_num_blocks() const { return num_blocks; } + + public: + // returns a future for a full reservation: ready immediately if one is full now, otherwise fulfilled by a later push() + // non blocking + upcxx::future acquire_reservation() { + assert(reservation_size > 0); + if (reservation_size == 0) DIE("There is no reservation to acquire as reservation_size == 0\n"); + // lock against concurrent modification on all containers + if (reservation.size() == reservation_size) { + reservation_t reserved = claim_reservation(); + assert(reserved->front().first); + assert(reserved->back().first); + DBG("acquire_reservation: got immediately:", reserved.get(), " -- ", to_string(), "\n"); + return make_future(reserved); + } else { + // add a promise for a reservation + DBG("acquire_reservation: issuing a 
promise -- ", to_string(), "\n"); + promise_reservation_t res(promise_reservation_count_timer); + auto fut = res.get_future(); + promised_reservations.push_back(std::move(res)); + promised_reservations_count++; + assert(promised_reservations_count == promised_reservations.size()); + return fut; + } + } + + reservation_t acquire_partial_reservation() { + if (reservation_size == 0) DIE("There is no reservation to acquire as reservation_size == 0\n"); + reservation_t res; + res = claim_reservation(false); + assert(res); + DBG("acquire_partial_reservation got one with ", res->size(), " -- ", to_string(), "\n"); + return res; + } + + void release_reservation(reservation_t reserved) { + assert(reserved); + DBG("Release_reservation:", reserved.get(), ", size=", reserved->size(), "\n"); + for (auto block : *reserved) { + // DBG("Pushing back reserved block=", block.first, " reservation:", reserved.get(), "\n"); + assert(block.first); + push(block); + assert(!block.first); // invalidated + } + reserved->clear(); + } + + // if the reservation is not full, push to it + // otherwise if there is a promised_block, fulfill that promise + // otherwise push the block into the available heap + // assigns a zero count, and invalidates block so it can not be reused + void push(block_t &block) { + assert(block.first); + if (!valid()) DIE("push called on invalid BlockDispatcher!\n"); + assert(is_backed(block)); + if (!is_backed(block)) + DIE("push called on foreign block(", block.first, " ", block.second, "): ", backing.first, " ", backing.second, "\n"); + assert(promised_blocks_count == promised_blocks.size()); + block.second = 0; // reset the count + + // a reservation may already be ready so fulfill apply before promised_block fulfillment + try_fulfill_promised_reservation(); + + if (reservation.size() < reservation_size) { + // put into the reservation + reservation.push_back(block); + } else if (!promised_blocks.empty()) { + // deliver this block to the first promised_blocks + 
block_t promised_block = block; // copy before invalidation below + DBG("push fulfilling promised_block: ", promised_block.first, "\n"); + assert(!promised_blocks.empty()); + auto promise_for_block = std::move(promised_blocks.front()); + promised_blocks.pop_front(); + promised_blocks_count--; + promise_for_block.fulfill_result(promised_block); + } else { + // put back on available heap + available.push_back(block); + } + block = {}; // invalidate it + + if (available.size() + reservation.size() > num_blocks) + DIE("push added too many blocks: ", available.size(), " + ", reservation.size(), " vs ", num_blocks, "\n"); + + // a reservation may now also be ready + try_fulfill_promised_reservation(); + } + + void try_fulfill_promised_reservation() { + // check for any outstanding promised_reservations and fulfill if possible + if (reservation_size > 0 && !promised_reservations.empty() && reservation.size() == reservation_size) { + // fulfill this promised reservation + auto promised_reservation = claim_reservation(); + assert(promised_reservation); + assert(promised_reservation->front().first); + assert(promised_reservation->back().first); + assert(promised_reservation->size() == reservation_size); + auto promise_for_reservation = std::move(promised_reservations.front()); + promised_reservations.pop_front(); + promised_reservations_count--; + assert(promised_reservations_count == promised_reservations.size()); + DBG("fulfilling promised_reservation: ", promised_reservation.get(), "\n"); + promise_for_reservation.fulfill_result(promised_reservation); + } + } + + // returns a future block. 
+ // if one is available it may be immediately ready + // otherwise a promised_block is made and tracked + // if available.empty, create a promised_block + future_block_t pop(bool from_reservation = false) { + if (!valid()) DIE("pop_available called on invalid BlockDispatcher!\n"); + future_block_t future_block; + // lock against concurrent modification on all containers + if (available.empty() && from_reservation && reservation.size() > 0) { + block_t block = reservation.back(); + reservation.pop_back(); + assert(is_backed(block)); + assert(block.second == 0); + future_block = to_future(block); + } else if (available.empty()) { + // add a new promised_block and return its future + promise_block_t prom(promise_block_count_timer); + future_block = prom.get_future(); + promised_blocks.push_back(std::move(prom)); + promised_blocks_count++; + // DBG("pop got promised_blocks\n"); + } else { + block_t block = available.back(); + available.pop_back(); + assert(is_backed(block)); + assert(block.second == 0); + future_block = to_future(block); + // DBG("pop got available\n"); + } + return future_block; + } + + // const status accessors + + inline size_t reserved_size() const { return reservation.size(); } + + inline size_t available_size() const { return available.size(); } + + inline bool available_empty() const { return available.empty(); } + + // true if both the promises queues are empty + bool empty() const { + // DBG("empty(): promised reservations=", promised_reservations_count, " promised_blocks_count=", promised_blocks_count, " + // available=", available.size() , " reserved=", reservation.size(), " num_blocks=", num_blocks, "\n"); + return promised_reservations.empty() && promised_blocks.empty() && available_size() + reservation.size() == num_blocks; + } + + // true if the promises queue has entries + inline size_t promises_size() const { return promised_reservations_count + promised_blocks_count; } + + inline bool promises_empty() const { return 
promised_reservations.empty() && promised_blocks.empty(); } + + // to_string for debug output: description, backing pointer, block geometry, available count and promise counters + string to_string() const { + ostringstream os; + os << description << "-"; + os << "BlockDispatcher(backing=" << backing.first; + os << ",count=" << count_per_block << ",num=" << num_blocks; + os << ",avail=" << available.size(); + os << ",promised_reservations=" << promised_reservations_count; + os << ",promised_blocks=" << promised_blocks_count << ")"; + assert(promised_reservations_count == promised_reservations.size()); + assert(promised_blocks_count == promised_blocks.size()); + return os.str(); + } +}; + +// Tracks rpc acknowledgment futures (rpcs_in_flight), sent/returned counters and the rpc timers +class TrackRPCs { + public: + using future_ack_t = upcxx::future<>; + // NOTE(review): the element type of this list appears to be missing in this copy of the patch -- confirm against upstream + using rpc_acks_t = list; + + TrackRPCs(const string description_) + : rpcs_in_flight() + , sent_rpcs(0) + , returned_rpcs(0) + , rpc_timer() + , rpc_inner_timer() + , rpc_relay_timer() + , description(description_) + , t_prog(description_) {} + TrackRPCs(const TrackRPCs &) = delete; + TrackRPCs(TrackRPCs &&) = default; + virtual ~TrackRPCs() { clear(); } + + bool empty() const; + + // frees memory. 
Can only be called when all futures have completed + void clear(); + + // track an rpc acknowledgment + void push(future_ack_t fut); + + // tests all rpcs and returns the remaining count + // if ready, wait on it and remove, otherwise count it + size_t pop_finished(); + + // presumably the number of tracked rpcs that have not completed yet (defined out of line -- confirm) + size_t count_pending(); + + // presumably progresses until at most max_pending rpcs remain (defined out of line -- confirm) + void flush(size_t max_pending = 0); + + string to_string() const; + + // accessors for the three rpc timers + inline ActiveCountTimer &get_rpc_timer() { return rpc_timer; } + inline ActiveCountTimer &get_rpc_inner_timer() { return rpc_inner_timer; } + inline ActiveCountTimer &get_rpc_relay_timer() { return rpc_relay_timer; } + + protected: + rpc_acks_t rpcs_in_flight; + size_t sent_rpcs, returned_rpcs; + ActiveCountTimer rpc_timer, rpc_inner_timer, rpc_relay_timer; + string description; + ProgressTimer t_prog; +}; + +template +class FixedMemoryRPC { + // consists of a global pointer dispatcher, and an acknowledgement queue for RPC calls + // NOTE(review): no acknowledgement queue is declared among the members below -- the sentence above may be stale + // The global pointer dispatcher is for receiving rgets from remote global shared memory + // The global pointer dispatcher is also for accumulating a block locally and then sending the blocks remotely + + public: + // for global data on this node + using global_block_dispatch_t = BlockDispatcher >; + using global_block_t = typename global_block_dispatch_t::block_t; + using future_global_block_t = typename global_block_dispatch_t::future_block_t; + using global_store_t = typename global_block_dispatch_t::blocks_t; + using future_src_dest_block_t = upcxx::future; + using global_reservation_t = typename global_block_dispatch_t::reservation_t; + using inst_timer_t = GenericInstantiationTimer; + + private: + // backing and dispatchers + global_block_t global_backing; // for dest stores and sourcing rgets + global_block_dispatch_t global_dispatcher; + + global_store_t dest_stores; // a block of data for aggregation to each destination + size_t thread_offset; // number of T elements placed in front of each block (0 when unused) + ActiveCountTimer rput_timer, rget_timer, rget_wait_timer; + ProgressTimer t_prog; + string description; + + public: + 
FixedMemoryRPC(const string description) + : global_backing({}) + , global_dispatcher(description + string("-global-dispatcher")) + , dest_stores() + , thread_offset(0) + , rput_timer() + , rget_timer() + , rget_wait_timer() + , t_prog(description) + , description(description) {} + FixedMemoryRPC(const FixedMemoryRPC &) = delete; + FixedMemoryRPC(FixedMemoryRPC &&) = default; + virtual ~FixedMemoryRPC() { clear(); } + + // true when either count_per_block == 1 with no backing pointer, + // or count_per_block > 1 with a backing pointer and a non-zero backing size + bool valid() const { + bool is_valid = (global_dispatcher.get_count_per_block() == 1 && !global_backing.first) || + (global_dispatcher.get_count_per_block() > 1 && global_backing.first && global_backing.second > 0); + + return is_valid; + } + + // calls t_prog.progress() + inline void my_progress() { t_prog.progress(); } + + // we may have no intra dispatchers if there is 1 thread per node + // we may have no inter dispatchers if there is just 1 node + // pops num_stores blocks from the dispatcher into dest_stores (only when count_per_block > 1) + // dies if fewer than num_stores blocks are available or if any popped block is not immediately ready + void set_dest_stores(size_t num_stores) { + DBG("FixedMemoryRPC - ", description, "::set_dest_stores(num_stores=", num_stores, + ") global_blocks=", global_dispatcher.get_num_blocks(), " count=", global_dispatcher.get_count_per_block(), + " avail=", global_dispatcher.available_size(), "\n"); + assert(valid()); + if (global_dispatcher.get_count_per_block() > 1) { + // require split_rank::split_local_team().rank_n() - 1 available blocks at this point (no blocking!) + if (global_dispatcher.available_size() < num_stores) { + DIE("There are an insufficient number of available blocks to populate the dest stores: available_blocks=", + global_dispatcher.available_size(), ", num_stores=", num_stores, "\n"); + } + dest_stores.reserve(num_stores); + for (size_t i = 0; i < num_stores; i++) { + auto fut = global_dispatcher.pop(); + if (!fut.is_ready()) { + DIE("Detected a global block that is not ready! 
i=", i, " available_size=", global_dispatcher.available_size(), "\n"); + } + dest_stores.push_back(fut.wait()); + assert(dest_stores.back().first); + } + } + assert(valid()); + barrier(); // required so that no other global_dispatcher.pop() happens before dest_stores are filled + } + + // returns every dest store block to the dispatcher and empties dest_stores; dies if any store still holds elements + void clear_dest_stores() { + DBG("FixedMemoryRPC - ", description, " clear_dest_stores:", dest_stores.size(), "\n"); + if (global_dispatcher.get_num_blocks() == 0) { + assert(dest_stores.empty()); + } else { + assert(valid()); + for (auto s : dest_stores) { + if (s.second > 0) DIE("Can not clear_dest_stores if they are not empty!\n"); + global_dispatcher.push(s); // s is a copy of the stored block; dest_stores itself is emptied by the resize below + } + dest_stores.resize(0); + assert(valid()); + } + } + + // number of dest stores that currently hold no elements + size_t count_empty_dest_stores() const { + size_t empty = 0; + for (auto s : dest_stores) { + if (s.second == 0) empty++; + } + return empty; + } + + // only true when the global dispatcher is empty itself + // and available + reserved + empty dest stores accounts for every block of the dispatcher + bool empty() const { + bool is_empty = global_dispatcher.empty() && + global_dispatcher.available_size() + global_dispatcher.reserved_size() + count_empty_dest_stores() == + global_dispatcher.get_num_blocks(); + // DBG("FixedMemoryRPC::empty(): ", (is_empty?"True":"False"), ", global.empty()=", (global_dispatcher.empty()?"True":"False"), + // "\n"); + return is_empty; + } + + // allocates the blocks and sets the dispatchers + // NOTE(review): global_dispatcher.clear() runs before clear(), and clear() returns early when valid() is false -- + // confirm that an existing global_backing is still released when set_fixed_mem is called a second time + void set_fixed_mem(size_t num_global_blocks, size_t count_per_block, size_t num_stores, bool includes_thread_offset = false, + size_t num_reserved_blocks = 0) { + global_dispatcher.clear(); + clear(); + + if (num_global_blocks == 0) { + assert(count_per_block == 1); + SOUT("Using empty FixedMemoryRPC\n"); + return; + } + + if (num_global_blocks <= num_stores + num_reserved_blocks) { + DIE("Invalid set_fixed_mem num_global_blocks=", num_global_blocks, " num_stores=", num_stores, + " num_reserved_blocks=", num_reserved_blocks, "\n"); + } + + size_t 
global_count = num_global_blocks * count_per_block; + + SOUT("Allocating ", description, " dispatchers: global_count=", global_count, " ", get_size_str(global_count * sizeof(T)), + "\n"); + + // thread_offset is the number of T elements needed to hold count_per_block thread_num_t values (rounded up), or 0 + thread_offset = includes_thread_offset ? (sizeof(thread_num_t) * count_per_block + sizeof(T) - 1) / sizeof(T) : 0; + + // allocate thread_offset more elements and start the block that many into the actual allocation + global_backing.second = global_count + (thread_offset * num_global_blocks); + global_backing.first = upcxx::new_array(global_backing.second); + size_t global_num = global_backing.second / (count_per_block + thread_offset); + assert(global_num == num_global_blocks); + global_dispatcher.set(global_backing, global_num, count_per_block, thread_offset, num_reserved_blocks); + + SOUT("finished allocating ", description, " dispatchers\n"); + + // fill the dest stores from the freshly set dispatcher + set_dest_stores(num_stores); + + // report the memory footprint + size_t total_global_bytes = sizeof(T) * global_backing.second; + size_t total_bytes = total_global_bytes; + SOUT("Using ", num_global_blocks, " global of ", count_per_block, " elements (of ", get_size_str(sizeof(T)), + ") aggregate & send (", get_size_str(total_global_bytes), ") per thread or ", + get_size_str(total_bytes * upcxx::local_team().rank_n()), " per node in shared memory for dest and send buffers\n"); + assert(valid()); + } + + // frees memory. 
Can only be called when all futures have completed + void clear() { + if (!valid()) { + assert(!global_dispatcher.valid()); + assert(dest_stores.empty()); + return; + } + clear_dest_stores(); + t_prog.print_out(); + rget_timer.print_reduce_timings(description + "-rget"); + rget_wait_timer.print_reduce_timings(description + "-rget-wait"); + rput_timer.print_reduce_timings(description + "-rput"); + + // clear the global_dispatcher, free its backing allocation, then barrier() + global_dispatcher.clear(); + upcxx::delete_array(global_backing.first); + global_backing = {}; + assert(!valid()); + barrier(); + } + + inline bool has_dest_stores() const { return dest_stores.size() > 0; } + + // returns a reference to the dest store at store_idx; dies when store_idx is out of range + global_block_t &dest_store(size_t store_idx) { + if (store_idx >= dest_stores.size()) DBG("getting ", description, " dest_store store_idx=", store_idx, "\n"); + assert(store_idx < dest_stores.size()); + if (dest_stores.size() <= store_idx) + DIE("There are no dest stores at the moment:", dest_stores.size(), " looking for ", store_idx, "\n"); + global_block_t &gblock = dest_stores[store_idx]; + return gblock; + } + + // push a global block back to the dispatcher + // the block must be non-null and located on this rank; gblock is invalidated on return + void push_global(global_block_t &gblock) { + // DBG("FixedSize::push_global: ", gblock.first, "\n"); + if (!valid()) DIE("push called on an invalid FixedMemoryRPC!\n"); + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + global_dispatcher.push(gblock); + assert(!gblock.first); // is invalidated + } + + // pops a future global block from the dispatcher + // from_reservation is forwarded to the dispatcher's pop() + future_global_block_t pop_global(bool from_reservation = false) { + // DBG("pop_global\n"); + if (!valid()) DIE("pop_global called on invalid FixedMemoryRPC!\n"); + return global_dispatcher.pop(from_reservation); + } + + // thin forwards to the dispatcher's reservation interface + inline upcxx::future acquire_reservation() { return global_dispatcher.acquire_reservation(); } + + inline global_reservation_t acquire_partial_reservation() { return global_dispatcher.acquire_partial_reservation(); } + + inline void release_reservation(global_reservation_t reserved) { 
global_dispatcher.release_reservation(reserved); } + + inline bool has_promises() const { return !global_dispatcher.promises_empty(); } + + inline size_t global_available_size() const { return global_dispatcher.available_size(); } + + inline size_t global_reserved_size() const { return global_dispatcher.reserved_size(); } + + size_t _prep_xfer(global_block_t &src, global_block_t &dest) { + if (src.second == 0) DIE(__func__, " Invalid state - src block is EMPTY\n"); + if (dest.second > 0) DIE(__func__, " Invalid state - dest is not empty: ", dest.second, "\n"); + assert(global_dispatcher.get_count_per_block() >= src.second); + // dest will have src's size + dest.second = src.second; + size_t send_offset = 0; + if (thread_offset > 0) { + // blocks start inside of the allocation so the thread_num descends and the element ascends from the pointer + // send_offset is the number of T elements in front of the pointer that cover dest.second thread_num_t values + send_offset = (dest.second * sizeof(thread_num_t) + sizeof(T) - 1) / sizeof(T); + } + assert(send_offset <= thread_offset); + return send_offset; + } + + // starts an rput of local source to remote dest + // returns a future of both blocks once the rput has completed; src and dest are reset to {} before returning + future_src_dest_block_t rput_block(global_block_t &src, global_block_t &dest) { + DBG("rput_block( src ", src.first, ", dest: ", dest.first, ", ", src.second, ")\n"); + size_t send_offset = _prep_xfer(src, dest); + auto rput_t = make_shared(rput_timer); + + // perform the rput + assert(src.first.is_local()); + assert(dest.second == src.second); + auto rput_fut = rput(src.first.local() - send_offset, dest.first - send_offset, dest.second + send_offset); + src.second = 0; // signal it is drained + auto fut_return = when_all(make_future(src, dest), rput_fut); + + // prevent reuse + src = {}; + dest = {}; + + return fut_return.then([rput_t, send_offset](global_block_t src, global_block_t dest) { + size_t count = dest.second + send_offset; + DBG("rput completed ", dest.second, " elements with ", send_offset, " extra (", get_size_str(count * sizeof(T)), + ") src=", src.first, " dest=", dest.first, " in ", 
rput_t->get_elapsed_since_start(), " s, ", + get_size_str(count * sizeof(T) / rput_t->get_elapsed_since_start()), " / s\n"); + return make_future(src, dest); + }); + } + + // starts an rget of the global block copied to the local block + // creates a future of the same global and local blocks once the rget has completed + // invalidates both inputs: src and dest + // non-blocking + future_src_dest_block_t rget_block(global_block_t &src, global_block_t &dest) { + DBG("rget_block( src ", src.first, ", dest: ", dest.first, ", ", src.second, ")\n"); + size_t send_offset = _prep_xfer(src, dest); + auto rget_t = make_shared(rget_timer); + + // perform the rget + assert(dest.first.is_local()); + assert(dest.second == src.second); + auto rget_fut = rget(src.first - send_offset, dest.first.local() - send_offset, dest.second + send_offset); + src.second = 0; // signal it is drained + auto fut_return = when_all(make_future(src, dest), rget_fut); + + // prevent reuse + dest = {}; + src = {}; + + return fut_return.then([rget_t, send_offset](global_block_t src, global_block_t dest) { + size_t count = dest.second + send_offset; + DBG("rget completed ", dest.second, " elements with ", send_offset, " extra (", get_size_str(count * sizeof(T)), + ") src=", src.first, " dest=", dest.first, " in ", rget_t->get_elapsed_since_start(), " s, ", + get_size_str(count * sizeof(T) / rget_t->get_elapsed_since_start()), " / s\n"); + return make_future(src, dest); + }); + } + + // rget_block overload that pops a new block for dest (allowing the reservation) and then starts the rget + // NOTE(review): the continuation captures this, so *this must stay alive until the returned future completes + future_src_dest_block_t rget_block(global_block_t &gblock) { + // get a future block_t + auto rget_wait_t = make_shared(rget_wait_timer); + auto fut_loc = pop_global(true); // allow extraction from reservation + auto fut_both = when_all(to_future(gblock), fut_loc); + // rget the block + auto fut_blocks = fut_both + .then([rget_wait_t](global_block_t src, global_block_t dest) { + // just stop the timer + return make_future(src, dest); + }) + .then([this](global_block_t src, 
global_block_t dest) { return this->rget_block(src, dest); }); + return fut_blocks; + } + + inline size_t get_count_per_block() const { + return global_dispatcher.get_count_per_block(); + } + + inline size_t get_thread_offset() const { return thread_offset; } + + string to_string() const { + ostringstream os; + os << description << "-"; + os << "FixedMemoryRPC("; + os << "thread_offset=" << thread_offset; + os << ",global_back=" << global_backing.first << "," << global_backing.second; + os << ",global_dispatch=" << global_dispatcher.to_string(); + os << ")"; + return os.str(); + } +}; + +template +class TwoTierAggrStore { + private: + // T for intra node RPCs + using intra_fixed_memory_rpc_t = FixedMemoryRPC; + using intra_fixed_memory_t = dist_object; + using intra_global_ptr_t = global_ptr; + using intra_global_block_t = typename intra_fixed_memory_rpc_t::global_block_t; + using intra_future_global_block_t = typename intra_fixed_memory_rpc_t::future_global_block_t; + using intra_reservation_t = typename intra_fixed_memory_rpc_t::global_reservation_t; + + // For inter-node global, use a more compact array than a pair + // as the pair packs very inefficiently and sends a lot of zeros over the net + // #'s for thread-dest appending descending, E for element appending ascending from the pointer at the first E + // .....4321EEEE..... 
+ // --------->-------- // start of element ptr == T* (alloc + thread_offset) + // --------<--------- // start of thread_num ptr == ((thread_num_t*) (alloc + thread_offset)) - 1 + // only sending the non-zero data in the middle over the wire + // thread_offset represents the # of elements from the start of the allocation that the pointer will be at + + using inter_fixed_memory_rpc_t = FixedMemoryRPC; + using inter_fixed_memory_t = dist_object; + using inter_global_ptr_t = global_ptr; + using inter_global_block_t = typename inter_fixed_memory_rpc_t::global_block_t; + using inter_future_global_block_t = typename inter_fixed_memory_rpc_t::future_global_block_t; + using inst_timer_t = GenericInstantiationTimer; + + using track_rpcs_t = dist_object; + + FuncDistObj &func; + + size_t max_store_size; // the count of T per block (may be 0) + size_t max_rpcs_in_flight; // Limit for the number of rpcs in flight. This limit exists to prevent the dispatch buffers from + // growing indefinitely + + inter_fixed_memory_t inter_fixed_memory_store; + intra_fixed_memory_t intra_fixed_memory_store; + track_rpcs_t track_inter_rpcs, track_intra_rpcs; + ProgressTimer t_prog; + // function-local static timer shared by every process_local() call + static IntermittentTimer &t_process_local() { + static IntermittentTimer _(string("process_local()")); + return _; + } + + // private static methods + + // processes a batch of data that is local (must be intra) + // applies a copy of the functor held by func to each of the count elements, timed by t_process_local() + static void process_local(T *elem, size_t count, FuncDistObj &func) { + assert(elem); + assert(count > 0); + t_process_local().start(); + auto func_inst = *func; + for (size_t i = 0; i < count; i++) { + func_inst(elem[i]); + } + t_process_local().stop(); + } + + // block overload: the block must be non-null and local; forwards its pointer and element count + static void process_local(intra_global_block_t lblock, FuncDistObj &func) { + assert(lblock.first); + assert(lblock.first.is_local()); + process_local(lblock.first.local(), lblock.second, func); + } + + // static my_partial_progress version does NOT call upcxx::progress() + // just clears any ready rpcs + static size_t 
my_partial_progress(track_rpcs_t &track_rpcs) { + size_t pending_rpcs = track_rpcs->pop_finished(); + assert(pending_rpcs == track_rpcs->count_pending()); + return pending_rpcs; + } + + bool my_progress_is_required; + inline bool &my_progress_required() { return my_progress_is_required; } + // recomputes my_progress_is_required and returns it + // true when either fixed memory store has outstanding promises or one of the rpc timer checks below trips + // NOTE(review): the previous wording here described performing upcxx::progress() and returning the number of pending rpcs, + // which matches my_progress() rather than this function + bool calc_my_progress_required() { + my_progress_is_required = false; + if (inter_fixed_memory_store->has_promises() || intra_fixed_memory_store->has_promises()) { + // some promises exist + my_progress_is_required = true; + } else if (track_inter_rpcs->get_rpc_inner_timer().get_total_count() + + track_inter_rpcs->get_rpc_inner_timer().get_active_count() < + track_inter_rpcs->get_rpc_timer().get_total_count() + track_inter_rpcs->get_rpc_timer().get_active_count()) { + // inter inner rpcs (receiving) is less than rpcs (sending) + my_progress_is_required = true; + } else if (track_inter_rpcs->get_rpc_inner_timer().get_active_count() > 2 * split_rank::num_nodes()) { + // there are more than twice as many active inter rpcs requiring my progress as there are nodes. Get them completed + my_progress_is_required = true; + } else if (track_intra_rpcs->get_rpc_inner_timer().get_active_count() > 2 * split_rank::num_threads()) { + // there are more active intra rpcs requiring my progress than there are threads in a node. 
Get them completed + my_progress_is_required = true; + } + // DBG(__func__, ": ", my_progress_is_required, "\n"); + return my_progress_is_required; + } + + // calls t_prog.progress(), refreshes my_progress_is_required + // returns the number of pending inter rpcs plus pending intra rpcs + size_t my_progress() { + // DBG(__func__, " my_progress_is_required=", my_progress_is_required, " -- ", to_string(), "\n"); + t_prog.progress(); + calc_my_progress_required(); + return my_partial_progress(track_inter_rpcs) + my_partial_progress(track_intra_rpcs); + } + + // loops on my_progress() until fewer than max_rpcs_in_flight rpcs are pending + // NOTE(review): if max_rpcs_in_flight can be 0 the loop condition is always true -- confirm it is always > 0 + void wait_max_rpcs() { + // limit pending RPCs still + StallTimer is_stalled(description + string("-wait_max_rpcs")); + while (my_progress() >= max_rpcs_in_flight) is_stalled.check(); + } + + // simply sends a single element via rpc, bypassing all blocks + // the returned future is tracked in track_inter_rpcs + void send_rpc1(intrank_t target_rank, const T &elem) { + auto fut = rpc( + target_rank, [](T elem, FuncDistObj &func) { (*func)(elem); }, elem, func); + track_inter_rpcs->push(fut); + } + + // get the thread from a block with a thread_offset + // thread numbers sit immediately below block and descend in memory: idx 0 is at ((thread_num_t *)block) - 1 + static inline thread_num_t &get_thread_from_block(T *block, int idx) { + assert(idx >= 0); + return *(((thread_num_t *)block) - 1 - idx); + } + + // This function takes last element as pivot, places + // the pivot element at its correct position in sorted + // array, and places all smaller (smaller than pivot) + // to left of pivot and all greater elements to right + // of pivot + // the sort key is the thread number; each swap moves both the element and its thread entry + static int block_quicksort_partition(T *block, int low, int high) { + thread_num_t pivot = get_thread_from_block(block, high); // pivot + int i = (low - 1); // Index of smaller element + + for (int j = low; j <= high - 1; j++) { + // If current element is smaller than the pivot + if (get_thread_from_block(block, j) < pivot) { + i++; // increment index of smaller element + assert(i >= low); + assert(j >= low); + assert(i < high); + assert(j < high); + if (i != j) { + assert(i < j); + std::swap(block[i], block[j]); + std::swap(get_thread_from_block(block, i), get_thread_from_block(block, j)); + } + assert(get_thread_from_block(block, i) < pivot); + } + } + assert(i + 1 >= low); + 
if (i + 1 != high) { + assert(i + 1 < high); + std::swap(block[i + 1], block[high]); + std::swap(get_thread_from_block(block, i + 1), get_thread_from_block(block, high)); + } + return (i + 1); + } + + // The main function that implements QuickSort + // low --> Starting index, + // high --> Ending index + // both indexes are inclusive; sorts elements and their thread entries by thread number + // NOTE(review): the pivot is always the last entry and entries equal to the pivot are never moved left, so a range whose + // entries all carry the same thread number recurses to a depth of (high - low) -- confirm block sizes keep this acceptable + static void block_quicksort(T *block, int low, int high) { + if (low < high) { + /* pi is partitioning index, block[pi] is now + at right place */ + int pi = block_quicksort_partition(block, low, high); + + // Separately sort elements before + // partition and after partition + block_quicksort(block, low, pi - 1); + block_quicksort(block, pi + 1, high); + } + } + + // returns "virtual" intra blocks based on the underlying gblock for each local thread + // some may be empty but there will be one entry for every local thread + // only the entries from index start onwards are sorted and partitioned + static vector inter_to_sorted_intra_blocks(inter_global_block_t &gblock, size_t start = 0) { + assert(gblock.first); + assert(gblock.first.is_local()); + assert(gblock.second > 0); + assert(start <= gblock.second); + DBG("Sorting ", gblock.second, " inter entries into intra blocks\n"); + + T *block = gblock.first.local(); + if (gblock.second - start > 1) { // no need to sort a single entry, right? 
+ block_quicksort(block, start, gblock.second - 1); +#ifdef DEBUG + /* validate it was indeed sorted */ + int last_thread = -1; + for (size_t idx = start; idx < gblock.second; idx++) { + thread_num_t t = get_thread_from_block(gblock.first.local(), idx); + assert(last_thread <= t); + assert(t >= 0); + assert(t < split_rank::num_threads()); + last_thread = t; + } +#endif + } + vector intra_blocks; + intra_blocks.resize(split_rank::num_threads(), {}); + int last_thread = -1; + // find the partitions by thread in the sorted array + // entries of one thread are contiguous after sorting: record the first index and the count for each thread + // TODO there should be a faster way to do this in a long list + for (size_t idx = start; idx < gblock.second; idx++) { + thread_num_t &thread = get_thread_from_block(block, idx); + assert(thread >= 0); + assert(thread < split_rank::num_threads()); + if (last_thread > thread) + DIE("inter_to_sorted_intra_blocks did not sort properly. idx=", idx, " gblock.second=", gblock.second, + " last_thread=", (int)last_thread, " thread=", (int)thread, "\n"); + intra_global_block_t &gb = intra_blocks[thread]; + if (!gb.first) { + assert(gb.second == 0); + gb.first = gblock.first + idx; + } + gb.second++; + last_thread = thread; + } + assert(gblock.first); // the pointer was not modified + assert(gblock.first.is_local()); // still local + assert(gblock.second > 0); + return intra_blocks; + } + + // relays the entries of a received inter-node block (lblock, local and non-empty) to their destination threads + // first through the intra dest stores while a partial reservation has blocks, then (plan B) by sorting the rest + // and sending intra rpcs directly; the returned future carries the block with its count reset to 0 + static upcxx::future inter_intra_inner_rpc_relay(inter_global_block_t lblock, track_rpcs_t &track_inter_rpcs, + track_rpcs_t &track_intra_rpcs, + intra_fixed_memory_t &intra_fixed_mem, + inter_fixed_memory_t &inter_fixed_mem, FuncDistObj &func) { + DBG("inter_relay processing inter RPC:", lblock.first, " ", lblock.second, "\n"); + assert(lblock.first); + assert(lblock.first.is_local()); + assert(lblock.second > 0); + inter_global_block_t lblock_consumed = lblock; + lblock_consumed.second = 0; + upcxx::future all_rpcs = make_future(lblock_consumed); + intra_reservation_t reservation = intra_fixed_mem->acquire_partial_reservation(); // may be empty + T *elem_ptr = 
lblock.first.local(); + thread_num_t *thread_ptr = ((thread_num_t *)elem_ptr) - 1; + size_t res_sent = 0; + size_t idx = 0; + for (; idx < lblock.second; idx++) { + if (reservation->empty()) break; // reservation exhausted: must resort to plan B for the remaining entries + T &elem = elem_ptr[idx]; // element increase up the stack + // NOTE(review): idx is size_t, so (0 - idx) wraps around as an unsigned value before it is used as an index -- + // confirm this is intended, or index with a signed offset + thread_num_t &thread = thread_ptr[0 - idx]; // threads increase down the stack + split_rank split = split_rank::from_thread(thread); + bool sent_rpc = add_to_dest_store_intranode_nb(split, elem, track_intra_rpcs, intra_fixed_mem, reservation, func); + if (sent_rpc) res_sent++; + } + if (res_sent) DBG("inter_relay res_sent=", res_sent, "\n"); + if (idx < lblock.second) { + // plan B + // sort the remaining entries by thread and send intra_rpcs directly using this set of virtual blocks + auto inter_intra_rpc_timer = make_shared(track_inter_rpcs->get_rpc_relay_timer()); + DBG("inter_relay sorting remaining ", lblock.second - idx, " entries for direct intra rpc\n"); + size_t direct_sent = 0; + vector intra_blocks = inter_to_sorted_intra_blocks(lblock, idx); + thread_num_t thread_idx = 0; + // intra_blocks has one entry per local thread, in thread order; empty entries are skipped + for (intra_global_block_t intra_block : intra_blocks) { + assert(thread_idx < split_rank::num_threads()); + if (intra_block.first && intra_block.second > 0) { + // NOTE(review): count is not used after this assignment + size_t count = intra_block.second; + auto fut_rpc = just_send_intra_rpc_nb(split_rank::get_rank_from_thread(thread_idx), intra_block, intra_fixed_mem, + track_intra_rpcs, func) + .then([](intra_global_block_t ignored) {}); + all_rpcs = when_all(all_rpcs, fut_rpc); + direct_sent++; + } + thread_idx++; + } + // once every direct rpc has completed, log the timing and pass the drained block through + all_rpcs = all_rpcs.then([inter_intra_rpc_timer, direct_sent](inter_global_block_t lblock) { + DBG("inter_relay All direct_sent=", direct_sent, " intra rpcs relayed from inter rpc took ", + inter_intra_rpc_timer->get_elapsed_since_start(), " s count=", inter_intra_rpc_timer->get_total_count(), + " active=", inter_intra_rpc_timer->get_active_count(), "\n"); + assert(lblock.first); + assert(lblock.first.is_local()); + assert(lblock.second == 0); 
// it is drained + // stop timer, return the block + return lblock; + }); + } + intra_fixed_mem->release_reservation(reservation); + assert(reservation->empty()); // reservation was drained after release + return all_rpcs; + } + + // sends and rpc to the corresponding target rank on a different node with the same split_rank::split_local_team().rank_me() + // consumes the gblock + // returns the gblock once it is consumed + void send_inter_rpc(intrank_t target_rank, inter_global_block_t &gblock) { + assert(!gblock.first.is_null()); + assert(gblock.first.where() == rank_me()); + assert(gblock.second > 0); + + DBG("send_inter_rpc(", target_rank, ", gblock=", gblock.first, " size=", gblock.second, "\n"); + assert(inter_fixed_memory_store->valid()); + assert(intra_fixed_memory_store->valid()); + + // time the round trip + auto t_rpc = make_shared(track_inter_rpcs->get_rpc_timer()); + + auto fut = rpc( + target_rank, + [](inter_global_block_t gblock, track_rpcs_t &track_inter_rpcs, track_rpcs_t &track_intra_rpcs, + intra_fixed_memory_t &intra_fixed_mem, inter_fixed_memory_t &inter_fixed_mem, FuncDistObj &func) { + assert(gblock.first.where() != rank_me()); // no local data should be transmitted via RPC + assert(split_rank::get_my_node() != + split_rank::get_node_from_rank(gblock.first.where())); // no inter-node transfer on the same node + DBG("Executing process rpc inter node ", gblock.first, " ", gblock.second, "\n"); + auto t_inner_rpc = make_shared(track_inter_rpcs->get_rpc_inner_timer()); + + // future for both blocks after the rget completes + auto fut_blocks = inter_fixed_mem->rget_block(gblock); + + // future for lblock after it is consumed + auto fut_relay = + fut_blocks + .then([&track_inter_rpcs, &track_intra_rpcs, &inter_fixed_mem, &intra_fixed_mem, &func]( + inter_global_block_t gblock_IGNORED, inter_global_block_t lblock) { + assert(gblock_IGNORED.first); + assert(gblock_IGNORED.first.where() != rank_me()); + assert(gblock_IGNORED.second == 0); // is 
drained + return inter_intra_inner_rpc_relay(lblock, track_inter_rpcs, track_intra_rpcs, intra_fixed_mem, inter_fixed_mem, + func) + .then([&inter_fixed_mem](inter_global_block_t lblock) { + assert(lblock.first); + assert(lblock.first.is_local()); + assert(lblock.second == 0); // consumed + inter_fixed_mem->push_global(lblock); + }); + }) + .then([t_inner_rpc]() { + DBG("Completed inter rpc relay in ", t_inner_rpc->get_elapsed_since_start(), + " s count=", t_inner_rpc->get_total_count(), " active=", t_inner_rpc->get_active_count(), "\n"); + // stop the timer + }); + // no need to wait on fut_relay -- cleanup of the timer and push back to inter_fixed_mem will eventually happen and be + // verified + + // future for gblock after it is consumed -- possibly with a return package + auto fut_return = + fut_blocks + .then([&inter_fixed_mem](inter_global_block_t gblock, + inter_global_block_t lblock_ignored) -> upcxx::future { + // optionally send this rank dest store back to the sending process + DBG("Returning inter rpc gblock=", gblock.first, "\n"); + assert(gblock.first); + assert(gblock.second == 0); // it is drained + assert(gblock.first.where() != rank_me()); + assert(lblock_ignored.first); + assert(lblock_ignored.first.where() == rank_me()); + node_num_t node = split_rank(gblock.first.where()).get_node(); + DBG("to node=", node, "\n"); + // check for another available lblock and enough in dest store to have efficient transfer speed + if (inter_fixed_mem->has_dest_stores() && inter_fixed_mem->global_reserved_size() > 0) { + inter_global_block_t &store_block = inter_fixed_mem->dest_store(node); + size_t store_size = store_block.second; + if (store_block.first && store_size * 2 > inter_fixed_mem->get_count_per_block() && + store_size * sizeof(T) > 4 * ONE_KB) { + // send this store back to sender with this gblock + DBG("Sending store_size=", store_size, " back to ", gblock.first.where(), " store=", store_block.first, + " (swapped)\n"); + assert(store_block.first); + 
assert(store_block.first.where() == rank_me()); + auto lblock_fut = inter_fixed_mem->pop_global(true); + assert(lblock_fut.is_ready()); + auto lblock = lblock_fut.result(); + assert(lblock.first); + assert(lblock.first.where() == rank_me()); + assert(lblock.second == 0); // empty + // swap the store and free block + std::swap(store_block, lblock); + assert(lblock.second > 0); + auto fut_rput_blocks = inter_fixed_mem->rput_block(lblock, gblock); + return fut_rput_blocks.then([&inter_fixed_mem](inter_global_block_t lblock, inter_global_block_t gblock) { + DBG("rput finished lblock=", lblock.first, " ", lblock.second, " gblock=", gblock.first, " ", + gblock.second, "\n"); + assert(lblock.first); + assert(lblock.first.where() == rank_me()); + assert(lblock.second == 0); // is drained + assert(gblock.first); + assert(gblock.first.where() != rank_me()); + assert(gblock.second > 0); + // push lblock after the rput completes + inter_fixed_mem->push_global(lblock); + return gblock; + }); + } + } + // did not end up sending, just return the empty gblock + DBG("Just returning gblock=", gblock.first, "\n"); + assert(gblock.first); + assert(gblock.first.where() != rank_me()); + assert(gblock.second == 0); // it is drained + return make_future(gblock); + }) + .then([t_inner_rpc](inter_global_block_t gblock) { + DBG("Returning inter rpc gblock=", gblock.first, " ", gblock.second, " in ", + t_inner_rpc->get_elapsed_since_start(), "\n"); + return gblock; + }); + + return fut_return; // just the gblock + }, + gblock, track_inter_rpcs, track_intra_rpcs, intra_fixed_memory_store, inter_fixed_memory_store, func); + + gblock = {}; // do not allow reuse of this global pointer until the return is ready and pushed back + + // handle returned global_block + inter_fixed_memory_t &inter_fixed_mem = inter_fixed_memory_store; + intra_fixed_memory_t &intra_fixed_mem = intra_fixed_memory_store; + FuncDistObj &_func = func; + track_rpcs_t &_track_inter_rpcs = track_inter_rpcs; + track_rpcs_t 
&_track_intra_rpcs = track_intra_rpcs; + auto fut_returned = + fut.then([t_rpc, target_rank](inter_global_block_t gblock) { + DBG("Got inter rpc ack from ", (int)target_rank, ": ", gblock.first, " in ", t_rpc->get_elapsed_since_start(), " s\n"); + // just stop the timer + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + return make_future(gblock); + }) + .then([&inter_fixed_mem, &_track_inter_rpcs, &_track_intra_rpcs, &intra_fixed_mem, + &_func](inter_global_block_t gblock) -> upcxx::future { + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + auto fut_gblock = make_future(gblock); + if (gblock.second > 0) { + // there is data to be processed in this return ack, so relay it + DBG("Processing inter rpc ack gblock=", gblock.first, " ", gblock.second, "\n"); + return inter_intra_inner_rpc_relay(gblock, _track_inter_rpcs, _track_intra_rpcs, intra_fixed_mem, inter_fixed_mem, + _func); + } else { + return to_future(gblock); + } + }) + .then([&inter_fixed_mem](inter_global_block_t gblock) { + assert(gblock.first); + assert(gblock.second == 0); // it is drained + inter_fixed_mem->push_global(gblock); + assert(!gblock.first); // is invalidated + }); + + // remember this rpc to wait on later (may not be necessary) + track_inter_rpcs->push(fut_returned); + } + + // sends an rpc to target_rank located on this node (intra). 
+ // consumes the gblock: just_send_intra_rpc_nb resets it to {} and the acknowledged block is pushed back to intra_fixed_mem + // returns nothing; the acknowledgement future is registered with track_intra_rpcs + static void send_intra_rpc(intrank_t target_rank, intra_global_block_t &gblock, track_rpcs_t &track_intra_rpcs, + intra_fixed_memory_t &intra_fixed_mem, FuncDistObj &func) { + assert(!gblock.first.is_null()); + assert(gblock.first.where() == rank_me()); + assert(gblock.second > 0); + + DBG("send_intra_rpc(", target_rank, ", gblock=", gblock.first, " size=", gblock.second, "\n"); + assert(intra_fixed_mem->valid()); + my_partial_progress(track_intra_rpcs); + + auto fut_gblock = just_send_intra_rpc_nb(target_rank, gblock, intra_fixed_mem, track_intra_rpcs, func); + + // handle returned global_block: hand it back to the fixed memory store + auto fut_returned = fut_gblock.then([&intra_fixed_mem](intra_global_block_t gblock) { + DBG("Returned acknowledged global ", gblock.first, "\n"); + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + intra_fixed_mem->push_global(gblock); + assert(!gblock.first); // is invalidated + }); + + // remember this rpc to wait on later + track_intra_rpcs->push(fut_returned); + } + + // starts the intra-node rpc for gblock without registering it with track_intra_rpcs + // gblock is reset to {} before returning; the returned future yields the block handed back by target_rank + static upcxx::future just_send_intra_rpc_nb(intrank_t target_rank, intra_global_block_t &gblock, + intra_fixed_memory_t &intra_fixed_mem, track_rpcs_t &track_intra_rpcs, + FuncDistObj &func) { + // time the round-trip + auto t_rpc = make_shared(track_intra_rpcs->get_rpc_timer()); + + // This RPC just starts consuming the global block and makes progress on the remote rank + // It returns a future global block once the remote rank has processed it (local pointer) + // or has finished copying it with rget_block (non-local pointer; processing then completes later) + auto fut = rpc( + target_rank, + [](intra_global_block_t gblock, FuncDistObj &func, intra_fixed_memory_t &intra_fixed_mem, track_rpcs_t &track_intra_rpcs) { + DBG("Executing process rpc intra node ", gblock.first, " ", gblock.second, ", intra_fixed_mem: ", &(*intra_fixed_mem), + "\n"); + auto t_inner_rpc = make_shared(track_intra_rpcs->get_rpc_inner_timer()); + upcxx::future<> finished; + + upcxx::future fut_gblock; + if (gblock.first.is_local()) { + // processes 
the data immediately + process_local(gblock, func); + fut_gblock = to_future(gblock); + finished = make_future(); + } else { + // copy the global data, then process it eventually + DBG("initiating rget of non-local intra gblock:", gblock.first, "\n"); + auto fut_blocks = intra_fixed_mem->rget_block(gblock); + fut_gblock = fut_blocks.then([](intra_global_block_t gblock, intra_global_block_t ignored) { return gblock; }); + finished = fut_blocks.then([&func, &intra_fixed_mem](intra_global_block_t ignored, intra_global_block_t lblock) { + process_local(lblock, func); + intra_fixed_mem->push_global(lblock); + }); + } + finished.then([t_inner_rpc]() { + DBG("intra rpc finished in ", t_inner_rpc->get_elapsed_since_start(), " s\n"); + // stop the timer + }); + return fut_gblock; // return the global_block to sender for reuse + }, + gblock, func, intra_fixed_mem, track_intra_rpcs); + gblock = {}; // do not allow reuse of this global pointer until the return is ready and pushed back + + return fut.then([t_rpc](intra_global_block_t gblock) { + DBG("intra rpc returned in ", t_rpc->get_elapsed_since_start(), " s\n"); + // stop the timer + return gblock; + }); + } + + // member overload: forwards to the static send_intra_rpc with this instance's track_intra_rpcs, intra_fixed_memory_store and func + inline void send_intra_rpc(intrank_t target_rank, intra_global_block_t &gblock) { + send_intra_rpc(target_rank, gblock, track_intra_rpcs, intra_fixed_memory_store, func); + } + + // operation on 1 element (i.e. 
no dest_store) + // will block until sufficient global available blocks are available + // and subject to the maximum rpcs in flight + void update_remote1(intrank_t target_rank, const T &elem) { + assert(max_store_size <= 1); + // limit pending RPCs still + wait_max_rpcs(); + update_remote1_nb(target_rank, elem); + } + + // non blocking version (for use in future chains) + inline void update_remote1_nb(intrank_t target_rank, const T &elem) { send_rpc1(target_rank, elem); } + + // operate on a vector of elements in the dest_stores + // blocking static variant: passes a default-constructed reservation, so update_remote_intra_nb + // waits on replace_intra_store() for the replacement block + inline static void update_remote_intra(intrank_t target_rank, intra_global_block_t &gblock, track_rpcs_t &track_intra_rpcs, + intra_fixed_memory_t &intra_fixed_mem, FuncDistObj &func) { + intra_reservation_t empty_res; + update_remote_intra_nb(target_rank, gblock, track_intra_rpcs, intra_fixed_mem, empty_res, func); + } + + // sends the filled gblock to target_rank and leaves a fresh, empty block in gblock + // the replacement is taken from the reservation when one is supplied and non-empty; + // otherwise this call waits on replace_intra_store() + static void update_remote_intra_nb(intrank_t target_rank, intra_global_block_t &gblock, track_rpcs_t &track_intra_rpcs, + intra_fixed_memory_t &intra_fixed_mem, intra_reservation_t &reservation, FuncDistObj &func) { + DBG("update_remote_intra(target_rank=", target_rank, ", gblock=", gblock.first, ", size=", gblock.second, "\n"); + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + if (gblock.second == 0) DIE("Invalid call to update_remote on an empty global block\n"); + intra_global_block_t send_gblock = gblock; // make a copy + gblock = {}; // invalidate it first + + // now get another gblock + if (reservation && !reservation->empty()) { + replace_intra_store_nb(gblock, intra_fixed_mem, reservation); + } else { + assert(!gblock.first); + auto fut = replace_intra_store(gblock, intra_fixed_mem); + if (!fut.is_ready()) DBG(__func__, " will wait\n"); + fut.wait(); + } + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + + // now send the copy after gblock is available 
again + send_intra_rpc(target_rank, send_gblock, track_intra_rpcs, intra_fixed_mem, func); + assert(!send_gblock.first); // is invalidated + } + + // member overload: uses this instance's track_intra_rpcs / intra_fixed_memory_store / func and a default-constructed reservation + inline void update_remote_intra(intrank_t target_rank, intra_global_block_t &gblock) { + intra_reservation_t empty_res; + update_remote_intra_nb(target_rank, gblock, track_intra_rpcs, intra_fixed_memory_store, empty_res, func); + } + + // reservation-aware member overload: forwards the caller's reservation to the static update_remote_intra_nb + // (no update_remote_intra overload visible in this part of the file accepts a reservation, so the six-argument call must target the _nb variant) + inline void update_remote_intra_nb(intrank_t target_rank, intra_global_block_t &gblock, intra_reservation_t &reservation) { + update_remote_intra_nb(target_rank, gblock, track_intra_rpcs, intra_fixed_memory_store, reservation, func); + } + + // pops a free inter block and, once the pop is ready, installs it in gblock if gblock is still empty; + // if gblock was refilled in the meantime the popped block is pushed back to the store + static upcxx::future<> replace_inter_store(inter_global_block_t &gblock, inter_fixed_memory_t &inter_fixed_memory_store) { + assert(!gblock.first); + assert(gblock.second == 0); + upcxx::future newblock = inter_fixed_memory_store->pop_global(); + return when_all(make_future(std::ref(gblock)), newblock) + .then([&inter_fixed_memory_store](inter_global_block_t &gblock, inter_global_block_t newblock) { + assert(newblock.first); + assert(newblock.first.where() == rank_me()); + assert(newblock.second == 0); + if (!gblock.first) { + // won the race + gblock = newblock; + } else { + // lost the race put the newblock back + inter_fixed_memory_store->push_global(newblock); + } + }); + } + + // intra counterpart of replace_inter_store (does not assert that gblock is empty on entry) + static upcxx::future<> replace_intra_store(intra_global_block_t &gblock, intra_fixed_memory_t &intra_fixed_memory_store) { + upcxx::future newblock = intra_fixed_memory_store->pop_global(); + return when_all(make_future(std::ref(gblock)), newblock) + .then([&intra_fixed_memory_store](intra_global_block_t &gblock, intra_global_block_t newblock) { + assert(newblock.first); + assert(newblock.first.where() == rank_me()); + assert(newblock.second == 0); + if (!gblock.first) { + // won the race + gblock = newblock; + DBG("update_remote_intra: got an used new gblock from dispatcher:", gblock.first, "\n"); + } else { + // lost the race put the newblock back + 
intra_fixed_memory_store->push_global(newblock); + } + }); + } + + static void replace_intra_store_nb(intra_global_block_t &gblock, intra_fixed_memory_t &intra_fixed_mem, + intra_reservation_t &reservation) { + assert(reservation); + if (!reservation) DIE("invalid call without reservation!\n"); + assert(!reservation->empty()); + if (reservation->empty()) DIE("Unexpected - the reservation is fully drained!\n"); + + // replace the gblock with a reserved block + assert(!gblock.first); + gblock = reservation->back(); + reservation->pop_back(); + + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + assert(gblock.second == 0); + DBG("update_remote_intra: got new gblock from reservation:", gblock.first, "\n"); + } + + // operate on a vector of elements in the dest_stores + // will block until sufficient global available blocks are available + // and subject to the maximum rpcs in flight + void update_remote_inter(intrank_t target_rank, inter_global_block_t &gblock) { + assert(gblock.first); + assert(gblock.second > 0); + auto fut = update_remote_inter_nb(target_rank, gblock); + DBG(__func__, " my_progress\n"); + if (!fut.is_ready()) { + DBG(__func__, " still waiting on inter dest store\n"); + } + fut.wait(); + assert(gblock.first); // is valid again + } + + upcxx::future<> update_remote_inter_nb(intrank_t target_rank, inter_global_block_t &gblock) { + if (gblock.second == 0) DIE("Invalid call to update_remote_inter on an empty global block\n"); + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + inter_global_block_t sendBlock = gblock; // copy + size_t node = split_rank(target_rank).get_node(); + assert(inter_fixed_memory_store->dest_store(node) == sendBlock); + gblock = {}; // invalidate it + auto fut = replace_inter_store(gblock, inter_fixed_memory_store); + send_inter_rpc(split_rank::get_rank_from_node(node), sendBlock); // send to dedicated rank on remote node + if (!fut.is_ready()) DBG("intra dest store is not immediately 
ready\n"); + return fut; + } + + // returns true if an RPC was initiated + inline static bool add_to_dest_store_intranode(split_rank split, const T &elem, track_rpcs_t &track_intra_rpcs, + intra_fixed_memory_t &intra_fixed_mem, FuncDistObj &func) { + intra_reservation_t empty_res; + return add_to_dest_store_intranode_nb(split, elem, track_intra_rpcs, intra_fixed_mem, empty_res, func); + } + // non-blocking version (with non-empty reservation) + static bool add_to_dest_store_intranode_nb(split_rank split, const T &elem, track_rpcs_t &track_intra_rpcs, + intra_fixed_memory_t &intra_fixed_mem, intra_reservation_t &reservation, + FuncDistObj &func) { + // intranode + size_t max_store_size = intra_fixed_mem->get_count_per_block(); + intra_global_block_t &gblock = intra_fixed_mem->dest_store(split.get_thread()); + if (!gblock.first && reservation) { + // This is a race between the master persona waiting for a free gblock and an intra node rpc executing while it waits + // it is safe for this stack to swap in a new gblock from the reservation + assert(!reservation->empty()); + gblock = reservation->back(); + reservation->pop_back(); + DBG("add_to_dest_store_intranode replaced empty gblock with one from my reservation\n"); + } + assert(gblock.second < max_store_size); + if (gblock.second >= max_store_size) + DIE("Invalid state of gblock with ", gblock.second, " elements but max of ", max_store_size, "\n"); + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + if (gblock.first.where() != rank_me()) DIE("Invalid state of gblock not local to current rank: ", gblock.first, "\n"); + T *lptr = gblock.first.local(); + lptr[gblock.second++] = elem; + if (gblock.second == max_store_size) { + // DBG("add_to_dest_store_intranode found full for ", (int) split.get_thread(), "/", split.get_rank(), " reservation:", + // reservation.get(), ", gblock=", gblock.first, ",", gblock.second, "\n"); + if (reservation && reservation->empty()) DIE("Invalid state for ", __func__, " 
reservation is present but empty\n"); + update_remote_intra_nb(split.get_rank(), gblock, track_intra_rpcs, intra_fixed_mem, reservation, func); + assert(gblock.first); // gblock is restored + assert(gblock.first == intra_fixed_mem->dest_store(split.get_thread()).first); + return true; + } + assert(gblock.first); // gblock is still good + assert(gblock.first.local()); + assert(gblock.first == intra_fixed_mem->dest_store(split.get_thread()).first); + return false; + } + + void _add_to_dest_store_internode_fast(split_rank split, const T &elem, inter_global_block_t &gblock) { + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + assert(inter_fixed_memory_store->get_thread_offset() > 0); + if (gblock.second >= max_store_size) DIE("Invalid call to add_to_dest_store_internode_fast\n"); + T *lptr = gblock.first.local(); + // element ascendes after the pointer + lptr[gblock.second] = elem; + // thread num decends before the pointer + thread_num_t *t = ((thread_num_t *)lptr) - 1 - gblock.second; + *t = split.get_thread(); + gblock.second++; + assert(gblock.second <= max_store_size); + } + + // adds an entry to the dest store + // may send an rpc (returning true in that case) + // may block if inter global blocks are unavailable + bool add_to_dest_store_internode(split_rank split, const T &elem) { + inter_global_block_t &gblock = inter_fixed_memory_store->dest_store(split.get_node()); + assert(gblock.first); + assert(gblock.first == inter_fixed_memory_store->dest_store(split.get_node()).first); + if (gblock.second >= max_store_size) + DIE("Invalid state of gblock with ", gblock.second, " elements but max of ", max_store_size, "\n"); + assert(gblock.second < max_store_size); + if (gblock.first.where() != rank_me()) DIE("Invalid state of gblock not local to current rank: ", gblock.first, "\n"); + + bool did_send = false; + if (gblock.second < max_store_size) { + _add_to_dest_store_internode_fast(split, elem, gblock); + } + if (gblock.second == max_store_size) { 
+ update_remote_inter(split.get_rank(), gblock); + did_send = true; + } + assert(gblock.second < max_store_size); + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + assert(gblock.first == inter_fixed_memory_store->dest_store(split.get_node()).first); + return did_send; + } + + // returns true if an RPC was initiated + // may block + bool add_to_dest_store(intrank_t target_rank, const T &elem) { + bool sent_rpc = false; + if (max_store_size <= 1) { + update_remote1(target_rank, elem); + sent_rpc = true; + } else { + split_rank split(target_rank); + if (split.is_local()) { + // intranode + sent_rpc = add_to_dest_store_intranode(split, elem, track_intra_rpcs, intra_fixed_memory_store, func); + } else { + // internode + assert(split_rank::num_nodes() > 1); + sent_rpc = add_to_dest_store_internode(split, elem); + } + } + if (sent_rpc) { + my_progress(); // progress anyway to kick off the rpc + } + return sent_rpc; + } + + public: + string description; + + TwoTierAggrStore(FuncDistObj &f, const string description) + : func(f) + , max_store_size(0) + , max_rpcs_in_flight(MAX_RPCS_IN_FLIGHT) + , intra_fixed_memory_store(func.team(), description + string("-intra-store")) + , inter_fixed_memory_store(func.team(), description + string("-inter-store")) + , track_inter_rpcs(func.team(), description + string("-track-inter-rpc")) + , track_intra_rpcs(func.team(), description + string("-track-intra-rpc")) + , t_prog(description + string("-TwoTierAggrStore")) + , description(description) + , my_progress_is_required(false) {} + TwoTierAggrStore(const TwoTierAggrStore &) = delete; + TwoTierAggrStore(TwoTierAggrStore &&) = default; + virtual ~TwoTierAggrStore() { clear(); } + + string to_string() const { + ostringstream os; + os << description; + os << "-TwoTierAggrStore"; + os << "inter_store=" << inter_fixed_memory_store->to_string() << ","; + os << "intra_store=" << intra_fixed_memory_store->to_string() << ","; + os << "track_inter_rpcs=" << 
track_inter_rpcs->to_string() << ","; + os << "inter_rpc_t=" << track_inter_rpcs->get_rpc_timer().get_total_count() << "/" + << track_inter_rpcs->get_rpc_inner_timer().get_total_count() << ","; + os << "inter_intra_rpc_t=" << track_inter_rpcs->get_rpc_relay_timer().get_total_count() << ","; + os << "track_intra_rpcs=" << track_intra_rpcs->to_string() << ","; + os << "intra_rpc_t=" << track_intra_rpcs->get_rpc_timer().get_total_count() << "/" + << track_intra_rpcs->get_rpc_inner_timer().get_total_count() << ","; + os << ")"; + return os.str(); + } + + static void optimal_num_blocks_and_count_per(const size_t max_bytes, const size_t max_rpcs, size_t &num_intra_blocks, + size_t &num_inter_blocks, size_t &count_per_block) { + // a few constraints and priorities for optimization + // required: + // num_blocks * sizeof(T) * count_per_block <= max_bytes + // min_rpcs_in_flight <= rpcs_in_flight <= max_rpcs_in_flight + // rpcs_in_flight == num_blocks - dest_store_size + // + // optimization compromises: + // count_per_block * sizeof(T) >= 8KB, optimally much larger 1MB + // dest_store_size == count_per_block == 1 ? 
0 : rank_n() + // max_rpcs_in_flight == min(rank_n() * 10, 2048); + // min_rpcs_in_flight == rank_n() // possibly nodes (technically 1) + // + // furthermore if num_nodes == 1 there will be 0 internode blocks + + // start calcs with min limits + size_t sz = sizeof(T) + sizeof(thread_num_t); + size_t inter_dest_store_size = split_rank::num_nodes(); + size_t intra_dest_store_size = split_rank::num_threads(); + size_t res_size = split_rank::num_threads(); + size_t min_inter_rpcs_in_flight = 2 * inter_dest_store_size + 16; // every 2 * inter dest store + 16 + size_t min_reservations = 1 + split_rank::num_nodes() / split_rank::num_threads(); // 1 + nodes/(cores/node) + size_t min_intra_rpcs_in_flight = + intra_dest_store_size + min_reservations * res_size + 16; // every dest store + a few reservations + 16 + + if (split_rank::num_nodes() == 1) { + sz = sizeof(T); + inter_dest_store_size = 0; + res_size = 0; + min_inter_rpcs_in_flight = 0; + min_intra_rpcs_in_flight = intra_dest_store_size + 16; + } + + size_t num_blocks = 0; + size_t min_rpcs_in_flight = min_intra_rpcs_in_flight + min_inter_rpcs_in_flight; + size_t rpcs_in_flight = min_rpcs_in_flight * 8; + if (min_rpcs_in_flight > 2 * max_rpcs) { + rpcs_in_flight = min_rpcs_in_flight; // min == max and it will exceed max_rpcs + } else if (rpcs_in_flight > 2 * max_rpcs) { + rpcs_in_flight = 2 * max_rpcs; // reduce the starting max rpc + } + + DBG("optimizing max_bytes=", get_size_str(max_bytes), " min_rpcs=", min_rpcs_in_flight, " inter=", inter_dest_store_size, + " intra=", intra_dest_store_size, "\n"); + if (max_bytes >= 2 * sz * (min_rpcs_in_flight + inter_dest_store_size + intra_dest_store_size)) { + // start with large blocks and max rpcs in flight + // decrease rpcs_in_flight to 2* minimums + // decrease block size to 16KB + // decrease rpcs_in_flight to minimum + // decrease block size further. 
+ size_t target_min_mem = 16 * ONE_KB - 64; // still fast but below gets noticibly slower + size_t mem_per_block = 128 * ONE_KB - 64; // initial best case + count_per_block = (mem_per_block + sz - 1) / sz; + DBG("optimizing mem_per_block=", get_size_str(mem_per_block), " rpcs=", rpcs_in_flight, " count_per_block=", count_per_block, + " block_size=", get_size_str(count_per_block * sz), "\n"); + do { + num_blocks = rpcs_in_flight + inter_dest_store_size + intra_dest_store_size; + if (sz * num_blocks * count_per_block < max_bytes) break; + size_t try_mem = 3 * mem_per_block / 4; // reduce to 75% + size_t try_rpcs = 3 * rpcs_in_flight / 4; // reduce to 75% + if (try_rpcs > 2 * min_rpcs_in_flight) { // first reduce in-flight to 2*minimum + rpcs_in_flight = try_rpcs; + } else if (try_mem > target_min_mem) { // next reduce count to target minimum + rpcs_in_flight = 2 * min_rpcs_in_flight; + mem_per_block = try_mem; + } else if (try_rpcs > min_rpcs_in_flight) { // next reduce in-flight to minimum + mem_per_block = target_min_mem; + rpcs_in_flight = try_rpcs; + } else { // lastly reduce block size below the target_min_mem + rpcs_in_flight = min_rpcs_in_flight; + mem_per_block = (mem_per_block > ONE_KB) ? (3 * mem_per_block / 4) : (mem_per_block / 2); + } + count_per_block = (mem_per_block + sz - 1) / sz; + DBG("optimizing mem_per_block=", get_size_str(mem_per_block), " rpcs=", rpcs_in_flight, + " count_per_block=", count_per_block, "\n"); + } while (count_per_block > 1); + } else { + count_per_block = 1; + } + + if (count_per_block <= 1) { + // no allocation - just direct rpcs + count_per_block = 1; + inter_dest_store_size = 0; + intra_dest_store_size = 0; + num_blocks = 0; + rpcs_in_flight = max_bytes / sz < max_rpcs ? 
max_bytes / sz : max_rpcs; + } else { + num_blocks = rpcs_in_flight + inter_dest_store_size + intra_dest_store_size; + } + + // All of these must still be true + assert(count_per_block * sizeof(T) * num_blocks <= max_bytes); + assert(count_per_block >= 1); + + if (num_blocks > 1) { + // calculate the inter and intra block counts + double inter_fraction = .75; // 75% of the extra blocks go to inter stores + assert(min_inter_rpcs_in_flight + min_intra_rpcs_in_flight == min_rpcs_in_flight); + assert(min_inter_rpcs_in_flight + min_intra_rpcs_in_flight <= rpcs_in_flight); + + if (split_rank::num_nodes() > 1) { + num_inter_blocks = + inter_dest_store_size + min_inter_rpcs_in_flight + (rpcs_in_flight - min_rpcs_in_flight) * inter_fraction; + } else { + num_inter_blocks = 0; + inter_fraction = 0.0; + } + num_intra_blocks = + intra_dest_store_size + min_intra_rpcs_in_flight + (rpcs_in_flight - min_rpcs_in_flight) * (1.0 - inter_fraction); + assert(num_blocks >= num_inter_blocks + num_intra_blocks); + + } else { + num_intra_blocks = num_inter_blocks = 0; + } + SOUT("Calculated optimal num and block size for ", split_rank::num_nodes(), " internode sets and ", + (size_t)split_rank::num_threads(), " intranode ranks per node\n"); + SOUT("Found optimal TwoTierAggrStore of num_intra_blocks=", num_intra_blocks, " num_inter_blocks=", num_inter_blocks, + " count_per_block=", count_per_block, " (", get_size_str(count_per_block * sz), " per block, ", + get_size_str((num_inter_blocks + num_intra_blocks) * count_per_block * sz), ")\n"); + } + + void set_size(size_t max_store_bytes) { + DBG("TwoTierAggrStore::set_size(", max_store_bytes, ")\n"); + + size_t count_per_block = 0, num_intra_blocks = 0, num_inter_blocks = 0; + optimal_num_blocks_and_count_per(max_store_bytes, max_rpcs_in_flight, num_intra_blocks, num_inter_blocks, count_per_block); + assert(count_per_block > 0); + + if (count_per_block <= 1) { + // no reason for delay and storage of 1 entry (i.e. 
small max mem at large scale), still uses max_rpcs_in_flight + max_store_size = 0; + num_intra_blocks = 0; + num_inter_blocks = 0; + count_per_block = 1; // will send single rpcs + if (max_store_bytes > 0) { + // not intentionally disabled + SWARN("Using no TwoTierAggrStore to aggregate messages because no configutation works with less than ", + get_size_str(max_store_bytes), " at this scale\n"); + } + } else { + max_store_size = count_per_block; + } + + size_t per_intra_rpc_bytes = count_per_block * sizeof(T); + size_t per_inter_rpc_bytes = count_per_block * (sizeof(T) + sizeof(thread_num_t)); + size_t total_blocks = num_intra_blocks + num_inter_blocks; + + if (num_intra_blocks == 0) { + // no dest stores will be used intra or inter + assert(num_inter_blocks == 0); + assert(max_store_size == 0); + assert(count_per_block == 1); + num_intra_blocks = 0; + num_inter_blocks = 0; + } + + node_num_t nodes = split_rank::num_nodes(); + // always have intra node + + SOUT("Establishing ", description, " intra dest stores\n"); + intra_fixed_memory_store->set_fixed_mem(num_intra_blocks, count_per_block, split_rank::num_threads(), false, + nodes == 1 ? 
0 : split_rank::num_threads()); + + SOUT("Establishing ", description, " inter dest stores\n"); + if (nodes == 1) { + // special case for single node with no internode rpcs needed + if (rank_n() > 1) assert(num_inter_blocks == 0); + inter_fixed_memory_store->set_fixed_mem(0, 1, 1, false, 0); + } else { + if (num_inter_blocks > 0) + assert(num_inter_blocks >= split_rank::num_nodes() * 3); // room for dest store, 1 reservation and 1 in flight + inter_fixed_memory_store->set_fixed_mem(num_inter_blocks, count_per_block, split_rank::num_nodes(), true, + split_rank::num_nodes()); + } + + SOUT("Using a ", description, " store of max ", + get_size_str(num_intra_blocks * per_intra_rpc_bytes + num_inter_blocks * per_inter_rpc_bytes), + " per target rank, giving max ", max_store_size, " of ", get_size_str(sizeof(T)), "/", + get_size_str(sizeof(T) + sizeof(thread_num_t)), " entries per target rank (", get_size_str(per_intra_rpc_bytes), "/", + get_size_str(per_inter_rpc_bytes), ", ", get_size_str(per_intra_rpc_bytes * num_intra_blocks), "/", + get_size_str(per_inter_rpc_bytes * num_inter_blocks), ") and ", max_rpcs_in_flight, " rpcs in flight\n"); + } + + // true only if there are no element stored and no rpcs in flight + inline bool empty() const { + return track_intra_rpcs->empty() && intra_fixed_memory_store->empty() && track_inter_rpcs->empty() && + inter_fixed_memory_store->empty(); + } + + void clear() { + DBG("TwoTierAggrStore::clear()\n"); + inter_fixed_memory_store->clear_dest_stores(); + intra_fixed_memory_store->clear_dest_stores(); + if (!empty()) DIE("clear() called on a non-empty TwoTierAggrStore!\n"); + track_inter_rpcs->clear(); + inter_fixed_memory_store->clear(); + track_intra_rpcs->clear(); + intra_fixed_memory_store->clear(); + t_prog.print_out(); + t_process_local().print_out(); + Timings::wait_pending(); + assert(intra_fixed_memory_store->empty()); + assert(inter_fixed_memory_store->empty()); + assert(!intra_fixed_memory_store->valid()); + 
assert(!inter_fixed_memory_store->valid()); + barrier(); + } + + bool update(intrank_t target_rank, const T &elem) { + static size_t update_count = 0; + bool ret = add_to_dest_store(target_rank, elem); + update_count++; + bool progress_is_required = my_progress_required(); + if (update_count % (progress_is_required ? 32 : 4096) == 0) { + my_progress(); + } + return ret; + } + + void flush_inter_updates() { + if (split_rank::num_nodes() == 1) { + if (!inter_fixed_memory_store->empty()) DIE("flush_inter_updates called when there is only 1 node from split_rank!\n"); + return; + } + Timer timer(description + "-TwoTierAggrStore::flush_inter_updates"); + DBG("flushing inter updates...\n"); + + // first flush inter node stores + size_t num_inter_dest = split_rank::num_nodes() == 1 ? 0 : split_rank::num_nodes(); + if (num_inter_dest == 0) assert(inter_fixed_memory_store->empty()); + for (node_num_t _node = 0; _node < num_inter_dest; _node++) { + node_num_t node = (_node + 1 + split_rank::get_my_node()) % + split_rank::num_nodes(); // rotate the flushes, starting with the next node in the job + if (max_store_size > 0) { + inter_global_block_t &gblock = inter_fixed_memory_store->dest_store(node); + if (node == split_rank::get_my_node()) { + assert(gblock.second == 0); + } + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + if (gblock.second > 0) { + update_remote_inter(split_rank::get_rank_from_node(node), gblock); + } + } + } + my_progress(); + DBG("all my internode data send rpcs have been sent\n"); + } + + void flush_intra_updates() { + Timer timer(description + "-TwoTierAggrStore::flush_intra_updates"); + DBG("flushing intra updates...\n"); + + for (thread_num_t _thread = 0; _thread < split_rank::num_threads(); _thread++) { + thread_num_t thread = (_thread + 1 + split_rank::get_my_thread()) % + split_rank::num_threads(); // rotate the flushes starting with the next thread + if (max_store_size > 0) { + intra_global_block_t &gblock = 
intra_fixed_memory_store->dest_store(thread); + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + if (gblock.second > 0) { + update_remote_intra(split_rank::get_rank_from_thread(thread), gblock); + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + } + } + } + DBG(__func__, " my_progress\n"); + my_progress(); + DBG("all my intranode data send rpcs have been sent\n"); + } + + static void flush_intra_updates_with_res(intra_reservation_t &reservation, track_rpcs_t &track_intra_rpcs, + intra_fixed_memory_t &intra_fixed_mem, FuncDistObj &func) { + assert(reservation); + int count_flushed = 0; + if (!reservation->empty()) { + // flush the most full dest_stores first, stopping at 1/4 capacity + vector rank_counts; + rank_counts.reserve(split_rank::num_threads()); + for (thread_num_t thread = 0; thread < split_rank::num_threads(); thread++) { + size_t s = intra_fixed_mem->dest_store(thread).second; + assert(s < (1ull << 32)); + if (s >= intra_fixed_mem->get_count_per_block() / 4) { + s = (s << 32) | thread; // combine count in high bits, thread in low bits + rank_counts.push_back(s); + } + } + if (!rank_counts.empty()) { + // sort ascending by count, then thread + std::sort(rank_counts.begin(), rank_counts.end()); + } + while (!reservation->empty() && !rank_counts.empty()) { + size_t r_c = rank_counts.back(); + rank_counts.pop_back(); + size_t _thread = r_c & 0xffffffff; + thread_num_t thread = _thread; + size_t count = (r_c >> 32) & 0xffffffff; + intra_global_block_t &gblock = intra_fixed_mem->dest_store(thread); + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + update_remote_intra_nb(split_rank::get_rank_from_thread(thread), gblock, track_intra_rpcs, intra_fixed_mem, reservation, + func); + assert(gblock.first); + assert(gblock.first.where() == rank_me()); + count_flushed++; + } + } + DBG("flush_intra_updates with reservation flushed ", count_flushed, " intra stores\n"); + } + + void flush_updates() { + 
BarrierTimer timer(description + "-TwoTierAggrStore::flush_updates", false); + DBG("flushing updates...\n"); + + flush_inter_updates(); + + // pre-emptively flush intra_stores + // create a (possibly small) reservation of intra blocks + intra_reservation_t res = make_shared >(); + // clear dest stores so global_dispatcher can be empty() + inter_fixed_memory_store->clear_dest_stores(); + StallTimer is_inter_stalled(description + "-flush_updates-inter-store-empty"); + do { + is_inter_stalled.check(); + my_progress(); + while (res->size() < split_rank::num_threads() && + intra_fixed_memory_store->global_available_size() > split_rank::num_threads()) { + auto fut_gblock = intra_fixed_memory_store->pop_global(); + if (!fut_gblock.is_ready()) DIE("Invalid state - there were available blocks but just popped one not ready!\n"); + res->push_back(fut_gblock.result()); + } + if (!res->empty()) { + flush_intra_updates_with_res(res, track_intra_rpcs, intra_fixed_memory_store, func); + } + } while (!inter_fixed_memory_store->empty()); + // replace the temporary intra blocks within the reservation and destroy it + intra_fixed_memory_store->release_reservation(res); + assert(res->empty()); + res.reset(); + track_inter_rpcs->flush(0); + + DBG("all my data send rpcs returned and global blocks have returned too\n"); + + { + BarrierTimer timer2(description + "-TwoTierAggrStore::flush_updates after inter-node", split_rank::num_nodes() > 1); + // now all threads have received all inter node rpcs. Flush last internode RPCs that may not have been processed yet + StallTimer is_inter_stalled_again(description + "-flush_updates-inter-store-empty-again"); + do { + is_inter_stalled_again.check(); + my_progress(); + } while (!inter_fixed_memory_store->empty()); + assert(inter_fixed_memory_store->empty()); // should remain empty + + // now all threads have received all inter node rpcs.
Flush last intra node stores + // last flush intra node stores + flush_intra_updates(); + + // clear dest stores so global_dispatcher can be empty() + intra_fixed_memory_store->clear_dest_stores(); + StallTimer is_intra_stalled(description + "-flush_updates-intra-store-empty"); + do { + is_intra_stalled.check(); + assert(inter_fixed_memory_store->empty()); // should remain empty + my_progress(); + } while (!intra_fixed_memory_store->empty()); + assert(inter_fixed_memory_store->empty()); // should still be empty + track_intra_rpcs->flush(0); + } // implicit barrier from BarrierTimer timer2 + DBG(__func__, " last my_progress\n"); + my_progress(); + + assert(inter_fixed_memory_store->empty()); + assert(intra_fixed_memory_store->empty()); + DBG("Done with flush_updates\n"); + + // restore dest_stores for next round + if (intra_fixed_memory_store->get_count_per_block() > 1) { + intra_fixed_memory_store->set_dest_stores(split_rank::num_threads()); + } + // restore dest_stores for next round + if (inter_fixed_memory_store->get_count_per_block() > 1 && split_rank::num_nodes() > 1) { + inter_fixed_memory_store->set_dest_stores(split_rank::num_nodes()); + } + // barrier at exit from BarrierTimer + } +}; + +}; // namespace upcxx_utils diff --git a/upcxx-utils/src/limit_outstanding.cpp b/upcxx-utils/src/limit_outstanding.cpp index ed08bf9..e65d063 100644 --- a/upcxx-utils/src/limit_outstanding.cpp +++ b/upcxx-utils/src/limit_outstanding.cpp @@ -23,7 +23,7 @@ upcxx::future<> upcxx_utils::collapse_outstanding_futures(int limit, LimitedFutu while (outstanding_queue.size() > limit) { auto fut = outstanding_queue.front(); outstanding_queue.pop_front(); - if (!fut.ready()) returned_future = upcxx::when_all(fut, returned_future); + if (!fut.is_ready()) returned_future = upcxx::when_all(fut, returned_future); } DBG("limit=", limit, " outstanding=", outstanding_queue.size(), " max_check=", max_check, "\n"); if (limit == 0) { @@ -31,24 +31,24 @@ upcxx::future<> 
upcxx_utils::collapse_outstanding_futures(int limit, LimitedFutu } else { assert(outstanding_queue.size() <= limit); int i = 0; - while (i < max_check && !returned_future.ready() && i < outstanding_queue.size()) { + while (i < max_check && !returned_future.is_ready() && i < outstanding_queue.size()) { // find a ready future in the queue to swap with auto &test_fut = outstanding_queue[i++]; - if (test_fut.ready()) { + if (test_fut.is_ready()) { std::swap(test_fut, returned_future); - assert(returned_future.ready()); + assert(returned_future.is_ready()); break; } } } } - DBG("limit=", limit, " outstanding=", outstanding_queue.size(), " max_check=", max_check, ", ret=", returned_future.ready(), + DBG("limit=", limit, " outstanding=", outstanding_queue.size(), " max_check=", max_check, ", ret=", returned_future.is_ready(), "\n"); return returned_future; } void upcxx_utils::add_outstanding_future(upcxx::future<> fut, LimitedFutureQueue &outstanding_queue) { - if (!fut.ready()) outstanding_queue.push_back(fut); + if (!fut.is_ready()) outstanding_queue.push_back(fut); } upcxx::future<> upcxx_utils::limit_outstanding_futures(int limit, LimitedFutureQueue &outstanding_queue) { @@ -62,7 +62,7 @@ upcxx::future<> upcxx_utils::limit_outstanding_futures(upcxx::future<> fut, int if (outstanding_queue.empty()) return fut; return upcxx::when_all(collapse_outstanding_futures(limit, outstanding_queue), fut); } - if (fut.ready()) { + if (fut.is_ready()) { if (outstanding_queue.size() <= limit) return fut; } else { outstanding_queue.push_back(fut); diff --git a/upcxx-utils/src/ofstream.cpp b/upcxx-utils/src/ofstream.cpp index cd311ac..784820f 100644 --- a/upcxx-utils/src/ofstream.cpp +++ b/upcxx-utils/src/ofstream.cpp @@ -841,7 +841,7 @@ dist_ofstream::~dist_ofstream() { if (!is_closed) close(); assert(is_closed); stringstream().swap(ss); - DBG_VERBOSE("close_fut=", close_fut.ready(), "\n"); + DBG_VERBOSE("close_fut=", close_fut.is_ready(), "\n"); } void dist_ofstream::close() { diff 
--git a/upcxx-utils/src/promise_collectives.cpp b/upcxx-utils/src/promise_collectives.cpp index fd9ee7d..9f059cb 100644 --- a/upcxx-utils/src/promise_collectives.cpp +++ b/upcxx-utils/src/promise_collectives.cpp @@ -115,14 +115,14 @@ upcxx_utils::PromiseBarrier::~PromiseBarrier() { DBG_VERBOSE("Destroy this=", this, " move=", moved, "\n"); if (moved) return; // invalidated assert(upcxx::master_persona().active_with_caller()); - assert(dist_workflow->initiated_prom.get_future().ready()); + assert(dist_workflow->initiated_prom.get_future().is_ready()); get_future().wait(); } void upcxx_utils::PromiseBarrier::fulfill() const { DBG_VERBOSE("fulfill this=", this, "\n"); assert(upcxx::master_persona().active_with_caller()); - assert(!dist_workflow->initiated_prom.get_future().ready()); + assert(!dist_workflow->initiated_prom.get_future().is_ready()); dist_workflow->initiated_prom.fulfill_anonymous(1); } diff --git a/upcxx-utils/src/reduce_prefix.cpp b/upcxx-utils/src/reduce_prefix.cpp index ecd0aa6..0ca447e 100644 --- a/upcxx-utils/src/reduce_prefix.cpp +++ b/upcxx-utils/src/reduce_prefix.cpp @@ -124,8 +124,8 @@ future<> binary_tree_steps::get_future() const { // up phase is done bool binary_tree_steps::up_ready() const { - return dst_is_partial_left_me.get_future().ready() && scratch_is_partial_right.get_future().ready() && - scratch_is_partial_to_parent.get_future().ready() && sent_partial_to_parent.get_future().ready(); + return dst_is_partial_left_me.get_future().is_ready() && scratch_is_partial_right.get_future().is_ready() && + scratch_is_partial_to_parent.get_future().is_ready() && sent_partial_to_parent.get_future().is_ready(); } future<> binary_tree_steps::get_up_future() const { @@ -135,8 +135,8 @@ future<> binary_tree_steps::get_up_future() const { // down phase is done bool binary_tree_steps::down_ready() const { - return scratch_is_partial_from_parent.get_future().ready() && sent_left_child.get_future().ready() && - sent_right_child.get_future().ready(); + 
return scratch_is_partial_from_parent.get_future().is_ready() && sent_left_child.get_future().is_ready() && + sent_right_child.get_future().is_ready(); } future<> binary_tree_steps::get_down_future() const { diff --git a/upcxx-utils/src/timers.cpp b/upcxx-utils/src/timers.cpp new file mode 100644 index 0000000..467c4b0 --- /dev/null +++ b/upcxx-utils/src/timers.cpp @@ -0,0 +1,698 @@ +#include +#include +#include +#include + +#define _TIMERS_CPP +#include "upcxx_utils/timers.hpp" + +using upcxx::future; + +namespace upcxx_utils { + +// Reduce compile time by making templates instantiations of common types +// these are each constructed in CMakeLists.txt and timers-extern-template.in.cpp +// extern templates declarations all happen in timers.hpp + +/* + * This is now handled by CMakeLists.txt + * + MACRO_MIN_SUM_MAX(float, template); + MACRO_MIN_SUM_MAX(double, template); + MACRO_MIN_SUM_MAX(int64_t, template); + MACRO_MIN_SUM_MAX(uint64_t, template); + MACRO_MIN_SUM_MAX(int, template); + + */ + +// +// Timings +// + +future<> &Timings::get_last_pending() { + static future<> _ = make_future(); + return _; +} + +Timings::Timings() + : t() + , before_elapsed(0.0) + , after_elapsed(0.0) + , reduction_elapsed(0.0) + , my_count(0) + , my_instance(0) {} + +future<> Timings::get_pending() { return get_last_pending(); } + +void Timings::set_pending(future<> fut) { get_last_pending() = when_all(get_last_pending(), fut); } + +void Timings::wait_pending() { + DBG_VERBOSE(__func__, "\n"); + if (upcxx::initialized()) { + get_last_pending().wait(); + get_last_pending() = make_future(); + } +} + +string Timings::to_string(bool print_count, bool print_label) const { + ostringstream os; + if (print_label) os << "(min/my/avg/max, bal) "; + os << std::setprecision(2) << std::fixed; + // print the timing metrics + auto &before_max = before_msm.max; + auto &before_min = before_msm.min; + auto &before_sum = before_msm.sum; + if (before_max > 0.0) { + double bal = (before_max > 0.0 ? 
before_sum / rank_n() / before_max : 1.0); + if (before_max > 10.0 && bal < .9) os << KLRED; // highlight large imbalances + os << before_min << "/" << before_elapsed << "/" << before_sum / rank_n() << "/" << before_max << " s, " << bal; + if (before_max > 1.0 && bal < .9) os << KLCYAN; + } else { + os << "0/0/0/0 s, 1.00"; + } + + os << std::setprecision(1) << std::fixed; + + auto &after_max = after_msm.max; + auto &after_min = after_msm.min; + auto &after_sum = after_msm.sum; + // print the timings around a barrier if they are significant + if (after_max >= 0.1) { + os << (after_max > 1.0 ? KLRED : "") << " barrier " << after_min << "/" << after_elapsed << "/" << after_sum / rank_n() << "/" + << after_max << " s, " << (after_max > 0.0 ? after_sum / rank_n() / after_max : 0.0) << (after_max > 1.0 ? KLCYAN : ""); + } else if (after_max > 0.0) { + os << std::setprecision(2) << std::fixed; + os << " barrier " << after_max << " s"; + os << std::setprecision(1) << std::fixed; + } + + auto &count_max = count_msm.max; + auto &count_min = count_msm.min; + auto &count_sum = count_msm.sum; + // print the max_count if it is more than 1 or more than 0 if asked to print the count + if (count_max > (print_count ? 0.0 : 1.00001)) + os << " count " << count_min << "/" << my_count << "/" << count_sum / rank_n() << "/" << count_max << ", " + << (count_max > 0.0 ? count_sum / rank_n() / count_max : 0.0); + + auto &instance_max = instance_msm.max; + auto &instance_min = instance_msm.min; + auto &instance_sum = instance_msm.sum; + // print the instances if it is both non-zero and not 1 per rank + if (instance_sum > 0 && ((int)(instance_sum + 0.01)) != rank_n() && ((int)(instance_sum + 0.99)) != rank_n()) + os << " inst " << instance_min << "/" << my_instance << "/" << instance_sum / rank_n() << "/" << instance_max << ", " + << (instance_max > 0.0 ? 
instance_sum / rank_n() / instance_max : 0.0); + // print the reduction timings if they are significant + if (reduction_elapsed > 0.05) + os << (reduction_elapsed > .5 ? KLRED : "") << " reduct " << reduction_elapsed << (reduction_elapsed > .5 ? KLCYAN : ""); + return os.str(); +} + +void Timings::set_before(Timings &timings, size_t count, double elapsed, size_t instances) { + DBG_VERBOSE("set_before: my_count=", count, " my_elapsed=", elapsed, " instances=", instances, "\n"); + timings.before = std::chrono::high_resolution_clock::now(); + + timings.my_count = count; + timings.count_msm.reset(timings.my_count); + + timings.before_elapsed = elapsed; + timings.before_msm.reset(elapsed); + + timings.my_instance = instances; + timings.instance_msm.reset(instances); +} + +// timings must remain in scope until the returned future is ready (the continuation holds a reference to it) +future<> Timings::set_after(const upcxx::team &team, Timings &timings, + std::chrono::time_point t_after) { + timings.after = t_after; + duration_seconds interval = timings.after - timings.before; + timings.after_elapsed = interval.count(); + timings.after_msm.reset(timings.after_elapsed); + DBG_VERBOSE("set_after: ", interval.count(), "\n"); + + // time the reductions + timings.t = t_after; + + assert(&timings.instance_msm == &timings.before_msm + 3); // memory is in order + auto fut_msms = min_sum_max_reduce_all(&timings.before_msm, &timings.before_msm, 4, team); + auto ret = fut_msms.then([&timings]() { + duration_seconds interval = std::chrono::high_resolution_clock::now() - timings.t; + timings.reduction_elapsed = interval.count(); + DBG_VERBOSE("Finished reductions:, ", interval.count(), "\n"); + }); + + set_pending(when_all(ret, get_pending())); + return ret; +} + +// barrier and reduction +Timings Timings::barrier(const upcxx::team &team, size_t count, double elapsed, size_t instances) { + DBG("Timings::barrier(", count, ", ", elapsed, ", ", instances, ")\n"); + Timings timings; + set_before(timings, count, elapsed, 
instances); + upcxx::barrier(team); + progress(); // explicitly make progress after the barrier if the barrier itself was already ready() + auto fut = set_after(team, timings); + wait_pending(); + assert(fut.is_ready()); + return timings; +} + +void Timings::print_barrier_timings(const upcxx::team &team, string label) { + Timings timings = barrier(team, 0, 0, 0); + wait_pending(); + SLOG_VERBOSE(KLCYAN, "Timing ", label, ":", timings.to_string(), KNORM, "\n"); +} + +// no barrier but a future reduction is started +future Timings::reduce(const upcxx::team &team, size_t count, double elapsed, size_t instances) { + DBG("Timings::reduce(", count, ", ", elapsed, ", ", instances, ")\n"); + auto timings = make_shared(); + set_before(*timings, count, elapsed, instances); + auto future_reduction = set_after(team, *timings, timings->before); // after == before, so no barrier info will be output + return when_all(make_future(timings), future_reduction, get_pending()); +} + +void Timings::print_reduce_timings(const upcxx::team &team, string label) { + future fut_timings = reduce(team, 0, 0, 0); + auto fut = when_all(fut_timings, get_pending()).then([label = std::move(label)](ShTimings shptr_timings) { + SLOG_VERBOSE(KLCYAN, "Timing ", label, ": ", shptr_timings->to_string(), "\n", KNORM); + }); + set_pending(fut); +} + +// +// BaseTimer +// + +size_t &BaseTimer::instance_count() { + static size_t _ = 0; + return _; +} + +void BaseTimer::increment_instance() { ++instance_count(); } +void BaseTimer::decrement_instance() { instance_count()--; } +size_t BaseTimer::get_instance_count() { return instance_count(); } + +BaseTimer::BaseTimer() + : t() + , name() + , t_elapsed(0.0) + , count(0) {} + +BaseTimer::BaseTimer(const string &_name) + : t() + , name(_name) + , t_elapsed(0.0) + , count(0) {} + +BaseTimer::~BaseTimer() {} + +void BaseTimer::clear() { + t = timepoint_t(); + t_elapsed = 0.0; + count = 0; +} + +void BaseTimer::start() { + assert(t == timepoint_t()); + t = now(); +} 
+ +void BaseTimer::stop() { + double elapsed = get_elapsed_since_start(); + t = timepoint_t(); // reset to 0 + // DBG("stop(", name, ", inst=", get_instance_count(), "): ", elapsed, " s, ", now_str(), "\n"); + t_elapsed += elapsed; + count++; +} + +double BaseTimer::get_elapsed() const { return t_elapsed; } + +double BaseTimer::get_elapsed_since_start() const { + assert(t != timepoint_t()); + duration_seconds interval = now() - t; + return interval.count(); +} + +size_t BaseTimer::get_count() const { return count; } + +const string &BaseTimer::get_name() const { return name; } + +void BaseTimer::done() const { + assert(t == timepoint_t()); + SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", std::setprecision(2), std::fixed, t_elapsed, " s ", KNORM, "\n"); + DBG(name, " took ", std::setprecision(2), std::fixed, t_elapsed, " s ", "\n"); +} + +future> BaseTimer::done_all_async(const upcxx::team &tm) const { + assert(t == timepoint_t()); + auto msm_fut = upcxx_utils::min_sum_max_reduce_one(t_elapsed, 0, tm); + DBG(name, " took ", t_elapsed, " \n"); + auto name_copy = name; + msm_fut = msm_fut.then([name_copy](MinSumMax msm) { + SLOG_VERBOSE(KLCYAN, "Timing ", name_copy, ": ", msm, KNORM, "\n"); + return msm; + }); + Timings::set_pending(msm_fut.then([](MinSumMax) {})); + return msm_fut; +} +void BaseTimer::done_all(const upcxx::team &tm) const { done_all_async(tm).wait(); } + +string BaseTimer::get_final() const { + ostringstream os; + os << name << ": " << std::setprecision(2) << std::fixed << t_elapsed << " s"; + if (count > 1) os << " " << count << " count"; + return os.str(); +} + +future> BaseTimer::reduce_timepoint(const upcxx::team &team, timepoint_t timepoint) { + duration_seconds secs = timepoint.time_since_epoch(); + DBG_VERBOSE("reduce_timepoint ", secs.count(), " since epoch\n"); + future> fut_msm = min_sum_max_reduce_one(secs.count(), 0, team); + return fut_msm.then([&team](MinSumMax msm) { + duration_seconds interval; + if (team.rank_me()) return msm; + // 
translate to seconds since the first rank entered + msm.my = msm.my - msm.min; + msm.max = msm.max - msm.min; + msm.sum = msm.sum - msm.min * team.rank_n(); + msm.min = 0.0; + msm.apply_avg(team); + return msm; + }); +} + +future BaseTimer::reduce_timings(const upcxx::team &team, size_t my_instances) const { + return reduce_timings(team, count, t_elapsed, my_instances); +} + +future BaseTimer::reduce_timings(const upcxx::team &team, size_t my_count, double my_elapsed, size_t my_instances) { + return Timings::reduce(team, my_count, my_elapsed, my_instances); +} + +Timings BaseTimer::barrier_timings(const upcxx::team &team, size_t my_instances) const { + return barrier_timings(team, count, t_elapsed, my_instances); +} + +Timings BaseTimer::barrier_timings(const upcxx::team &team, size_t my_count, double my_elapsed, size_t my_instances) { + return Timings::barrier(team, my_count, my_elapsed, my_instances); +} + +timepoint_t BaseTimer::now() { return std::chrono::high_resolution_clock::now(); } + +string BaseTimer::now_str() { + std::time_t result = std::time(nullptr); + char buffer[100]; + size_t sz = strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", std::localtime(&result)); + return string(sz > 0 ? 
buffer : "BAD TIME"); +} + +// +// StallTimer +// + +StallTimer::StallTimer(const string _name, double _max_seconds, int64_t _max_count) + : BaseTimer(_name) + , max_seconds(_max_seconds) + , max_count(_max_count) { + start(); +} + +StallTimer::~StallTimer() { stop(); } + +void StallTimer::check() { + stop(); + bool print = false; + if (max_seconds > 0.0 && t_elapsed > max_seconds) { + print = true; + } else if (max_count > 0 && count > max_count) { + print = true; + } + if (print) { + WARN("StallTimer - ", name, " on ", rank_me(), " stalled for ", t_elapsed, " s and ", count, " iterations\n"); + max_seconds *= 2.0; + max_count *= 2; + } + start(); +} + +// +// IntermittentTimer +// + +IntermittentTimer::IntermittentTimer(const string &_name, string _interval_label) + : BaseTimer(_name) + , t_interval(0.0) + , interval_label(_interval_label) {} + +IntermittentTimer::~IntermittentTimer() {} + +void IntermittentTimer::clear() { + ((BaseTimer *)this)->clear(); + t_interval = 0.0; + interval_label = ""; +} + +void IntermittentTimer::start_interval() { t_interval = get_elapsed_since_start(); } + +void IntermittentTimer::stop_interval() { + t_interval = get_elapsed_since_start() - t_interval; + if (!interval_label.empty()) { + ostringstream oss; + oss << KBLUE << std::left << std::setw(40) << interval_label << std::setprecision(2) << std::fixed << t_interval << " s" + << KNORM << "\n"; + SLOG(oss.str()); + } +} + +void IntermittentTimer::print_out(const upcxx::team &tm) { + future fut_shptr_timings = reduce_timings(tm); + auto fut = + when_all(Timings::get_pending(), fut_shptr_timings).then([name = this->name, count = this->count](ShTimings shptr_timings) { + if (shptr_timings->count_msm.max > 0.0) + SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", count, " intervals, ", shptr_timings->to_string(true), "\n", KNORM); + }); + Timings::set_pending(fut); + count = 0; + t_elapsed = 0.0; +} + +// +// ProgressTimer +// + +ProgressTimer::ProgressTimer(const string &_name) + : 
BaseTimer(_name) + , calls(0) {} + +ProgressTimer::~ProgressTimer() {} + +void ProgressTimer::clear() { + ((BaseTimer *)this)->clear(); + calls = 0; +} + +void ProgressTimer::progress(size_t run_every) { + if (run_every > 1 && ++calls % run_every != 0) return; + start(); + upcxx::progress(); + stop(); + // DBG("ProgressTimer(", name, ") - ", t_elapsed, "\n"); +} + +void ProgressTimer::discharge(size_t run_every) { + if (run_every > 1 && ++calls % run_every != 0) return; + start(); + upcxx::discharge(); + upcxx::progress(); + stop(); + // DBG("ProgressTimer(", name, ").discharge() - ", t_elapsed, "\n"); +} + +void ProgressTimer::print_out(const upcxx::team &tm) { + future fut_shptr_timings = reduce_timings(tm); + auto fut = when_all(Timings::get_pending(), fut_shptr_timings).then([name = this->name](ShTimings shptr_timings) { + if (shptr_timings->count_msm.max > 0.0) + SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", shptr_timings->to_string(true), KNORM, "\n"); + }); + Timings::set_pending(fut); + count = 0; + t_elapsed = 0.0; +} + +// +// Timer +// +Timer::Timer(const upcxx::team &tm, const string &_name, bool exit_reduction) + : tm(tm) + , exited(exit_reduction) + , logged(false) + , BaseTimer(_name) { + init(); +} +Timer::Timer(const string &_name, bool exit_reduction) + : tm(upcxx::world()) + , exited(exit_reduction) + , logged(false) + , BaseTimer(_name) { + init(); +} +void Timer::init() { + increment_instance(); + auto fut = when_all(Timings::get_pending(), make_future(now_str())).then([name = this->name](string now) {}); + Timings::set_pending(fut); + start(); +} +Timer::Timer(Timer &&move) + : tm(move.tm) + , exited(move.exited), logged(move.logged) + , BaseTimer((BaseTimer &)move) { + move.exited = true; + move.logged = true; +} +Timer &Timer::operator=(Timer &&move) { + Timer mv(std::move(move)); + std::swap(*this, mv); // NOTE(review): std::swap move-assigns, which re-enters this operator (unbounded recursion) - needs a member-wise swap + return *this; +} + +Timer::~Timer() { + if (!exited) + initiate_exit_reduction(); + else if (!logged) { + stop(); + LOG(KLCYAN, "Timing ", name, ":", 
get_elapsed(), KNORM, "\n"); + } +} + +future<> Timer::initiate_entrance_reduction() { + DBG_VERBOSE("Tracking entrance of ", name, "\n"); + auto fut_msm = reduce_timepoint(tm, now()); + + auto fut = when_all(Timings::get_pending(), fut_msm).then([name = this->name](MinSumMax msm) { + DBG_VERBOSE("got reduction: ", msm.to_string(), "\n"); + SLOG_VERBOSE(KLCYAN, "Timing (entrance) ", name, ":", msm.to_string(), KNORM, "\n"); + }); + Timings::set_pending(fut); + return fut; +} + +future<> Timer::initiate_exit_reduction() { + stop(); + future fut_shptr_timings = reduce_timings(tm); + auto fut = when_all(Timings::get_pending(), fut_shptr_timings).then([name = this->name](ShTimings shptr_timings) { + SLOG_VERBOSE(KLCYAN, "Timing ", name, " exit: ", shptr_timings->to_string(), KNORM, "\n"); + }); + Timings::set_pending(fut); + decrement_instance(); + exited = true; + logged = true; + return fut; +} + +// +// BarrierTimer +// + +BarrierTimer::BarrierTimer(const upcxx::team &team, const string _name, bool _entrance_barrier, bool _exit_barrier) + : _team(team) + , exit_barrier(_exit_barrier) + , exited(false) + , BaseTimer(_name) { + init(_entrance_barrier); +} +BarrierTimer::BarrierTimer(const string _name, bool _entrance_barrier, bool _exit_barrier) + : _team(upcxx::world()) + , exit_barrier(_exit_barrier) + , exited(false) + , BaseTimer(_name) { + init(_entrance_barrier); +} + +future<> BarrierTimer::init(bool _entrance_barrier) { + increment_instance(); + if (!_entrance_barrier && !exit_barrier) SLOG_VERBOSE("Why are we using a BarrierTimer without any barriers???\n"); + future<> fut; + DBG("Entering BarrierTimer ", name, "\n"); + if (_entrance_barrier) { + fut = when_all(Timings::get_pending(), make_future(now_str())).then([&name = this->name](string now) { + // SLOG_VERBOSE(KLCYAN, "Timing ", name, ": (entering barrier) ", KNORM); + }); + Timings::set_pending(fut); + auto timings = barrier_timings(_team); + Timings::wait_pending(); // should be noop + 
SLOG_VERBOSE(KLCYAN, "Timing (entrance barrier) ", name, ": ", timings.to_string(), KNORM, "\n"); + } else { + fut = when_all(Timings::get_pending(), make_future(now_str())).then([&name = this->name](string now) {}); + Timings::set_pending(fut); + } + start(); + return fut; +} + +BarrierTimer::~BarrierTimer() { + if (!exited) initate_exit_barrier().wait(); +} +future<> BarrierTimer::initate_exit_barrier() { + stop(); + future<> fut; + DBG("Exiting BarrierTimer ", name, "\n"); + if (exit_barrier) { + fut = when_all(Timings::get_pending(), make_future(now_str())).then([name = this->name](string now) {}); + Timings::set_pending(fut); + auto timings = barrier_timings(_team); + Timings::wait_pending(); + SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", timings.to_string(), KNORM, "\n"); + } else { + future fut_shptr_timings = reduce_timings(_team); + fut = when_all(Timings::get_pending(), fut_shptr_timings).then([name = this->name](ShTimings shptr_timings) { + SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", shptr_timings->to_string(), KNORM, "\n"); + }); + Timings::set_pending(fut); + } + decrement_instance(); + exited = true; + return fut; +} + +// +// AsyncTimer +// + +_AsyncTimer::_AsyncTimer(const upcxx::team &tm, const string &name) + : BaseTimer(name) + , tm(tm) + , construct_t(BaseTimer::now()) + , start_t{} {} +void _AsyncTimer::start() { + start_t = now(); + ((BaseTimer *)this)->start(); +} +void _AsyncTimer::stop() { ((BaseTimer *)this)->stop(); } +void _AsyncTimer::report(const string label, MinSumMax msm) { + SLOG_VERBOSE(KLCYAN, "Timing ", name, " ", label, ":", msm.to_string(), KNORM, "\n"); +} + +future<> _AsyncTimer::initiate_construct_reduction() { + auto fut_msm = BaseTimer::reduce_timepoint(tm, construct_t); + auto fut = when_all(Timings::get_pending(), fut_msm).then([this](MinSumMax msm) { this->report("construct", msm); }); + Timings::set_pending(fut); + return fut; +} +future<> _AsyncTimer::initiate_start_reduction() { + auto fut_msm = 
BaseTimer::reduce_timepoint(tm, start_t); + auto fut = when_all(Timings::get_pending(), fut_msm).then([this](MinSumMax msm) { this->report("start", msm); }); + Timings::set_pending(fut); + return fut; +} +future<> _AsyncTimer::initiate_stop_reduction() { + auto fut_msm = Timings::reduce(tm, 1, get_elapsed(), 1); + auto fut = when_all(Timings::get_pending(), fut_msm).then([this](ShTimings sh_timings) { + this->report("stop", sh_timings->before_elapsed); + }); + Timings::set_pending(fut); + return fut; +} + +AsyncTimer::AsyncTimer(const upcxx::team &tm, const string &name) + : timer(make_shared<_AsyncTimer>(tm, name)) {} +AsyncTimer::AsyncTimer(const string &name) + : timer(make_shared<_AsyncTimer>(upcxx::world(), name)) {} +void AsyncTimer::start() const { timer->start(); } +void AsyncTimer::stop() const { + timer->stop(); + LOG(timer->get_name(), " completed in ", timer->get_elapsed(), " s\n"); +} +double AsyncTimer::get_elapsed() const { return timer->get_elapsed(); } +future<> AsyncTimer::initiate_construct_reduction() { + return timer->initiate_construct_reduction().then([timer = this->timer]() { + // keep timer alive + }); +} +future<> AsyncTimer::initiate_start_reduction() { + return timer->initiate_start_reduction().then([timer = this->timer]() { + // keep timer alive + }); +} +future<> AsyncTimer::initiate_stop_reduction() { + return timer->initiate_stop_reduction().then([timer = this->timer]() { + // keep timer alive + }); +} + +// +// ActiveCountTimer +// + +ActiveCountTimer::ActiveCountTimer(const string _name) + : total_elapsed(0.0) + , total_count(0) + , active_count(0) + , max_active(0) + , name(_name) + , my_fut(make_future()) {} + +ActiveCountTimer::~ActiveCountTimer() { + if (upcxx::initialized()) my_fut.wait(); // keep alive until all futures have finished +} + +void ActiveCountTimer::clear() { + total_elapsed = 0.0; + total_count = 0; + active_count = 0; + max_active = 0; +} + +timepoint_t ActiveCountTimer::begin() { + active_count++; + if 
(max_active < active_count) max_active = active_count; + return BaseTimer::now(); +} + +void ActiveCountTimer::end(timepoint_t t) { + duration_seconds interval = BaseTimer::now() - t; + active_count--; + total_count++; + total_elapsed += interval.count(); +} + +void ActiveCountTimer::print_barrier_timings(const upcxx::team &team, string label) { + Timings timings = BaseTimer::barrier_timings(team, total_count, total_elapsed, max_active); + clear(); + Timings::wait_pending(); + print_timings(timings, label); +} + +void ActiveCountTimer::print_reduce_timings(const upcxx::team &team, string label) { + label = name + label; + auto fut_timings = BaseTimer::reduce_timings(team, total_count, total_elapsed, max_active); + auto _this = this; + auto fut_clear = fut_timings.then([_this](ShTimings ignored) { _this->clear(); }); + auto fut = when_all(Timings::get_pending(), fut_timings, fut_clear).then([_this, label](ShTimings shptr_timings) { + _this->print_timings(*shptr_timings, label); + }); + my_fut = when_all(fut_clear, my_fut, fut); // keep this in scope until clear has been called... + Timings::set_pending(my_fut); +} + +void ActiveCountTimer::print_timings(Timings &timings, string label) { + label = name + label; + DBG_VERBOSE(__func__, " label=", label, "\n"); + if (active_count > 0) + SWARN("print_timings on ActiveCountTimer '", label, "' called while ", active_count, " (max ", max_active, + ") are still active\n"); + if (timings.count_msm.max > 0.0) { + SLOG_VERBOSE(KLCYAN, "Timing instances of ", label, ": ", + (timings.count_msm.max > 0.0 ? 
timings.to_string(true) : string("(none)")), KNORM, "\n"); + } +} + +ActiveCountTimer _GenericActiveCountTimer("_upcxx_dummy"); +GenericInstantiationTimer _GenericInstantiationTimer(_GenericActiveCountTimer); +template class ActiveInstantiationTimer<_upcxx_utils_dummy>; + +SingletonInstantiationTimer _SingletonInstantiationTimer(); +template class InstantiationTimer<_upcxx_utils_dummy>; + +}; // namespace upcxx_utils diff --git a/upcxx-utils/test/test_ofstream.cpp b/upcxx-utils/test/test_ofstream.cpp index adfcd77..8c3183d 100644 --- a/upcxx-utils/test/test_ofstream.cpp +++ b/upcxx-utils/test/test_ofstream.cpp @@ -434,7 +434,7 @@ int run_large_test(int argc, char **argv) { future<> fut; if (i % 2 == 1) { fut = f.close_async(); - assert(!fut.ready()); + assert(!fut.is_ready()); fut = fut.then([t3]() { t3->stop(); }); } else { fut = make_future(); diff --git a/upcxx-utils/test/test_promise_collectives.cpp b/upcxx-utils/test/test_promise_collectives.cpp index db062e7..f9c3129 100644 --- a/upcxx-utils/test/test_promise_collectives.cpp +++ b/upcxx-utils/test/test_promise_collectives.cpp @@ -34,7 +34,7 @@ int test_promise_barrier(int argc, char **argv) { assert(roundup_log2(17) == 5); { PromiseBarrier pb; - assert(!pb.get_future().ready()); + assert(!pb.get_future().is_ready()); pb.fulfill(); pb.get_future().wait(); } @@ -44,10 +44,10 @@ int test_promise_barrier(int argc, char **argv) { barrier(); PromiseBarrier pb1, pb2; barrier(); - assert(!pb1.get_future().ready()); + assert(!pb1.get_future().is_ready()); pb1.fulfill(); barrier(); - assert(!pb2.get_future().ready()); + assert(!pb2.get_future().is_ready()); pb2.fulfill(); barrier(); pb1.get_future().wait(); @@ -60,12 +60,12 @@ int test_promise_barrier(int argc, char **argv) { barrier(); PromiseBarrier pb1, pb2; barrier(); - assert(!pb1.get_future().ready()); + assert(!pb1.get_future().is_ready()); pb1.fulfill(); barrier(); pb1.get_future().wait(); barrier(); - assert(!pb2.get_future().ready()); + 
assert(!pb2.get_future().is_ready()); pb2.fulfill(); barrier(); pb2.get_future().wait(); @@ -76,10 +76,10 @@ int test_promise_barrier(int argc, char **argv) { barrier(); PromiseBarrier pb1, pb2; barrier(); - assert(!pb1.get_future().ready()); + assert(!pb1.get_future().is_ready()); pb1.fulfill(); barrier(); - assert(!pb2.get_future().ready()); + assert(!pb2.get_future().is_ready()); pb2.fulfill(); barrier(); pb2.get_future().wait(); @@ -93,10 +93,10 @@ int test_promise_barrier(int argc, char **argv) { barrier(); PromiseBarrier pb1, pb2; barrier(); - assert(!pb2.get_future().ready()); + assert(!pb2.get_future().is_ready()); pb2.fulfill(); barrier(); - assert(!pb1.get_future().ready()); + assert(!pb1.get_future().is_ready()); pb1.fulfill(); barrier(); pb2.get_future().wait(); @@ -109,12 +109,12 @@ int test_promise_barrier(int argc, char **argv) { barrier(); PromiseBarrier pb1, pb2; barrier(); - assert(!pb2.get_future().ready()); + assert(!pb2.get_future().is_ready()); pb2.fulfill(); barrier(); pb2.get_future().wait(); barrier(); - assert(!pb1.get_future().ready()); + assert(!pb1.get_future().is_ready()); pb1.fulfill(); barrier(); pb1.get_future().wait(); @@ -125,10 +125,10 @@ int test_promise_barrier(int argc, char **argv) { barrier(); PromiseBarrier pb1, pb2; barrier(); - assert(!pb2.get_future().ready()); + assert(!pb2.get_future().is_ready()); pb2.fulfill(); barrier(); - assert(!pb1.get_future().ready()); + assert(!pb1.get_future().is_ready()); pb1.fulfill(); barrier(); pb1.get_future().wait(); @@ -147,13 +147,13 @@ int test_promise_barrier(int argc, char **argv) { for (int i = 0; i < iterations; i++) { fulfill_order[i] = i; wait_order[i] = i; - assert(!pbs[i].get_future().ready()); + assert(!pbs[i].get_future().is_ready()); } std::shuffle(fulfill_order.begin(), fulfill_order.end(), g); barrier(); // initiate all for (int i = 0; i < iterations; i++) { - assert(!pbs[fulfill_order[i]].get_future().ready()); + assert(!pbs[fulfill_order[i]].get_future().is_ready()); 
pbs[fulfill_order[i]].fulfill(); } // wait all @@ -175,14 +175,14 @@ int test_promise_barrier(int argc, char **argv) { for (int i = 0; i < iterations; i++) { fulfill_order[i] = i; wait_order[i] = i; - assert(!pbs[i].get_future().ready()); + assert(!pbs[i].get_future().is_ready()); } std::shuffle(fulfill_order.begin(), fulfill_order.end(), g); std::shuffle(wait_order.begin(), wait_order.end(), g); barrier(); // initiate all for (int i = 0; i < iterations; i++) { - assert(!pbs[fulfill_order[i]].get_future().ready()); + assert(!pbs[fulfill_order[i]].get_future().is_ready()); pbs[fulfill_order[i]].fulfill(); } barrier(); @@ -210,7 +210,7 @@ int test_promise_barrier(int argc, char **argv) { barrier(); // initiate all for (int i = 0; i < iterations; i++) { - assert(!pbs[fulfill_order[i]].get_future().ready()); + assert(!pbs[fulfill_order[i]].get_future().is_ready()); pbs[fulfill_order[i]].fulfill(); } barrier(); @@ -236,7 +236,7 @@ int test_promise_barrier(int argc, char **argv) { barrier(); // initiate all for (int i = 0; i < iterations; i++) { - assert(!pbs[fulfill_order[i]].get_future().ready()); + assert(!pbs[fulfill_order[i]].get_future().is_ready()); pbs[fulfill_order[i]].fulfill(); } barrier(); @@ -261,7 +261,7 @@ int test_promise_barrier(int argc, char **argv) { barrier(); // initiate all for (int i = 0; i < iterations; i++) { - assert(!pbs[fulfill_order[i]].get_future().ready()); + assert(!pbs[fulfill_order[i]].get_future().is_ready()); pbs[fulfill_order[i]].fulfill(); pbs[wait_order[i]].get_future().wait(); } @@ -285,7 +285,7 @@ int test_promise_barrier(int argc, char **argv) { // initiate all future<> all_fut = make_future(); for (int i = 0; i < iterations; i++) { - assert(!pbs[fulfill_order[i]].get_future().ready()); + assert(!pbs[fulfill_order[i]].get_future().is_ready()); pbs[fulfill_order[i]].fulfill(); auto fut = pbs[wait_order[i]].get_future(); all_fut = when_all(all_fut, fut); @@ -310,7 +310,7 @@ int test_promise_barrier(int argc, char **argv) { // 
initiate all future<> all_fut = make_future(); for (int i = 0; i < iterations; i++) { - assert(!pbs[fulfill_order[i]].get_future().ready()); + assert(!pbs[fulfill_order[i]].get_future().is_ready()); pbs[fulfill_order[i]].fulfill(); auto fut = pbs[wait_order[i]].get_future(); all_fut = when_all(all_fut, fut); From 80c29547bc0f061d127b0a326d4d3a3bf2e896ad Mon Sep 17 00:00:00 2001 From: Jan Ciesko Date: Mon, 18 Dec 2023 16:10:38 -0700 Subject: [PATCH 02/13] Check if cmake variable is empty before applying string op --- cmake/Modules/CheckSubmodules.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmake/Modules/CheckSubmodules.cmake b/cmake/Modules/CheckSubmodules.cmake index 50c5583..13fe2c7 100644 --- a/cmake/Modules/CheckSubmodules.cmake +++ b/cmake/Modules/CheckSubmodules.cmake @@ -15,8 +15,9 @@ if(GIT_FOUND AND EXISTS "${PROJECT_SOURCE_DIR}/.git") if(NOT GIT_SUBMOD_RESULT EQUAL "0") message(FATAL_ERROR "git submodule failed with ${GIT_SUBMOD_RESULT}, please checkout submodules") endif() - - string(REPLACE "\n" ";" SUBMOD_LIST ${GIT_SUBMOD_OUTPUT}) + if(NOT GIT_SUBMOD_OUTPUT STREQUAL "") + string(REPLACE "\n" ";" SUBMOD_LIST ${GIT_SUBMOD_OUTPUT}) + endif() set(UPDATE_SUBMODULES "") foreach(tmp ${EXPECTED_SUBMODULES}) set(IS_SUB_OK FALSE) From a8223305916231b92197c848035febef42f0fbae Mon Sep 17 00:00:00 2001 From: Jan Ciesko Date: Mon, 18 Dec 2023 16:23:27 -0700 Subject: [PATCH 03/13] Add missing header --- upcxx-utils/include/upcxx_utils/colors.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/upcxx-utils/include/upcxx_utils/colors.h b/upcxx-utils/include/upcxx_utils/colors.h index f0170c3..9fbee40 100644 --- a/upcxx-utils/include/upcxx_utils/colors.h +++ b/upcxx-utils/include/upcxx_utils/colors.h @@ -24,6 +24,9 @@ * */ +#include +#include + #ifdef CONFIG_USE_COLORS #define KNORM "\x1B[0m" From 98169a7a8da14cb4f473a33cf720384561e42489 Mon Sep 17 00:00:00 2001 From: Jan Ciesko Date: Mon, 18 Dec 2023 16:26:19 -0700 Subject: [PATCH
04/13] Exclude test in proxy version of mhm2 --- test/CMakeLists.txt | 2 + upcxx-utils/src/timers.cpp | 698 ------------------------------------- 2 files changed, 2 insertions(+), 698 deletions(-) delete mode 100644 upcxx-utils/src/timers.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index a4516b2..a95e4ad 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -21,6 +21,8 @@ file( LIST_DIRECTORIES false *.hpp *.cpp) +list(FILTER TEST_SOURCES EXCLUDE REGEX "ssw-test\\.cpp$") + set(SOURCES ${TEST_SOURCES}) include_directories("${CMAKE_SOURCE_DIR}/src") diff --git a/upcxx-utils/src/timers.cpp b/upcxx-utils/src/timers.cpp deleted file mode 100644 index 467c4b0..0000000 --- a/upcxx-utils/src/timers.cpp +++ /dev/null @@ -1,698 +0,0 @@ -#include -#include -#include -#include - -#define _TIMERS_CPP -#include "upcxx_utils/timers.hpp" - -using upcxx::future; - -namespace upcxx_utils { - -// Reduce compile time by making templates instantiations of common types -// these are each constructed in CMakeLists.txt and timers-extern-template.in.cpp -// extern templates declarations all happen in timers.hpp - -/* - * This is now handled by CMakeLists.txt - * - MACRO_MIN_SUM_MAX(float, template); - MACRO_MIN_SUM_MAX(double, template); - MACRO_MIN_SUM_MAX(int64_t, template); - MACRO_MIN_SUM_MAX(uint64_t, template); - MACRO_MIN_SUM_MAX(int, template); - - */ - -// -// Timings -// - -future<> &Timings::get_last_pending() { - static future<> _ = make_future(); - return _; -} - -Timings::Timings() - : t() - , before_elapsed(0.0) - , after_elapsed(0.0) - , reduction_elapsed(0.0) - , my_count(0) - , my_instance(0) {} - -future<> Timings::get_pending() { return get_last_pending(); } - -void Timings::set_pending(future<> fut) { get_last_pending() = when_all(get_last_pending(), fut); } - -void Timings::wait_pending() { - DBG_VERBOSE(__func__, "\n"); - if (upcxx::initialized()) { - get_last_pending().wait(); - get_last_pending() = make_future(); - } -} - -string 
Timings::to_string(bool print_count, bool print_label) const { - ostringstream os; - if (print_label) os << "(min/my/avg/max, bal) "; - os << std::setprecision(2) << std::fixed; - // print the timing metrics - auto &before_max = before_msm.max; - auto &before_min = before_msm.min; - auto &before_sum = before_msm.sum; - if (before_max > 0.0) { - double bal = (before_max > 0.0 ? before_sum / rank_n() / before_max : 1.0); - if (before_max > 10.0 && bal < .9) os << KLRED; // highlight large imbalances - os << before_min << "/" << before_elapsed << "/" << before_sum / rank_n() << "/" << before_max << " s, " << bal; - if (before_max > 1.0 && bal < .9) os << KLCYAN; - } else { - os << "0/0/0/0 s, 1.00"; - } - - os << std::setprecision(1) << std::fixed; - - auto &after_max = after_msm.max; - auto &after_min = after_msm.min; - auto &after_sum = after_msm.sum; - // print the timings around a barrier if they are significant - if (after_max >= 0.1) { - os << (after_max > 1.0 ? KLRED : "") << " barrier " << after_min << "/" << after_elapsed << "/" << after_sum / rank_n() << "/" - << after_max << " s, " << (after_max > 0.0 ? after_sum / rank_n() / after_max : 0.0) << (after_max > 1.0 ? KLCYAN : ""); - } else if (after_max > 0.0) { - os << std::setprecision(2) << std::fixed; - os << " barrier " << after_max << " s"; - os << std::setprecision(1) << std::fixed; - } - - auto &count_max = count_msm.max; - auto &count_min = count_msm.min; - auto &count_sum = count_msm.sum; - // print the max_count if it is more than 1 or more than 0 if asked to print the count - if (count_max > (print_count ? 0.0 : 1.00001)) - os << " count " << count_min << "/" << my_count << "/" << count_sum / rank_n() << "/" << count_max << ", " - << (count_max > 0.0 ? 
count_sum / rank_n() / count_max : 0.0); - - auto &instance_max = instance_msm.max; - auto &instance_min = instance_msm.min; - auto &instance_sum = instance_msm.sum; - // print the instances if it is both non-zero and not 1 per rank - if (instance_sum > 0 && ((int)(instance_sum + 0.01)) != rank_n() && ((int)(instance_sum + 0.99)) != rank_n()) - os << " inst " << instance_min << "/" << my_instance << "/" << instance_sum / rank_n() << "/" << instance_max << ", " - << (instance_max > 0.0 ? instance_sum / rank_n() / instance_max : 0.0); - // print the reduction timings if they are significant - if (reduction_elapsed > 0.05) - os << (reduction_elapsed > .5 ? KLRED : "") << " reduct " << reduction_elapsed << (reduction_elapsed > .5 ? KLCYAN : ""); - return os.str(); -} - -void Timings::set_before(Timings &timings, size_t count, double elapsed, size_t instances) { - DBG_VERBOSE("set_before: my_count=", count, " my_elapsed=", elapsed, " instances=", instances, "\n"); - timings.before = std::chrono::high_resolution_clock::now(); - - timings.my_count = count; - timings.count_msm.reset(timings.my_count); - - timings.before_elapsed = elapsed; - timings.before_msm.reset(elapsed); - - timings.my_instance = instances; - timings.instance_msm.reset(instances); -} - -// timings must remain in scope until the returened future is ready() -future<> Timings::set_after(const upcxx::team &team, Timings &timings, - std::chrono::time_point t_after) { - timings.after = t_after; - duration_seconds interval = timings.after - timings.before; - timings.after_elapsed = interval.count(); - timings.after_msm.reset(timings.after_elapsed); - DBG_VERBOSE("set_after: ", interval.count(), "\n"); - - // time the reductions - timings.t = t_after; - - assert(&timings.instance_msm == &timings.before_msm + 3); // memory is in order - auto fut_msms = min_sum_max_reduce_all(&timings.before_msm, &timings.before_msm, 4, team); - auto ret = fut_msms.then([&timings]() { - duration_seconds interval = 
std::chrono::high_resolution_clock::now() - timings.t; - timings.reduction_elapsed = interval.count(); - DBG_VERBOSE("Finished reductions:, ", interval.count(), "\n"); - }); - - set_pending(when_all(ret, get_pending())); - return ret; -} - -// barrier and reduction -Timings Timings::barrier(const upcxx::team &team, size_t count, double elapsed, size_t instances) { - DBG("Timings::barrier(", count, ", ", elapsed, ", ", instances, ")\n"); - Timings timings; - set_before(timings, count, elapsed, instances); - upcxx::barrier(team); - progress(); // explicitly make progress after the barrier if the barrier itself was already ready() - auto fut = set_after(team, timings); - wait_pending(); - assert(fut.is_ready()); - return timings; -} - -void Timings::print_barrier_timings(const upcxx::team &team, string label) { - Timings timings = barrier(team, 0, 0, 0); - wait_pending(); - SLOG_VERBOSE(KLCYAN, "Timing ", label, ":", timings.to_string(), KNORM, "\n"); -} - -// no barrier but a future reduction is started -future Timings::reduce(const upcxx::team &team, size_t count, double elapsed, size_t instances) { - DBG("Timings::reduce(", count, ", ", elapsed, ", ", instances, ")\n"); - auto timings = make_shared(); - set_before(*timings, count, elapsed, instances); - auto future_reduction = set_after(team, *timings, timings->before); // after == before, so no barrier info will be output - return when_all(make_future(timings), future_reduction, get_pending()); -} - -void Timings::print_reduce_timings(const upcxx::team &team, string label) { - future fut_timings = reduce(team, 0, 0, 0); - auto fut = when_all(fut_timings, get_pending()).then([label = std::move(label)](ShTimings shptr_timings) { - SLOG_VERBOSE(KLCYAN, "Timing ", label, ": ", shptr_timings->to_string(), "\n", KNORM); - }); - set_pending(fut); -} - -// -// BaseTimer -// - -size_t &BaseTimer::instance_count() { - static size_t _ = 0; - return _; -} - -void BaseTimer::increment_instance() { ++instance_count(); } -void 
BaseTimer::decrement_instance() { instance_count()--; } -size_t BaseTimer::get_instance_count() { return instance_count(); } - -BaseTimer::BaseTimer() - : t() - , name() - , t_elapsed(0.0) - , count(0) {} - -BaseTimer::BaseTimer(const string &_name) - : t() - , name(_name) - , t_elapsed(0.0) - , count(0) {} - -BaseTimer::~BaseTimer() {} - -void BaseTimer::clear() { - t = timepoint_t(); - t_elapsed = 0.0; - count = 0; -} - -void BaseTimer::start() { - assert(t == timepoint_t()); - t = now(); -} - -void BaseTimer::stop() { - double elapsed = get_elapsed_since_start(); - t = timepoint_t(); // reset to 0 - // DBG("stop(", name, ", inst=", get_instance_count(), "): ", elapsed, " s, ", now_str(), "\n"); - t_elapsed += elapsed; - count++; -} - -double BaseTimer::get_elapsed() const { return t_elapsed; } - -double BaseTimer::get_elapsed_since_start() const { - assert(t != timepoint_t()); - duration_seconds interval = now() - t; - return interval.count(); -} - -size_t BaseTimer::get_count() const { return count; } - -const string &BaseTimer::get_name() const { return name; } - -void BaseTimer::done() const { - assert(t == timepoint_t()); - SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", std::setprecision(2), std::fixed, t_elapsed, " s ", KNORM, "\n"); - DBG(name, " took ", std::setprecision(2), std::fixed, t_elapsed, " s ", "\n"); -} - -future> BaseTimer::done_all_async(const upcxx::team &tm) const { - assert(t == timepoint_t()); - auto msm_fut = upcxx_utils::min_sum_max_reduce_one(t_elapsed, 0, tm); - DBG(name, " took ", t_elapsed, " \n"); - auto name_copy = name; - msm_fut = msm_fut.then([name_copy](MinSumMax msm) { - SLOG_VERBOSE(KLCYAN, "Timing ", name_copy, ": ", msm, KNORM, "\n"); - return msm; - }); - Timings::set_pending(msm_fut.then([](MinSumMax) {})); - return msm_fut; -} -void BaseTimer::done_all(const upcxx::team &tm) const { done_all_async(tm).wait(); } - -string BaseTimer::get_final() const { - ostringstream os; - os << name << ": " << std::setprecision(2) << 
std::fixed << t_elapsed << " s"; - if (count > 1) os << " " << count << " count"; - return os.str(); -} - -future> BaseTimer::reduce_timepoint(const upcxx::team &team, timepoint_t timepoint) { - duration_seconds secs = timepoint.time_since_epoch(); - DBG_VERBOSE("reduce_timepoint ", secs.count(), " since epoch\n"); - future> fut_msm = min_sum_max_reduce_one(secs.count(), 0, team); - return fut_msm.then([&team](MinSumMax msm) { - duration_seconds interval; - if (team.rank_me()) return msm; - // translate to seconds since the first rank entered - msm.my = msm.my - msm.min; - msm.max = msm.max - msm.min; - msm.sum = msm.sum - msm.min * team.rank_n(); - msm.min = 0.0; - msm.apply_avg(team); - return msm; - }); -} - -future BaseTimer::reduce_timings(const upcxx::team &team, size_t my_instances) const { - return reduce_timings(team, count, t_elapsed, my_instances); -} - -future BaseTimer::reduce_timings(const upcxx::team &team, size_t my_count, double my_elapsed, size_t my_instances) { - return Timings::reduce(team, my_count, my_elapsed, my_instances); -} - -Timings BaseTimer::barrier_timings(const upcxx::team &team, size_t my_instances) const { - return barrier_timings(team, count, t_elapsed, my_instances); -} - -Timings BaseTimer::barrier_timings(const upcxx::team &team, size_t my_count, double my_elapsed, size_t my_instances) { - return Timings::barrier(team, my_count, my_elapsed, my_instances); -} - -timepoint_t BaseTimer::now() { return std::chrono::high_resolution_clock::now(); } - -string BaseTimer::now_str() { - std::time_t result = std::time(nullptr); - char buffer[100]; - size_t sz = strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", std::localtime(&result)); - return string(sz > 0 ? 
buffer : "BAD TIME"); -} - -// -// StallTimer -// - -StallTimer::StallTimer(const string _name, double _max_seconds, int64_t _max_count) - : BaseTimer(_name) - , max_seconds(_max_seconds) - , max_count(_max_count) { - start(); -} - -StallTimer::~StallTimer() { stop(); } - -void StallTimer::check() { - stop(); - bool print = false; - if (max_seconds > 0.0 && t_elapsed > max_seconds) { - print = true; - } else if (max_count > 0 && count > max_count) { - print = true; - } - if (print) { - WARN("StallTimer - ", name, " on ", rank_me(), " stalled for ", t_elapsed, " s and ", count, " iterations\n"); - max_seconds *= 2.0; - max_count *= 2; - } - start(); -} - -// -// IntermittentTimer -// - -IntermittentTimer::IntermittentTimer(const string &_name, string _interval_label) - : BaseTimer(_name) - , t_interval(0.0) - , interval_label(_interval_label) {} - -IntermittentTimer::~IntermittentTimer() {} - -void IntermittentTimer::clear() { - ((BaseTimer *)this)->clear(); - t_interval = 0.0; - interval_label = ""; -} - -void IntermittentTimer::start_interval() { t_interval = get_elapsed_since_start(); } - -void IntermittentTimer::stop_interval() { - t_interval = get_elapsed_since_start() - t_interval; - if (!interval_label.empty()) { - ostringstream oss; - oss << KBLUE << std::left << std::setw(40) << interval_label << std::setprecision(2) << std::fixed << t_interval << " s" - << KNORM << "\n"; - SLOG(oss.str()); - } -} - -void IntermittentTimer::print_out(const upcxx::team &tm) { - future fut_shptr_timings = reduce_timings(tm); - auto fut = - when_all(Timings::get_pending(), fut_shptr_timings).then([&name = this->name, &count = this->count](ShTimings shptr_timings) { - if (shptr_timings->count_msm.max > 0.0) - SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", count, " intervals, ", shptr_timings->to_string(true), "\n", KNORM); - }); - Timings::set_pending(fut); - count = 0; - t_elapsed = 0.0; -} - -// -// ProgressTimer -// - -ProgressTimer::ProgressTimer(const string &_name) - : 
BaseTimer(_name) - , calls(0) {} - -ProgressTimer::~ProgressTimer() {} - -void ProgressTimer::clear() { - ((BaseTimer *)this)->clear(); - calls = 0; -} - -void ProgressTimer::progress(size_t run_every) { - if (run_every > 1 && ++calls % run_every != 0) return; - start(); - upcxx::progress(); - stop(); - // DBG("ProgressTimer(", name, ") - ", t_elapsed, "\n"); -} - -void ProgressTimer::discharge(size_t run_every) { - if (run_every != 1 && ++calls % run_every != 0) return; - start(); - upcxx::discharge(); - upcxx::progress(); - stop(); - // DBG("ProgressTimer(", name, ").discharge() - ", t_elapsed, "\n"); -} - -void ProgressTimer::print_out(const upcxx::team &tm) { - future fut_shptr_timings = reduce_timings(tm); - auto fut = when_all(Timings::get_pending(), fut_shptr_timings).then([&name = this->name](ShTimings shptr_timings) { - if (shptr_timings->count_msm.max > 0.0) - SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", shptr_timings->to_string(true), KNORM, "\n"); - }); - Timings::set_pending(fut); - count = 0; - t_elapsed = 0.0; -} - -// -// Timer -// -Timer::Timer(const upcxx::team &tm, const string &_name, bool exit_reduction) - : tm(tm) - , exited(exit_reduction) - , logged(false) - , BaseTimer(_name) { - init(); -} -Timer::Timer(const string &_name, bool exit_reduction) - : tm(upcxx::world()) - , exited(exit_reduction) - , logged(false) - , BaseTimer(_name) { - init(); -} -void Timer::init() { - increment_instance(); - auto fut = when_all(Timings::get_pending(), make_future(now_str())).then([name = this->name](string now) {}); - Timings::set_pending(fut); - start(); -} -Timer::Timer(Timer &&move) - : tm(move.tm) - , exited(move.exited) - , BaseTimer((BaseTimer &)move) { - move.exited = true; - move.logged = true; -} -Timer &Timer::operator=(Timer &&move) { - Timer mv(std::move(move)); - std::swap(*this, mv); - return *this; -} - -Timer::~Timer() { - if (!exited) - initiate_exit_reduction(); - else if (!logged) { - stop(); - LOG(KLCYAN, "Timing ", name, ":", 
get_elapsed(), KNORM, "\n"); - } -} - -future<> Timer::initiate_entrance_reduction() { - DBG_VERBOSE("Tracking entrance of ", name, "\n"); - auto fut_msm = reduce_timepoint(tm, now()); - - auto fut = when_all(Timings::get_pending(), fut_msm).then([name = this->name](MinSumMax msm) { - DBG_VERBOSE("got reduction: ", msm.to_string(), "\n"); - SLOG_VERBOSE(KLCYAN, "Timing (entrance) ", name, ":", msm.to_string(), KNORM, "\n"); - }); - Timings::set_pending(fut); - return fut; -} - -future<> Timer::initiate_exit_reduction() { - stop(); - future fut_shptr_timings = reduce_timings(tm); - auto fut = when_all(Timings::get_pending(), fut_shptr_timings).then([name = this->name](ShTimings shptr_timings) { - SLOG_VERBOSE(KLCYAN, "Timing ", name, " exit: ", shptr_timings->to_string(), KNORM, "\n"); - }); - Timings::set_pending(fut); - decrement_instance(); - exited = true; - logged = true; - return fut; -} - -// -// BarrierTimer -// - -BarrierTimer::BarrierTimer(const upcxx::team &team, const string _name, bool _entrance_barrier, bool _exit_barrier) - : _team(team) - , exit_barrier(_exit_barrier) - , exited(false) - , BaseTimer(_name) { - init(_entrance_barrier); -} -BarrierTimer::BarrierTimer(const string _name, bool _entrance_barrier, bool _exit_barrier) - : _team(upcxx::world()) - , exit_barrier(_exit_barrier) - , exited(false) - , BaseTimer(_name) { - init(_entrance_barrier); -} - -future<> BarrierTimer::init(bool _entrance_barrier) { - increment_instance(); - if (!_entrance_barrier && !exit_barrier) SLOG_VERBOSE("Why are we using a BarrierTimer without any barriers???\n"); - future<> fut; - DBG("Entering BarrierTimer ", name, "\n"); - if (_entrance_barrier) { - fut = when_all(Timings::get_pending(), make_future(now_str())).then([&name = this->name](string now) { - // SLOG_VERBOSE(KLCYAN, "Timing ", name, ": (entering barrier) ", KNORM); - }); - Timings::set_pending(fut); - auto timings = barrier_timings(_team); - Timings::wait_pending(); // should be noop - 
SLOG_VERBOSE(KLCYAN, "Timing (entrance barrier) ", name, ": ", timings.to_string(), KNORM, "\n"); - } else { - fut = when_all(Timings::get_pending(), make_future(now_str())).then([&name = this->name](string now) {}); - Timings::set_pending(fut); - } - start(); - return fut; -} - -BarrierTimer::~BarrierTimer() { - if (!exited) initate_exit_barrier().wait(); -} -future<> BarrierTimer::initate_exit_barrier() { - stop(); - future<> fut; - DBG("Exiting BarrierTimer ", name, "\n"); - if (exit_barrier) { - fut = when_all(Timings::get_pending(), make_future(now_str())).then([name = this->name](string now) {}); - Timings::set_pending(fut); - auto timings = barrier_timings(_team); - Timings::wait_pending(); - SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", timings.to_string(), KNORM, "\n"); - } else { - future fut_shptr_timings = reduce_timings(_team); - fut = when_all(Timings::get_pending(), fut_shptr_timings).then([name = this->name](ShTimings shptr_timings) { - SLOG_VERBOSE(KLCYAN, "Timing ", name, ": ", shptr_timings->to_string(), KNORM, "\n"); - }); - Timings::set_pending(fut); - } - decrement_instance(); - exited = true; - return fut; -} - -// -// AsyncTimer -// - -_AsyncTimer::_AsyncTimer(const upcxx::team &tm, const string &name) - : BaseTimer(name) - , tm(tm) - , construct_t(BaseTimer::now()) - , start_t{} {} -void _AsyncTimer::start() { - start_t = now(); - ((BaseTimer *)this)->start(); -} -void _AsyncTimer::stop() { ((BaseTimer *)this)->stop(); } -void _AsyncTimer::report(const string label, MinSumMax msm) { - SLOG_VERBOSE(KLCYAN, "Timing ", name, " ", label, ":", msm.to_string(), KNORM, "\n"); -} - -future<> _AsyncTimer::initiate_construct_reduction() { - auto fut_msm = BaseTimer::reduce_timepoint(tm, construct_t); - auto fut = when_all(Timings::get_pending(), fut_msm).then([this](MinSumMax msm) { this->report("construct", msm); }); - Timings::set_pending(fut); - return fut; -} -future<> _AsyncTimer::initiate_start_reduction() { - auto fut_msm = 
BaseTimer::reduce_timepoint(tm, start_t); - auto fut = when_all(Timings::get_pending(), fut_msm).then([this](MinSumMax msm) { this->report("start", msm); }); - Timings::set_pending(fut); - return fut; -} -future<> _AsyncTimer::initiate_stop_reduction() { - auto fut_msm = Timings::reduce(tm, 1, get_elapsed(), 1); - auto fut = when_all(Timings::get_pending(), fut_msm).then([this](ShTimings sh_timings) { - this->report("stop", sh_timings->before_elapsed); - }); - Timings::set_pending(fut); - return fut; -} - -AsyncTimer::AsyncTimer(const upcxx::team &tm, const string &name) - : timer(make_shared<_AsyncTimer>(tm, name)) {} -AsyncTimer::AsyncTimer(const string &name) - : timer(make_shared<_AsyncTimer>(upcxx::world(), name)) {} -void AsyncTimer::start() const { timer->start(); } -void AsyncTimer::stop() const { - timer->stop(); - LOG(timer->get_name(), " completed in ", timer->get_elapsed(), " s\n"); -} -double AsyncTimer::get_elapsed() const { return timer->get_elapsed(); } -future<> AsyncTimer::initiate_construct_reduction() { - return timer->initiate_construct_reduction().then([timer = this->timer]() { - // keep timer alive - }); -} -future<> AsyncTimer::initiate_start_reduction() { - return timer->initiate_start_reduction().then([timer = this->timer]() { - // keep timer alive - }); -} -future<> AsyncTimer::initiate_stop_reduction() { - return timer->initiate_stop_reduction().then([timer = this->timer]() { - // keep timer alive - }); -} - -// -// ActiveCountTimer -// - -ActiveCountTimer::ActiveCountTimer(const string _name) - : total_elapsed(0.0) - , total_count(0) - , active_count(0) - , max_active(0) - , name(_name) - , my_fut(make_future()) {} - -ActiveCountTimer::~ActiveCountTimer() { - if (upcxx::initialized()) my_fut.wait(); // keep alive until all futures have finished -} - -void ActiveCountTimer::clear() { - total_elapsed = 0.0; - total_count = 0; - active_count = 0; - max_active = 0; -} - -timepoint_t ActiveCountTimer::begin() { - active_count++; - if 
(max_active < active_count) max_active = active_count; - return BaseTimer::now(); -} - -void ActiveCountTimer::end(timepoint_t t) { - duration_seconds interval = BaseTimer::now() - t; - active_count--; - total_count++; - total_elapsed += interval.count(); -} - -void ActiveCountTimer::print_barrier_timings(const upcxx::team &team, string label) { - Timings timings = BaseTimer::barrier_timings(team, total_count, total_elapsed, max_active); - clear(); - Timings::wait_pending(); - print_timings(timings, label); -} - -void ActiveCountTimer::print_reduce_timings(const upcxx::team &team, string label) { - label = name + label; - auto fut_timings = BaseTimer::reduce_timings(team, total_count, total_elapsed, max_active); - auto _this = this; - auto fut_clear = fut_timings.then([_this](ShTimings ignored) { _this->clear(); }); - auto fut = when_all(Timings::get_pending(), fut_timings, fut_clear).then([_this, label](ShTimings shptr_timings) { - _this->print_timings(*shptr_timings, label); - }); - my_fut = when_all(fut_clear, my_fut, fut); // keep this in scope until clear has been called... - Timings::set_pending(my_fut); -} - -void ActiveCountTimer::print_timings(Timings &timings, string label) { - label = name + label; - DBG_VERBOSE(__func__, " label=", label, "\n"); - if (active_count > 0) - SWARN("print_timings on ActiveCountTimer '", label, "' called while ", active_count, " (max ", max_active, - ") are still active\n"); - if (timings.count_msm.max > 0.0) { - SLOG_VERBOSE(KLCYAN, "Timing instances of ", label, ": ", - (timings.count_msm.max > 0.0 ? 
timings.to_string(true) : string("(none)")), KNORM, "\n"); - } -} - -ActiveCountTimer _GenericActiveCountTimer("_upcxx_dummy"); -GenericInstantiationTimer _GenericInstantiationTimer(_GenericActiveCountTimer); -template class ActiveInstantiationTimer<_upcxx_utils_dummy>; - -SingletonInstantiationTimer _SingletonInstantiationTimer(); -template class InstantiationTimer<_upcxx_utils_dummy>; - -}; // namespace upcxx_utils From 1390efd959e31677cf5907158c0f0db69888f5da Mon Sep 17 00:00:00 2001 From: Jan Ciesko Date: Thu, 15 Feb 2024 11:23:14 -0700 Subject: [PATCH 05/13] Set DCMAKE_CXX_COMPILTER=mpicxx --- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index 41280b5..226602d 100755 --- a/build.sh +++ b/build.sh @@ -48,7 +48,7 @@ else if [ "$1" == "Debug" ] || [ "$1" == "Release" ] || [ "$1" == "RelWithDebInfo" ]; then rm -rf * rm -rf $INSTALL_PATH/cmake - cmake $rootdir -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=$1 -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH \ + cmake $rootdir -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=$1 -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH -DCMAKE_CXX_COMPILER=mpicxx \ -DMHM2_ENABLE_TESTING=0 $MHM2_CMAKE_EXTRAS $2 #-DENABLE_CUDA=0 fi From 6e24d8b73477362a8913031e2b5cbffd1cf83f16 Mon Sep 17 00:00:00 2001 From: Jan Ciesko Date: Thu, 15 Feb 2024 11:24:20 -0700 Subject: [PATCH 06/13] Use std::chrono::duration contructor and not assignement operator Cleanup after timer removal --- src/devices_gpu.cpp | 1 - src/kcount/kcount_gpu.cpp | 11 ----------- upcxx-utils/include/upcxx_utils/thread_pool.hpp | 12 ++++++------ 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/src/devices_gpu.cpp b/src/devices_gpu.cpp index ba139ba..a072104 100644 --- a/src/devices_gpu.cpp +++ b/src/devices_gpu.cpp @@ -68,7 +68,6 @@ void init_devices() { void done_init_devices() { if (init_gpu_thread) { - Timer t("Waiting for GPU to be initialized (should be noop)"); init_gpu_thread = false; detect_gpu_fut.wait(); 
if (gpu_utils::gpus_present()) { diff --git a/src/kcount/kcount_gpu.cpp b/src/kcount/kcount_gpu.cpp index c1e5da0..4b31e30 100644 --- a/src/kcount/kcount_gpu.cpp +++ b/src/kcount/kcount_gpu.cpp @@ -276,16 +276,9 @@ void HashTableInserter::flush_inserts() { SLOG_GPU(" final size per rank is ", insert_stats.new_inserts, " entries\n"); } -template -void HashTableInserter::get_elapsed_time(double &insert_time, double &kernel_time) { - state->ht_gpu_driver.get_elapsed_time(insert_time, kernel_time); -} - template void HashTableInserter::insert_into_local_hashtable(dist_object> &local_kmers) { barrier(); - IntermittentTimer insert_timer("gpu insert to cpu timer"); - insert_timer.start(); if (state->ht_gpu_driver.pass_type == CTG_KMERS_PASS) { int attempted_inserts = 0, dropped_inserts = 0, new_inserts = 0; state->ht_gpu_driver.done_ctg_kmer_inserts(attempted_inserts, dropped_inserts, new_inserts); @@ -350,11 +343,7 @@ void HashTableInserter::insert_into_local_hashtable(dist_objectinsert({kmer, kmer_counts}); } - insert_timer.stop(); - auto all_avg_elapsed_time = reduce_one(insert_timer.get_elapsed(), op_fast_add, 0).wait() / rank_n(); - auto all_max_elapsed_time = reduce_one(insert_timer.get_elapsed(), op_fast_max, 0).wait(); - SLOG_GPU("Inserting kmers from GPU to cpu hash table took ", all_avg_elapsed_time, " avg, ", all_max_elapsed_time, " max\n"); auto all_kmers_size = reduce_one((uint64_t)local_kmers->size(), op_fast_add, 0).wait(); if (local_kmers->size() != (num_entries - invalid)) WARN("kmers->size() is ", local_kmers->size(), " != ", (num_entries - invalid), " num_entries"); diff --git a/upcxx-utils/include/upcxx_utils/thread_pool.hpp b/upcxx-utils/include/upcxx_utils/thread_pool.hpp index c04ec48..c2e586a 100644 --- a/upcxx-utils/include/upcxx_utils/thread_pool.hpp +++ b/upcxx-utils/include/upcxx_utils/thread_pool.hpp @@ -144,7 +144,7 @@ class ThreadPool { sh_prom->require_anonymous(1); // additional requirement to complete auto task_id = global_task_id()++; - 
auto start_t = 0; + auto start_t = 0.0; DBG("sh_prom=", sh_prom.get(), " task_id=", task_id, "\n"); auto args_tuple = std::make_tuple(args...); // *copy* arguments to avoid races in argument references being reused @@ -155,7 +155,7 @@ class ThreadPool { DBG_VERBOSE("Finished sh_prom=", sh_prom.get(), "\n"); // fulfill only in calling persona persona.lpc_ff([task_id, start_t, sh_prom]() { - duration_seconds s = 0; + duration_seconds s (0.0); DBG("Fulfilled sh_prom=", sh_prom.get(), " task_id=", task_id, " in ", s.count(), " s\n"); sh_prom->fulfill_anonymous(1); global_tasks_completed()++; @@ -184,20 +184,20 @@ class ThreadPool { std::shared_ptr> sh_prom = std::make_shared>(); auto task_id = global_task_id()++; - auto start_t = 0; + auto start_t = 0.0; DBG("sh_prom=", sh_prom.get(), " task_id=", task_id, "of", global_task_id(), "\n"); auto args_tuple = std::make_tuple(args...); // *copy* arguments to avoid races in argument references being reused auto sh_task = std::make_shared([sh_prom, task_id, start_t, &persona, func{std::move(func)}, args_tuple{std::move(args_tuple)}]() { - auto compute_start_t = 0; - duration_seconds delay_s = compute_start_t - start_t; + auto compute_start_t = 0.0; + duration_seconds delay_s (compute_start_t - start_t); DBG_VERBOSE("Executing sh_prom=", sh_prom.get(), "\n"); std::apply(func, args_tuple); DBG_VERBOSE("Finished sh_prom=", sh_prom.get(), "\n"); // fulfill only in calling persona persona.lpc_ff([task_id, start_t, compute_start_t, delay_s, sh_prom]() { - duration_seconds s = 0 - compute_start_t; + duration_seconds s (0.0 - compute_start_t); DBG("Fulfilled sh_prom=", sh_prom.get(), " task_id=", task_id, "of", global_task_id(), " in ", delay_s.count(), " delay + ", s.count(), " s\n"); sh_prom->fulfill_anonymous(1); global_tasks_completed()++; From dd36c56830c977c09a65f4e30263edda3070615a Mon Sep 17 00:00:00 2001 From: Jan Ciesko Date: Mon, 19 Feb 2024 09:40:50 -0700 Subject: [PATCH 07/13] Add Kokkos to CMake Add Kokkos init and 
finalize calls --- CMakeLists.txt | 8 ++++++++ build.sh | 6 +++--- src/CMakeLists.txt | 8 +++++++- src/main.cpp | 10 ++++++++++ 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6bad75f..74355c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,6 +42,7 @@ # to the root source directory of the project as ${MHM2_SOURCE_DIR} and to the # root binary directory of the project as ${MHM2_BINARY_DIR} cmake_minimum_required(VERSION 3.10 FATAL_ERROR) +cmake_policy(SET CMP0074 NEW) project(MHM2) message(STATUS "Building ${CMAKE_PROJECT_NAME} with CMake ${CMAKE_VERSION}") @@ -151,6 +152,13 @@ if(MHM2_VECTORS) endforeach() endif() +option(ENABLE_KOKKOS "Whether to use Kokkos" OFF) +if(ENABLE_KOKKOS) + message(STATUS "Building with Kokkos") + add_definitions(-DENABLE_KOKKOS) + find_package(Kokkos REQUIRED) +endif() + option(ENABLE_GASNET_STATS "Turn on gasnet stats recording" OFF) message("Building ${CMAKE_BUILD_TYPE} version") if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") diff --git a/build.sh b/build.sh index 226602d..19599ed 100755 --- a/build.sh +++ b/build.sh @@ -46,11 +46,11 @@ else mkdir -p $rootdir/.build cd $rootdir/.build if [ "$1" == "Debug" ] || [ "$1" == "Release" ] || [ "$1" == "RelWithDebInfo" ]; then - rm -rf * - rm -rf $INSTALL_PATH/cmake +# rm -rf * +# rm -rf $INSTALL_PATH/cmake cmake $rootdir -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=$1 -DCMAKE_INSTALL_PREFIX=$INSTALL_PATH -DCMAKE_CXX_COMPILER=mpicxx \ -DMHM2_ENABLE_TESTING=0 $MHM2_CMAKE_EXTRAS $2 - #-DENABLE_CUDA=0 + #-DENABLE_CUDA=0 fi make -j ${MHM2_BUILD_THREADS} all install #make VERBOSE=1 -j ${MHM2_BUILD_THREADS} all install diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4643bb8..5e979b6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -39,6 +39,7 @@ # such enhancements or derivative works thereof, in binary and source code form. 
cmake_minimum_required(VERSION 3.10 FATAL_ERROR) +cmake_policy(SET CMP0074 NEW) if(${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.13 AND DEFINED UPCXX_LIBRARIES) message( @@ -236,11 +237,16 @@ set(MHM2_TARGET_OBJECTS add_executable(mhm2 main.cpp ${MHM2_TARGET_OBJECTS}) +if(ENABLE_KOKKOS) + set (KokkosLib Kokkos::kokkos) +endif() + set(MHM2_LINK_LIBRARIES Threads::Threads KCOUNT_LIBRARY ${ZLIB_LIBRARIES} - MHM2_VERSION_LIB) + MHM2_VERSION_LIB + ${KokkosLib}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/src/main.cpp b/src/main.cpp index b21cccb..b0bdc66 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -50,6 +50,10 @@ #include "kmer.hpp" +#ifdef ENABLE_KOKKOS +#include +#endif + using std::fixed; using std::setprecision; @@ -63,6 +67,9 @@ void merge_reads(vector reads_fname_list, int qual_offset, int main(int argc, char **argv) { + #ifdef ENABLE_KOKKOS + Kokkos::initialize(argc, argv); + #endif upcxx::init(); barrier(); @@ -237,5 +244,8 @@ int main(int argc, char **argv) { ; #endif upcxx::finalize(); + #ifdef ENABLE_KOKKOS + Kokkos::finalize(); + #endif return 0; } From 85947d7ac8c5660ce02849ea97c3f27f2558dd03 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Wed, 20 Mar 2024 10:05:49 -0700 Subject: [PATCH 08/13] Hipification: round one --- src/fastq.cpp | 6 +- src/kcount/kcount-gpu/gpu_hash_funcs.cpp | 141 - src/kcount/kcount-gpu/gpu_hash_table.cpp | 802 ----- src/kcount/kcount-gpu/gpu_hash_table.hpp | 182 - src/kcount/kcount-gpu/gqf.cpp | 2920 ----------------- src/kcount/kcount-gpu/gqf.hpp | 384 --- src/kcount/kcount-gpu/gqf_int.hpp | 138 - src/kcount/kcount-gpu/hashutil.cpp | 179 - src/kcount/kcount-gpu/hashutil.hpp | 25 - src/kcount/kcount-gpu/parse_and_pack.cpp | 325 -- src/kcount/kcount-gpu/parse_and_pack.hpp | 89 - src/kcount/kcount-gpu/partitioned_counter.cpp | 68 - src/kcount/kcount-gpu/partitioned_counter.hpp | 56 - src/kcount/kcount_gpu.cpp | 2 +- 14 files changed, 4 insertions(+), 5313 deletions(-) diff --git a/src/fastq.cpp b/src/fastq.cpp index 
231b21a..3697abc 100644 --- a/src/fastq.cpp +++ b/src/fastq.cpp @@ -484,7 +484,7 @@ void FastqReader::seek() { } FastqReader::~FastqReader() { - if (!open_fut.is_ready()) { + if (!open_fut.ready()) { WARN("Destructor called before opening completed\n"); open_fut.wait(); } @@ -502,7 +502,7 @@ string FastqReader::get_fname() { return fname; } size_t FastqReader::my_file_size() { return end_read - start_read + (fqr2 ? fqr2->my_file_size() : 0); } size_t FastqReader::get_next_fq_record(string &id, string &seq, string &quals, bool wait_open) { - if (wait_open && !open_fut.is_ready()) { + if (wait_open && !open_fut.ready()) { WARN("Attempt to read ", fname, " before it is ready. wait on open_fut first to avoid this warning!\n"); open_fut.wait(); } @@ -554,7 +554,7 @@ int FastqReader::get_max_read_len() { return std::max(max_read_len, fqr2 ? fqr2- void FastqReader::reset() { - if (!open_fut.is_ready()) { + if (!open_fut.ready()) { open_fut.wait(); } if (!f) { diff --git a/src/kcount/kcount-gpu/gpu_hash_funcs.cpp b/src/kcount/kcount-gpu/gpu_hash_funcs.cpp index 10bcc0f..e69de29 100644 --- a/src/kcount/kcount-gpu/gpu_hash_funcs.cpp +++ b/src/kcount/kcount-gpu/gpu_hash_funcs.cpp @@ -1,141 +0,0 @@ -/* - HipMer v 2.0, Copyright (c) 2020, The Regents of the University of California, - through Lawrence Berkeley National Laboratory (subject to receipt of any required - approvals from the U.S. Dept. of Energy). All rights reserved." - - Redistribution and use in source and binary forms, with or without modification, - are permitted provided that the following conditions are met: - - (1) Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - (2) Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation and/or - other materials provided with the distribution. 
- - (3) Neither the name of the University of California, Lawrence Berkeley National - Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to - endorse or promote products derived from this software without specific prior - written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY - EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT - SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH - DAMAGE. - - You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades - to the features, functionality or performance of the source code ("Enhancements") to - anyone; however, if you choose to make your Enhancements available either publicly, - or directly to Lawrence Berkeley National Laboratory, without imposing a separate - written license agreement for such Enhancements, then you hereby grant the following - license: a non-exclusive, royalty-free perpetual license to install, use, modify, - prepare derivative works, incorporate into other computer software, distribute, and - sublicense such enhancements or derivative works thereof, in binary and source code - form. 
-*/ - -#define BIG_CONSTANT(x) (x##LLU) -#define ROTL64(x, r) ((x << r) | (x >> (64 - r))) - -// Finalization mix - force all bits of a hash block to avalanche -__device__ uint64_t fmix64(uint64_t k) { - k ^= k >> 33; - k *= BIG_CONSTANT(0xff51afd7ed558ccd); - k ^= k >> 33; - k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); - k ^= k >> 33; - - return k; -} - -__device__ uint64_t gpu_murmurhash3_64(const void *key, const uint32_t len) { - const uint8_t *data = (const uint8_t *)key; - const uint32_t nblocks = len / 16; - const uint32_t seed = 313; - int32_t i; - - uint64_t h1 = seed; - uint64_t h2 = seed; - - uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); - uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); - - const uint64_t *blocks = (const uint64_t *)(data); - - for (i = 0; i < nblocks; i++) { - uint64_t k1 = blocks[i * 2 + 0]; - uint64_t k2 = blocks[i * 2 + 1]; - - k1 *= c1; - k1 = ROTL64(k1, 31); - k1 *= c2; - h1 ^= k1; - - h1 = ROTL64(h1, 27); - h1 += h2; - h1 = h1 * 5 + 0x52dce729; - - k2 *= c2; - k2 = ROTL64(k2, 33); - k2 *= c1; - h2 ^= k2; - - h2 = ROTL64(h2, 31); - h2 += h1; - h2 = h2 * 5 + 0x38495ab5; - } - - const uint8_t *tail = (const uint8_t *)(data + nblocks * 16); - - uint64_t k1 = 0; - uint64_t k2 = 0; - - switch (len & 15) { - case 15: k2 ^= (uint64_t)(tail[14]) << 48; - case 14: k2 ^= (uint64_t)(tail[13]) << 40; - case 13: k2 ^= (uint64_t)(tail[12]) << 32; - case 12: k2 ^= (uint64_t)(tail[11]) << 24; - case 11: k2 ^= (uint64_t)(tail[10]) << 16; - case 10: k2 ^= (uint64_t)(tail[9]) << 8; - case 9: - k2 ^= (uint64_t)(tail[8]) << 0; - k2 *= c2; - k2 = ROTL64(k2, 33); - k2 *= c1; - h2 ^= k2; - - case 8: k1 ^= (uint64_t)(tail[7]) << 56; - case 7: k1 ^= (uint64_t)(tail[6]) << 48; - case 6: k1 ^= (uint64_t)(tail[5]) << 40; - case 5: k1 ^= (uint64_t)(tail[4]) << 32; - case 4: k1 ^= (uint64_t)(tail[3]) << 24; - case 3: k1 ^= (uint64_t)(tail[2]) << 16; - case 2: k1 ^= (uint64_t)(tail[1]) << 8; - case 1: - k1 ^= (uint64_t)(tail[0]) << 0; - k1 *= c1; - k1 = ROTL64(k1, 
31); - k1 *= c2; - h1 ^= k1; - }; - - h1 ^= len; - h2 ^= len; - - h1 += h2; - h2 += h1; - - h1 = fmix64(h1); - h2 = fmix64(h2); - - h1 += h2; - h2 += h1; - - return h1; -} diff --git a/src/kcount/kcount-gpu/gpu_hash_table.cpp b/src/kcount/kcount-gpu/gpu_hash_table.cpp index 7391060..e69de29 100644 --- a/src/kcount/kcount-gpu/gpu_hash_table.cpp +++ b/src/kcount/kcount-gpu/gpu_hash_table.cpp @@ -1,802 +0,0 @@ -/* - HipMer v 2.0, Copyright (c) 2020, The Regents of the University of California, - through Lawrence Berkeley National Laboratory (subject to receipt of any required - approvals from the U.S. Dept. of Energy). All rights reserved." - - Redistribution and use in source and binary forms, with or without modification, - are permitted provided that the following conditions are met: - - (1) Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - (2) Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation and/or - other materials provided with the distribution. - - (3) Neither the name of the University of California, Lawrence Berkeley National - Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to - endorse or promote products derived from this software without specific prior - written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY - EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT - SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH - DAMAGE. - - You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades - to the features, functionality or performance of the source code ("Enhancements") to - anyone; however, if you choose to make your Enhancements available either publicly, - or directly to Lawrence Berkeley National Laboratory, without imposing a separate - written license agreement for such Enhancements, then you hereby grant the following - license: a non-exclusive, royalty-free perpetual license to install, use, modify, - prepare derivative works, incorporate into other computer software, distribute, and - sublicense such enhancements or derivative works thereof, in binary and source code - form. 
-*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "upcxx_utils/colors.h" -#include "gpu-utils/gpu_common.hpp" -#include "gpu-utils/gpu_utils.hpp" -#include "gpu_hash_table.hpp" -#include "prime.hpp" -#include "gqf.hpp" - -#include "gpu_hash_funcs.cpp" - -using namespace std; -using namespace gpu_common; -using namespace kcount_gpu; - -const uint64_t KEY_EMPTY = 0xffffffffffffffff; -const uint64_t KEY_TRANSITION = 0xfffffffffffffffe; -const uint8_t KEY_EMPTY_BYTE = 0xff; - -template -__device__ void kmer_set(KmerArray &kmer1, const KmerArray &kmer2) { - int N_LONGS = kmer1.N_LONGS; - uint64_t old_key; - for (int i = 0; i < N_LONGS - 1; i++) { - old_key = atomicExch((unsigned long long *)&(kmer1.longs[i]), kmer2.longs[i]); - if (old_key != KEY_EMPTY) printf("ERROR: old key should be KEY_EMPTY\n"); - } - old_key = atomicExch((unsigned long long *)&(kmer1.longs[N_LONGS - 1]), kmer2.longs[N_LONGS - 1]); - if (old_key != KEY_TRANSITION) printf("ERROR: old key should be KEY_TRANSITION\n"); -} - -template -__device__ bool kmers_equal(const KmerArray &kmer1, const KmerArray &kmer2) { - int n_longs = kmer1.N_LONGS; - for (int i = 0; i < n_longs; i++) { - uint64_t old_key = atomicAdd((unsigned long long *)&(kmer1.longs[i]), 0ULL); - if (old_key != kmer2.longs[i]) return false; - } - return true; -} - -template -__device__ size_t kmer_hash(const KmerArray &kmer) { - return gpu_murmurhash3_64(reinterpret_cast(kmer.longs), kmer.N_LONGS * sizeof(uint64_t)); -} - -__device__ int8_t get_ext(CountsArray &counts, int pos, int8_t *ext_map) { - count_t top_count = 0, runner_up_count = 0; - int top_ext_pos = 0; - count_t kmer_count = counts.kmer_count; - for (int i = pos; i < pos + 4; i++) { - if (counts.ext_counts[i] >= top_count) { - runner_up_count = top_count; - top_count = counts.ext_counts[i]; - top_ext_pos = i; - } else if (counts.ext_counts[i] > runner_up_count) { - runner_up_count = counts.ext_counts[i]; - } - } - int 
dmin_dyn = (1.0 - DYN_MIN_DEPTH) * kmer_count; - if (dmin_dyn < 2.0) dmin_dyn = 2.0; - if (top_count < dmin_dyn) return 'X'; - if (runner_up_count >= dmin_dyn) return 'F'; - return ext_map[top_ext_pos - pos]; -} - -__device__ bool ext_conflict(ext_count_t *ext_counts, int start_idx) { - int idx = -1; - for (int i = start_idx; i < start_idx + 4; i++) { - if (ext_counts[i]) { - // conflict - if (idx != -1) return true; - idx = i; - } - } - return false; -} - -template -__global__ void gpu_merge_ctg_kmers(KmerCountsMap read_kmers, const KmerCountsMap ctg_kmers, - unsigned int *insert_counts) { - unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; - int8_t ext_map[4] = {'A', 'C', 'G', 'T'}; - int N_LONGS = KmerArray::N_LONGS; - int attempted_inserts = 0; - int dropped_inserts = 0; - int new_inserts = 0; - if (threadid < ctg_kmers.capacity) { - count_t kmer_count = ctg_kmers.vals[threadid].kmer_count; - ext_count_t *ext_counts = ctg_kmers.vals[threadid].ext_counts; - if (kmer_count && !ext_conflict(ext_counts, 0) && !ext_conflict(ext_counts, 4)) { - KmerArray kmer = ctg_kmers.keys[threadid]; - uint64_t slot = kmer_hash(kmer) % read_kmers.capacity; - auto start_slot = slot; - attempted_inserts++; - const int MAX_PROBE = (read_kmers.capacity < KCOUNT_HT_MAX_PROBE ? 
read_kmers.capacity : KCOUNT_HT_MAX_PROBE); - for (int j = 0; j < MAX_PROBE; j++) { - uint64_t old_key = atomicCAS((unsigned long long *)&(read_kmers.keys[slot].longs[N_LONGS - 1]), KEY_EMPTY, KEY_TRANSITION); - if (old_key == KEY_EMPTY) { - new_inserts++; - memcpy(&read_kmers.vals[slot], &ctg_kmers.vals[threadid], sizeof(CountsArray)); - kmer_set(read_kmers.keys[slot], kmer); - break; - } else if (old_key == kmer.longs[N_LONGS - 1]) { - if (kmers_equal(read_kmers.keys[slot], kmer)) { - // existing kmer from reads - only replace if the kmer is non-UU - // there is no need for atomics here because all ctg kmers are unique; hence only one thread will ever match this kmer - int8_t left_ext = get_ext(read_kmers.vals[slot], 0, ext_map); - int8_t right_ext = get_ext(read_kmers.vals[slot], 4, ext_map); - if (left_ext == 'X' || left_ext == 'F' || right_ext == 'X' || right_ext == 'F') - memcpy(&read_kmers.vals[slot], &ctg_kmers.vals[threadid], sizeof(CountsArray)); - break; - } - } - // quadratic probing - worse cache but reduced clustering - slot = (start_slot + (j + 1) * (j + 1)) % read_kmers.capacity; - if (j == MAX_PROBE - 1) dropped_inserts++; - } - } - } - reduce(attempted_inserts, ctg_kmers.capacity, &(insert_counts[0])); - reduce(dropped_inserts, ctg_kmers.capacity, &(insert_counts[1])); - reduce(new_inserts, ctg_kmers.capacity, &(insert_counts[2])); -} - -template -__global__ void gpu_compact_ht(KmerCountsMap elems, KmerExtsMap compact_elems, unsigned int *elem_counts) { - unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; - const int N_LONGS = KmerArray::N_LONGS; - int dropped_inserts = 0; - int unique_inserts = 0; - int8_t ext_map[4] = {'A', 'C', 'G', 'T'}; - if (threadid < elems.capacity) { - if (elems.vals[threadid].kmer_count) { - KmerArray kmer = elems.keys[threadid]; - uint64_t slot = kmer_hash(kmer) % compact_elems.capacity; - auto start_slot = slot; - // we set a constraint on the max probe to track whether we are getting excessive collisions 
and need a bigger default - // compact table - const int MAX_PROBE = (compact_elems.capacity < KCOUNT_HT_MAX_PROBE ? compact_elems.capacity : KCOUNT_HT_MAX_PROBE); - // look for empty slot in compact hash table - for (int j = 0; j < MAX_PROBE; j++) { - uint64_t old_key = - atomicCAS((unsigned long long *)&(compact_elems.keys[slot].longs[N_LONGS - 1]), KEY_EMPTY, kmer.longs[N_LONGS - 1]); - if (old_key == KEY_EMPTY) { - // found empty slot - there will be no duplicate keys since we're copying across from another hash table - unique_inserts++; - memcpy((void *)compact_elems.keys[slot].longs, kmer.longs, sizeof(uint64_t) * (N_LONGS - 1)); - // compute exts - int8_t left_ext = get_ext(elems.vals[threadid], 0, ext_map); - int8_t right_ext = get_ext(elems.vals[threadid], 4, ext_map); - if (elems.vals[threadid].kmer_count < 2) - printf("WARNING: elem should have been purged, count %d\n", elems.vals[threadid].kmer_count); - compact_elems.vals[slot].count = elems.vals[threadid].kmer_count; - compact_elems.vals[slot].left = left_ext; - compact_elems.vals[slot].right = right_ext; - break; - } - // quadratic probing - worse cache but reduced clustering - slot = (start_slot + (j + 1) * (j + 1)) % compact_elems.capacity; - if (j == MAX_PROBE - 1) dropped_inserts++; - } - } - } - reduce(dropped_inserts, compact_elems.capacity, &(elem_counts[0])); - reduce(unique_inserts, compact_elems.capacity, &(elem_counts[1])); -} - -template -__global__ void gpu_purge_invalid(KmerCountsMap elems, unsigned int *elem_counts) { - unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; - int N_LONGS = KmerArray::N_LONGS; - int num_purged = 0; - int num_elems = 0; - if (threadid < elems.capacity) { - if (elems.vals[threadid].kmer_count) { - int ext_sum = 0; - for (int j = 0; j < 8; j++) ext_sum += elems.vals[threadid].ext_counts[j]; - if (elems.vals[threadid].kmer_count < 2 || !ext_sum) { - memset(&elems.vals[threadid], 0, sizeof(CountsArray)); - memset((void *)elems.keys[threadid].longs, 
KEY_EMPTY_BYTE, N_LONGS * sizeof(uint64_t)); - num_purged++; - } else { - num_elems++; - } - } - } - reduce(num_purged, elems.capacity, &(elem_counts[0])); - reduce(num_elems, elems.capacity, &(elem_counts[1])); -} - -static __constant__ char to_base[] = {'0', 'a', 'c', 'g', 't', 'A', 'C', 'G', 'T', 'N'}; - -inline __device__ char to_base_func(int index, int pp) { - if (index > 9) { - printf("ERROR: index out of range for to_base: %d, packed seq pos %d\n", index, pp); - return 0; - } - if (index == 0) return '_'; - return to_base[index]; -} - -__global__ void gpu_unpack_supermer_block(SupermerBuff unpacked_supermer_buff, SupermerBuff packed_supermer_buff, int buff_len) { - unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; - if (threadid >= buff_len) return; - uint8_t packed = packed_supermer_buff.seqs[threadid]; - if (packed == '_') return; - uint8_t left_side = (packed & 240) >> 4; - unpacked_supermer_buff.seqs[threadid * 2] = to_base_func(left_side, packed); - if (packed_supermer_buff.counts) unpacked_supermer_buff.counts[threadid * 2] = packed_supermer_buff.counts[threadid]; - uint8_t right_side = packed & 15; - unpacked_supermer_buff.seqs[threadid * 2 + 1] = to_base_func(right_side, packed); - if (packed_supermer_buff.counts) unpacked_supermer_buff.counts[threadid * 2 + 1] = packed_supermer_buff.counts[threadid]; -} - -inline __device__ bool is_valid_base(char base) { - return (base == 'A' || base == 'C' || base == 'G' || base == 'T' || base == '0' || base == 'N'); -} - -inline __device__ bool bad_qual(char base) { return (base == 'a' || base == 'c' || base == 'g' || base == 't'); } - -inline __device__ void inc_ext(char ext, ext_count_t kmer_count, ext_count_t *ext_counts) { - switch (ext) { - case 'A': atomicAddUint16_thres(&(ext_counts[0]), kmer_count, KCOUNT_MAX_KMER_COUNT); return; - case 'C': atomicAddUint16_thres(&(ext_counts[1]), kmer_count, KCOUNT_MAX_KMER_COUNT); return; - case 'G': atomicAddUint16_thres(&(ext_counts[2]), kmer_count, 
KCOUNT_MAX_KMER_COUNT); return; - case 'T': atomicAddUint16_thres(&(ext_counts[3]), kmer_count, KCOUNT_MAX_KMER_COUNT); return; - } -} - -template -__device__ bool get_kmer_from_supermer(SupermerBuff supermer_buff, uint32_t buff_len, int kmer_len, uint64_t *kmer, char &left_ext, - char &right_ext, count_t &count) { - unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; - int num_kmers = buff_len - kmer_len + 1; - if (threadid >= num_kmers) return false; - const int N_LONGS = KmerArray::N_LONGS; - if (!pack_seq_to_kmer(&(supermer_buff.seqs[threadid]), kmer_len, N_LONGS, kmer)) return false; - if (threadid + kmer_len >= buff_len) return false; // printf("out of bounds %d >= %d\n", threadid + kmer_len, buff_len); - left_ext = supermer_buff.seqs[threadid - 1]; - right_ext = supermer_buff.seqs[threadid + kmer_len]; - if (left_ext == '_' || right_ext == '_') return false; - if (!left_ext || !right_ext) return false; - if (supermer_buff.counts) { - count = supermer_buff.counts[threadid]; - } else { - count = 1; - if (bad_qual(left_ext)) left_ext = '0'; - if (bad_qual(right_ext)) right_ext = '0'; - } - if (!is_valid_base(left_ext)) { - printf("ERROR: threadid %d, invalid char for left nucleotide %d\n", threadid, (uint8_t)left_ext); - return false; - } - if (!is_valid_base(right_ext)) { - printf("ERROR: threadid %d, invalid char for right nucleotide %d\n", threadid, (uint8_t)right_ext); - return false; - } - uint64_t kmer_rc[N_LONGS]; - revcomp(kmer, kmer_rc, kmer_len, N_LONGS); - for (int l = 0; l < N_LONGS; l++) { - if (kmer_rc[l] == kmer[l]) continue; - if (kmer_rc[l] < kmer[l]) { - // swap - char tmp = left_ext; - left_ext = comp_nucleotide(right_ext); - right_ext = comp_nucleotide(tmp); - - // FIXME: we should be able to have a 0 extension even for revcomp - we do for non-revcomp - // if (!left_ext || !right_ext) return false; - - memcpy(kmer, kmer_rc, N_LONGS * sizeof(uint64_t)); - } - break; - } - return true; -} - -template -__device__ bool 
gpu_insert_kmer(KmerCountsMap elems, uint64_t hash_val, KmerArray &kmer, char left_ext, - char right_ext, char prev_left_ext, char prev_right_ext, count_t kmer_count, int &new_inserts, - int &dropped_inserts, bool ctg_kmers, bool use_qf, bool update_only) { - const int N_LONGS = KmerArray::N_LONGS; - uint64_t slot = hash_val % elems.capacity; - auto start_slot = slot; - const int MAX_PROBE = (elems.capacity < 200 ? elems.capacity : 200); - bool found_slot = false; - bool kmer_found_in_ht = false; - uint64_t old_key = KEY_TRANSITION; - for (int j = 0; j < MAX_PROBE; j++) { - // we have to be careful here not to end up with multiple threads on the same warp accessing the same slot, because - // that will cause a deadlock. So we loop over all statements in each CAS spin to ensure that all threads get a - // chance to execute - do { - old_key = atomicCAS((unsigned long long *)&(elems.keys[slot].longs[N_LONGS - 1]), KEY_EMPTY, KEY_TRANSITION); - if (old_key != KEY_TRANSITION) { - if (old_key == KEY_EMPTY) { - if (update_only) { - old_key = atomicExch((unsigned long long *)&(elems.keys[slot].longs[N_LONGS - 1]), KEY_EMPTY); - if (old_key != KEY_TRANSITION) printf("ERROR: old key should be KEY_TRANSITION\n"); - return false; - } - kmer_set(elems.keys[slot], kmer); - found_slot = true; - } else if (old_key == kmer.longs[N_LONGS - 1]) { - if (kmers_equal(elems.keys[slot], kmer)) { - found_slot = true; - kmer_found_in_ht = true; - } - } - } - } while (old_key == KEY_TRANSITION); - if (found_slot) break; - // quadratic probing - worse cache but reduced clustering - slot = (start_slot + j * j) % elems.capacity; - // this entry didn't get inserted because we ran out of probing time (and probably space) - if (j == MAX_PROBE - 1) dropped_inserts++; - } - if (found_slot) { - ext_count_t *ext_counts = elems.vals[slot].ext_counts; - if (ctg_kmers) { - // the count is the min of all counts. 
Use CAS to deal with the initial zero value - int prev_count = atomicCAS(&elems.vals[slot].kmer_count, 0, kmer_count); - if (prev_count) - atomicMin(&elems.vals[slot].kmer_count, kmer_count); - else - new_inserts++; - } else { - assert(kmer_count == 1); - int prev_count = atomicAdd(&elems.vals[slot].kmer_count, kmer_count); - if (!prev_count) new_inserts++; - } - ext_count_t kmer_count_uint16 = min(kmer_count, UINT16_MAX); - inc_ext(left_ext, kmer_count_uint16, ext_counts); - inc_ext(right_ext, kmer_count_uint16, ext_counts + 4); - if (use_qf && !update_only && !kmer_found_in_ht && !ctg_kmers) { - // kmer was not in hash table, so it must have been found in the qf - // add the extensions from the previous entry stored in the qf - inc_ext(prev_left_ext, 1, ext_counts); - inc_ext(prev_right_ext, 1, ext_counts + 4); - // inc the overall kmer count - atomicAdd(&elems.vals[slot].kmer_count, 1); - } - } - return true; -} - -template -__global__ void gpu_insert_supermer_block(KmerCountsMap elems, SupermerBuff supermer_buff, uint32_t buff_len, int kmer_len, - bool ctg_kmers, InsertStats *insert_stats, quotient_filter::QF *qf) { - unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; - const int N_LONGS = KmerArray::N_LONGS; - int attempted_inserts = 0, dropped_inserts = 0, new_inserts = 0, num_unique_qf = 0; - if (threadid > 0 && threadid < buff_len) { - attempted_inserts++; - KmerArray kmer; - char left_ext, right_ext; - count_t kmer_count; - if (get_kmer_from_supermer(supermer_buff, buff_len, kmer_len, kmer.longs, left_ext, right_ext, kmer_count)) { - if (kmer.longs[N_LONGS - 1] == KEY_EMPTY) printf("ERROR: block equal to KEY_EMPTY\n"); - if (kmer.longs[N_LONGS - 1] == KEY_TRANSITION) printf("ERROR: block equal to KEY_TRANSITION\n"); - auto hash_val = kmer_hash(kmer); - char prev_left_ext = '0', prev_right_ext = '0'; - bool use_qf = (qf != nullptr); - bool update_only = (use_qf && !ctg_kmers); - bool updated = gpu_insert_kmer(elems, hash_val, kmer, left_ext, 
right_ext, prev_left_ext, prev_right_ext, kmer_count, - new_inserts, dropped_inserts, ctg_kmers, use_qf, update_only); - if (update_only && !updated) { - // not found in the hash table - look in the qf - quotient_filter::qf_returns qf_insert_result = quotient_filter::QF_ITEM_FOUND; - qf_insert_result = quotient_filter::insert_kmer(qf, hash_val, left_ext, right_ext, prev_left_ext, prev_right_ext); - if (qf_insert_result == quotient_filter::QF_ITEM_INSERTED) { - num_unique_qf++; - assert(prev_left_ext == '0' && prev_right_ext == '0'); - } else if (qf_insert_result == quotient_filter::QF_ITEM_FOUND) { - gpu_insert_kmer(elems, hash_val, kmer, left_ext, right_ext, prev_left_ext, prev_right_ext, kmer_count, new_inserts, - dropped_inserts, ctg_kmers, use_qf, false); - } - } - } - } - reduce(attempted_inserts, buff_len, &insert_stats->attempted); - reduce(dropped_inserts, buff_len, &insert_stats->dropped); - reduce(new_inserts, buff_len, &insert_stats->new_inserts); - reduce(num_unique_qf, buff_len, &insert_stats->num_unique_qf); -} - -template -struct HashTableGPUDriver::HashTableDriverState { - cudaEvent_t event; - QuickTimer insert_timer, kernel_timer; - quotient_filter::QF *qf = nullptr; -}; - -template -void KmerArray::set(const uint64_t *kmer) { - memcpy(longs, kmer, N_LONGS * sizeof(uint64_t)); -} - -template -void KmerCountsMap::init(int64_t ht_capacity) { - capacity = ht_capacity; - cudaErrchk(cudaMalloc(&keys, capacity * sizeof(KmerArray))); - cudaErrchk(cudaMemset((void *)keys, KEY_EMPTY_BYTE, capacity * sizeof(KmerArray))); - cudaErrchk(cudaMalloc(&vals, capacity * sizeof(CountsArray))); - cudaErrchk(cudaMemset(vals, 0, capacity * sizeof(CountsArray))); -} - -template -void KmerCountsMap::clear() { - cudaFree((void *)keys); - cudaFree(vals); -} - -template -void KmerExtsMap::init(int64_t ht_capacity) { - capacity = ht_capacity; - cudaErrchk(cudaMalloc(&keys, capacity * sizeof(KmerArray))); - cudaErrchk(cudaMemset((void *)keys, KEY_EMPTY_BYTE, capacity * 
sizeof(KmerArray))); - cudaErrchk(cudaMalloc(&vals, capacity * sizeof(CountExts))); - cudaErrchk(cudaMemset(vals, 0, capacity * sizeof(CountExts))); -} - -template -void KmerExtsMap::clear() { - cudaFree((void *)keys); - cudaFree(vals); -} - -template -HashTableGPUDriver::HashTableGPUDriver() {} - -template -void HashTableGPUDriver::init(int upcxx_rank_me, int upcxx_rank_n, int kmer_len, int max_elems, size_t gpu_avail_mem, - double &init_time, size_t &gpu_bytes_reqd, size_t &ht_bytes_used, size_t &qf_bytes_used, - bool use_qf) { - QuickTimer init_timer; - init_timer.start(); - this->upcxx_rank_me = upcxx_rank_me; - this->upcxx_rank_n = upcxx_rank_n; - this->kmer_len = kmer_len; - pass_type = READ_KMERS_PASS; - gpu_utils::set_gpu_device(upcxx_rank_me); - dstate = new HashTableDriverState(); - dstate->qf = nullptr; - // max ratio of singletons to dups - uint64_t max_elems_qf = max_elems * 5; - int nbits_qf = log2(max_elems_qf); - if (nbits_qf == 0) use_qf = false; - if (use_qf) { - qf_bytes_used = quotient_filter::qf_estimate_memory(nbits_qf); - double qf_avail_mem = gpu_avail_mem / 5; - // if (!upcxx_rank_me) - // cout << "QF nbits " << nbits_qf << " qf_avail_mem " << qf_avail_mem << " qf bytes used " << qf_bytes_used << "\n" ; - if (qf_bytes_used > qf_avail_mem) { - // For debugging OOMs - // size_t prev_bytes_used = qf_bytes_used; - // int prev_nbits = nbits_qf; - double factor = qf_avail_mem / qf_bytes_used; - size_t corrected_max_elems = (max_elems_qf * factor); - nbits_qf = log2(corrected_max_elems) - 1; - // drop bits further for really long kmers because the space requirements for the qf relative to the ht go down - if (kmer_len >= 96) nbits_qf--; - if (nbits_qf == 0) nbits_qf = 1; - qf_bytes_used = quotient_filter::qf_estimate_memory(nbits_qf); - if (!upcxx_rank_me) cout << "Corrected: QF nbits " << nbits_qf << " qf bytes used " << qf_bytes_used << "\n"; - /* - // uncomment to debug if crashing with OOM when allocating - cout << "****** QF nbits corrected 
to " << nbits_qf << " from " << prev_nbits << "\n"; - cout << "****** QF will take " << (qf_bytes_used / 1024 / 1024) << "MB instead of " << (prev_bytes_used / 1024 / 1024) - << "MB\n"; - */ - } else { - if (kmer_len >= 64) nbits_qf--; - } - quotient_filter::qf_malloc_device(&(dstate->qf), nbits_qf); - } - - // now check that we have sufficient memory for the required capacity - size_t elem_buff_size = KCOUNT_GPU_HASHTABLE_BLOCK_SIZE * (1 + sizeof(count_t)) * 1.5; - size_t elem_size = sizeof(KmerArray) + sizeof(CountsArray); - gpu_bytes_reqd = (max_elems * elem_size) + elem_buff_size + qf_bytes_used; - // save 1/5 of avail gpu memory for possible ctg kmers and compact hash table - // set capacity to max avail remaining from gpu memory - more slots means lower load - auto max_slots = (use_qf ? 0.6 : 0.8) * (gpu_avail_mem - elem_buff_size - qf_bytes_used) / elem_size; - // find the first prime number lower than this value - primes::Prime prime; - prime.set(min((size_t)max_slots, (size_t)(max_elems * 3)), false); - auto ht_capacity = prime.get(); - ht_bytes_used = ht_capacity * elem_size; - - // uncomment to debug OOMs - // cout << "ht bytes used " << (ht_bytes_used / 1024 / 1024) << "MB\n"; - - read_kmers_dev.init(ht_capacity); - // for transferring packed elements from host to gpu - elem_buff_host.seqs = new char[KCOUNT_GPU_HASHTABLE_BLOCK_SIZE]; - // these are not used for kmers from reads - elem_buff_host.counts = nullptr; - // buffer on the device - cudaErrchk(cudaMalloc(&packed_elem_buff_dev.seqs, KCOUNT_GPU_HASHTABLE_BLOCK_SIZE)); - cudaErrchk(cudaMalloc(&unpacked_elem_buff_dev.seqs, KCOUNT_GPU_HASHTABLE_BLOCK_SIZE * 2)); - packed_elem_buff_dev.counts = nullptr; - unpacked_elem_buff_dev.counts = nullptr; - - cudaErrchk(cudaMalloc(&gpu_insert_stats, sizeof(InsertStats))); - cudaErrchk(cudaMemset(gpu_insert_stats, 0, sizeof(InsertStats))); - - init_timer.stop(); - init_time = init_timer.get_elapsed(); -} - -template -void HashTableGPUDriver::init_ctg_kmers(int 
max_elems, size_t gpu_avail_mem) { - pass_type = CTG_KMERS_PASS; - // free up space - if (dstate->qf) quotient_filter::qf_destroy_device(dstate->qf); - dstate->qf = nullptr; - size_t elem_buff_size = KCOUNT_GPU_HASHTABLE_BLOCK_SIZE * (1 + sizeof(count_t)) * 1.5; - size_t elem_size = sizeof(KmerArray) + sizeof(CountsArray); - size_t max_slots = 0.97 * (gpu_avail_mem - elem_buff_size) / elem_size; - primes::Prime prime; - prime.set(min(max_slots, (size_t)(max_elems * 3)), false); - auto ht_capacity = prime.get(); - ctg_kmers_dev.init(ht_capacity); - elem_buff_host.counts = new count_t[KCOUNT_GPU_HASHTABLE_BLOCK_SIZE]; - cudaErrchk(cudaMalloc(&packed_elem_buff_dev.counts, KCOUNT_GPU_HASHTABLE_BLOCK_SIZE * sizeof(count_t))); - cudaErrchk(cudaMalloc(&unpacked_elem_buff_dev.counts, 2 * KCOUNT_GPU_HASHTABLE_BLOCK_SIZE * sizeof(count_t))); - cudaErrchk(cudaMemset(gpu_insert_stats, 0, sizeof(InsertStats))); -} - -template -HashTableGPUDriver::~HashTableGPUDriver() { - if (dstate) { - // this happens when there is no ctg kmers pass - if (dstate->qf) quotient_filter::qf_destroy_device(dstate->qf); - delete dstate; - } -} - -template -void HashTableGPUDriver::insert_supermer_block() { - dstate->insert_timer.start(); - bool is_ctg_kmers = (pass_type == CTG_KMERS_PASS); - cudaErrchk(cudaMemcpy(packed_elem_buff_dev.seqs, elem_buff_host.seqs, buff_len, cudaMemcpyHostToDevice)); - cudaErrchk(cudaMemset(unpacked_elem_buff_dev.seqs, 0, buff_len * 2)); - if (is_ctg_kmers) - cudaErrchk(cudaMemcpy(packed_elem_buff_dev.counts, elem_buff_host.counts, buff_len * sizeof(count_t), cudaMemcpyHostToDevice)); - - int gridsize, threadblocksize; - dstate->kernel_timer.start(); - get_kernel_config(buff_len, gpu_unpack_supermer_block, gridsize, threadblocksize); - gpu_unpack_supermer_block<<>>(unpacked_elem_buff_dev, packed_elem_buff_dev, buff_len); - get_kernel_config(buff_len * 2, gpu_insert_supermer_block, gridsize, threadblocksize); - // gridsize = gridsize * threadblocksize; - // 
threadblocksize = 1; - gpu_insert_supermer_block<<>>(is_ctg_kmers ? ctg_kmers_dev : read_kmers_dev, unpacked_elem_buff_dev, - buff_len * 2, kmer_len, is_ctg_kmers, gpu_insert_stats, dstate->qf); - // the kernel time is not going to be accurate, because we are not waiting for the kernel to complete - // need to uncomment the line below, which will decrease performance by preventing the overlap of GPU and CPU execution - cudaDeviceSynchronize(); - dstate->kernel_timer.stop(); - num_gpu_calls++; - dstate->insert_timer.stop(); -} - -template -void HashTableGPUDriver::insert_supermer(const string &supermer_seq, count_t supermer_count) { - if (buff_len + supermer_seq.length() + 1 >= KCOUNT_GPU_HASHTABLE_BLOCK_SIZE) { - insert_supermer_block(); - buff_len = 0; - } - memcpy(&(elem_buff_host.seqs[buff_len]), supermer_seq.c_str(), supermer_seq.length()); - if (pass_type == CTG_KMERS_PASS) { - for (int i = 0; i < (int)supermer_seq.length(); i++) elem_buff_host.counts[buff_len + i] = supermer_count; - } - buff_len += supermer_seq.length(); - elem_buff_host.seqs[buff_len] = '_'; - if (pass_type == CTG_KMERS_PASS) elem_buff_host.counts[buff_len] = 0; - buff_len++; -} - -template -void HashTableGPUDriver::purge_invalid(int &num_purged, int &num_entries) { - num_purged = num_entries = 0; - unsigned int *counts_gpu; - int NUM_COUNTS = 2; - cudaErrchk(cudaMalloc(&counts_gpu, NUM_COUNTS * sizeof(unsigned int))); - cudaErrchk(cudaMemset(counts_gpu, 0, NUM_COUNTS * sizeof(unsigned int))); - GPUTimer t; - int gridsize, threadblocksize; - get_kernel_config(read_kmers_dev.capacity, gpu_purge_invalid, gridsize, threadblocksize); - t.start(); - // now purge all invalid kmers (do it on the gpu) - gpu_purge_invalid<<>>(read_kmers_dev, counts_gpu); - t.stop(); - dstate->kernel_timer.inc(t.get_elapsed()); - - unsigned int counts_host[NUM_COUNTS]; - cudaErrchk(cudaMemcpy(&counts_host, counts_gpu, NUM_COUNTS * sizeof(unsigned int), cudaMemcpyDeviceToHost)); - num_purged = counts_host[0]; - 
num_entries = counts_host[1]; - auto expected_num_entries = read_kmers_stats.new_inserts - num_purged; - if (num_entries != (int)expected_num_entries) - cout << KLRED << "[" << upcxx_rank_me << "] WARNING mismatch " << num_entries << " != " << expected_num_entries << " diff " - << (num_entries - (int)expected_num_entries) << " new inserts " << read_kmers_stats.new_inserts << " num purged " - << num_purged << KNORM << endl; - read_kmers_dev.num = num_entries; -} - -template -void HashTableGPUDriver::flush_inserts() { - if (buff_len) { - insert_supermer_block(); - buff_len = 0; - } - cudaErrchk(cudaMemcpy(pass_type == READ_KMERS_PASS ? &read_kmers_stats : &ctg_kmers_stats, gpu_insert_stats, sizeof(InsertStats), - cudaMemcpyDeviceToHost)); -} - -template -void HashTableGPUDriver::done_all_inserts(int &num_dropped, int &num_unique, int &num_purged) { - int num_entries = 0; - purge_invalid(num_purged, num_entries); - read_kmers_dev.num = num_entries; - if (elem_buff_host.seqs) delete[] elem_buff_host.seqs; - if (elem_buff_host.counts) delete[] elem_buff_host.counts; - cudaFree(packed_elem_buff_dev.seqs); - cudaFree(unpacked_elem_buff_dev.seqs); - if (packed_elem_buff_dev.counts) cudaFree(packed_elem_buff_dev.counts); - if (unpacked_elem_buff_dev.counts) cudaFree(unpacked_elem_buff_dev.counts); - cudaFree(gpu_insert_stats); - // overallocate to reduce collisions - num_entries *= 1.3; - // now compact the hash table entries - unsigned int *counts_gpu; - int NUM_COUNTS = 2; - cudaErrchk(cudaMalloc(&counts_gpu, NUM_COUNTS * sizeof(unsigned int))); - cudaErrchk(cudaMemset(counts_gpu, 0, NUM_COUNTS * sizeof(unsigned int))); - KmerExtsMap compact_read_kmers_dev; - compact_read_kmers_dev.init(num_entries); - GPUTimer t; - int gridsize, threadblocksize; - get_kernel_config(read_kmers_dev.capacity, gpu_compact_ht, gridsize, threadblocksize); - t.start(); - gpu_compact_ht<<>>(read_kmers_dev, compact_read_kmers_dev, counts_gpu); - t.stop(); - 
dstate->kernel_timer.inc(t.get_elapsed()); - read_kmers_dev.clear(); - unsigned int counts_host[NUM_COUNTS]; - cudaErrchk(cudaMemcpy(&counts_host, counts_gpu, NUM_COUNTS * sizeof(unsigned int), cudaMemcpyDeviceToHost)); - cudaFree(counts_gpu); - num_dropped = counts_host[0]; - num_unique = counts_host[1]; - if (num_unique != read_kmers_dev.num) - cerr << KLRED << "[" << upcxx_rank_me << "] WARNING: " << KNORM - << "mismatch in expected entries " << num_unique << " != " << read_kmers_dev.num << "\n"; - // now copy the gpu hash table values across to the host - // We only do this once, which requires enough memory on the host to store the full GPU hash table, but since the GPU memory - // is generally a lot less than the host memory, it should be fine. - output_keys.resize(num_entries); - output_vals.resize(num_entries); - output_index = 0; - cudaErrchk(cudaMemcpy(output_keys.data(), (void *)compact_read_kmers_dev.keys, - compact_read_kmers_dev.capacity * sizeof(KmerArray), cudaMemcpyDeviceToHost)); - cudaErrchk(cudaMemcpy(output_vals.data(), compact_read_kmers_dev.vals, compact_read_kmers_dev.capacity * sizeof(CountExts), - cudaMemcpyDeviceToHost)); - compact_read_kmers_dev.clear(); -} - -template -void HashTableGPUDriver::done_ctg_kmer_inserts(int &attempted_inserts, int &dropped_inserts, int &new_inserts) { - unsigned int *counts_gpu; - int NUM_COUNTS = 3; - cudaErrchk(cudaMalloc(&counts_gpu, NUM_COUNTS * sizeof(unsigned int))); - cudaErrchk(cudaMemset(counts_gpu, 0, NUM_COUNTS * sizeof(unsigned int))); - GPUTimer t; - int gridsize, threadblocksize; - get_kernel_config(ctg_kmers_dev.capacity, gpu_merge_ctg_kmers, gridsize, threadblocksize); - t.start(); - gpu_merge_ctg_kmers<<>>(read_kmers_dev, ctg_kmers_dev, counts_gpu); - t.stop(); - dstate->kernel_timer.inc(t.get_elapsed()); - ctg_kmers_dev.clear(); - unsigned int counts_host[NUM_COUNTS]; - cudaErrchk(cudaMemcpy(&counts_host, counts_gpu, NUM_COUNTS * sizeof(unsigned int), cudaMemcpyDeviceToHost)); - 
cudaFree(counts_gpu); - attempted_inserts = counts_host[0]; - dropped_inserts = counts_host[1]; - new_inserts = counts_host[2]; - read_kmers_dev.num += new_inserts; - read_kmers_stats.new_inserts += new_inserts; -} - -template -void HashTableGPUDriver::get_elapsed_time(double &insert_time, double &kernel_time) { - insert_time = dstate->insert_timer.get_elapsed(); - kernel_time = dstate->kernel_timer.get_elapsed(); -} - -template -pair *, CountExts *> HashTableGPUDriver::get_next_entry() { - if (output_keys.empty() || output_index == output_keys.size()) return {nullptr, nullptr}; - output_index++; - return {&(output_keys[output_index - 1]), &(output_vals[output_index - 1])}; -} - -template -int64_t HashTableGPUDriver::get_capacity() { - if (pass_type == READ_KMERS_PASS) - return read_kmers_dev.capacity; - else - return ctg_kmers_dev.capacity; -} - -template -int64_t HashTableGPUDriver::get_final_capacity() { - return read_kmers_dev.capacity; -} - -template -InsertStats &HashTableGPUDriver::get_stats() { - if (pass_type == READ_KMERS_PASS) - return read_kmers_stats; - else - return ctg_kmers_stats; -} - -template -int HashTableGPUDriver::get_num_gpu_calls() { - return num_gpu_calls; -} - -template -double HashTableGPUDriver::get_qf_load_factor() { - if (!dstate->qf) return 0; - return (double)quotient_filter::host_qf_get_num_occupied_slots(dstate->qf) / quotient_filter::host_qf_get_nslots(dstate->qf); -} - -template class kcount_gpu::HashTableGPUDriver<32>; -#if MAX_BUILD_KMER >= 64 -template class kcount_gpu::HashTableGPUDriver<64>; -#endif -#if MAX_BUILD_KMER >= 96 -template class kcount_gpu::HashTableGPUDriver<96>; -#endif -#if MAX_BUILD_KMER >= 128 -template class kcount_gpu::HashTableGPUDriver<128>; -#endif -#if MAX_BUILD_KMER >= 160 -template class kcount_gpu::HashTableGPUDriver<160>; -#endif diff --git a/src/kcount/kcount-gpu/gpu_hash_table.hpp b/src/kcount/kcount-gpu/gpu_hash_table.hpp index 0861b86..e69de29 100644 --- 
a/src/kcount/kcount-gpu/gpu_hash_table.hpp +++ b/src/kcount/kcount-gpu/gpu_hash_table.hpp @@ -1,182 +0,0 @@ -#pragma once - -/* - HipMer v 2.0, Copyright (c) 2020, The Regents of the University of California, - through Lawrence Berkeley National Laboratory (subject to receipt of any required - approvals from the U.S. Dept. of Energy). All rights reserved." - - Redistribution and use in source and binary forms, with or without modification, - are permitted provided that the following conditions are met: - - (1) Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - (2) Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation and/or - other materials provided with the distribution. - - (3) Neither the name of the University of California, Lawrence Berkeley National - Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to - endorse or promote products derived from this software without specific prior - written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY - EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT - SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH - DAMAGE. 
- - You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades - to the features, functionality or performance of the source code ("Enhancements") to - anyone; however, if you choose to make your Enhancements available either publicly, - or directly to Lawrence Berkeley National Laboratory, without imposing a separate - written license agreement for such Enhancements, then you hereby grant the following - license: a non-exclusive, royalty-free perpetual license to install, use, modify, - prepare derivative works, incorporate into other computer software, distribute, and - sublicense such enhancements or derivative works thereof, in binary and source code - form. -*/ - -#include -#include -#include -#include - -#include "hash_funcs.h" - -namespace kcount_gpu { - -enum PASS_TYPE { READ_KMERS_PASS = 0, CTG_KMERS_PASS = 1 }; - -using count_t = uint32_t; -using ext_count_t = uint16_t; - -struct CountsArray { - count_t kmer_count; - ext_count_t ext_counts[8]; -}; - -struct CountExts { - count_t count; - int8_t left, right; -}; - -template -struct KmerArray { - static const int N_LONGS = (MAX_K + 31) / 32; - uint64_t longs[N_LONGS]; - - void set(const uint64_t *x); -}; - -struct SupermerBuff { - char *seqs; - count_t *counts; -}; - -// Bytes used per element: -// k = 21: 8+20 = 28 -// k = 33, 55: 16+20 = 36 -// k = 77: 24+20 = 44 -// k = 99: 32+20 = 52 -template -struct KmerCountsMap { - // Arrays for keys and values. 
They are separate because the keys get initialized with max number and the vals with zero - KmerArray *keys = nullptr; - CountsArray *vals = nullptr; - int64_t capacity = 0; - int num = 0; - - void init(int64_t ht_capacity); - void clear(); -}; - -template -struct KmerExtsMap { - KmerArray *keys = nullptr; - CountExts *vals = nullptr; - int64_t capacity = 0; - - void init(int64_t ht_capacity); - void clear(); -}; - -struct InsertStats { - unsigned int dropped = 0; - unsigned int attempted = 0; - unsigned int new_inserts = 0; - unsigned int num_unique_qf = 0; -}; - -template -class HashTableGPUDriver { - static const int N_LONGS = (MAX_K + 31) / 32; - struct HashTableDriverState; - // stores CUDA specific variables - HashTableDriverState *dstate = nullptr; - - int upcxx_rank_me; - int upcxx_rank_n; - int kmer_len; - int buff_len = 0; - std::vector> output_keys; - std::vector output_vals; - size_t output_index = 0; - - KmerCountsMap read_kmers_dev; - KmerCountsMap ctg_kmers_dev; - - // for buffering elements in the host memory - SupermerBuff elem_buff_host = {0}; - // for transferring host memory buffer to device - SupermerBuff unpacked_elem_buff_dev = {0}; - SupermerBuff packed_elem_buff_dev = {0}; - - InsertStats read_kmers_stats; - InsertStats ctg_kmers_stats; - InsertStats *gpu_insert_stats; - int num_gpu_calls = 0; - - void insert_supermer_block(); - void purge_invalid(int &num_purged, int &num_entries); - - public: - PASS_TYPE pass_type; - - HashTableGPUDriver(); - ~HashTableGPUDriver(); - - void init(int upcxx_rank_me, int upcxx_rank_n, int kmer_len, int max_elems, size_t gpu_avail_mem, double &init_time, - size_t &gpu_bytes_reqd, size_t &ht_bytes_used, size_t &qf_bytes_used, bool use_qf); - - void init_ctg_kmers(int max_elems, size_t gpu_avail_mem); - - void insert_supermer(const std::string &supermer_seq, count_t supermer_count); - - void flush_inserts(); - - void done_ctg_kmer_inserts(int &attempted_inserts, int &dropped_inserts, int &new_inserts); - - void 
done_all_inserts(int &num_dropped, int &num_unique, int &num_purged); - - std::pair *, CountExts *> get_next_entry(); - - void get_elapsed_time(double &insert_time, double &kernel_time); - - int64_t get_capacity(); - - int64_t get_final_capacity(); - - InsertStats &get_stats(); - - int get_num_gpu_calls(); - - double get_qf_load_factor(); -}; - -} // namespace kcount_gpu diff --git a/src/kcount/kcount-gpu/gqf.cpp b/src/kcount/kcount-gpu/gqf.cpp index 78ca1c9..e69de29 100644 --- a/src/kcount/kcount-gpu/gqf.cpp +++ b/src/kcount/kcount-gpu/gqf.cpp @@ -1,2920 +0,0 @@ -/* - * ============================================================================ - * - * Authors: Prashant Pandey - * Rob Johnson - * - * ============================================================================ - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// timing stuff -#include -#include -#include - -// how fast is a thrust sort? -#include -#include - -#include "hashutil.hpp" -#include "gqf.hpp" -#include "gqf_int.hpp" -#include "gpu-utils/gpu_common.hpp" - -#include - -/****************************************************************** - * Code for managing the metadata bits and slots w/o interpreting * - * the content of the slots. - ******************************************************************/ - -namespace quotient_filter { - -#define MAX_VALUE(nbits) ((1ULL << (nbits)) - 1) -#define BITMASK(nbits) ((nbits) == 64 ? 
0xffffffffffffffff : MAX_VALUE(nbits)) -#define NUM_SLOTS_TO_LOCK (1ULL << 13) -#define EXP_BEFORE_FAILURE -15 -#define CLUSTER_SIZE (1ULL << 14) -#define METADATA_WORD(qf, field, slot_index) \ - (get_block((qf), (slot_index) / QF_SLOTS_PER_BLOCK)->field[((slot_index) % QF_SLOTS_PER_BLOCK) / 64]) - -#define GET_NO_LOCK(flag) (flag & QF_NO_LOCK) -#define GET_TRY_ONCE_LOCK(flag) (flag & QF_TRY_ONCE_LOCK) -#define GET_WAIT_FOR_LOCK(flag) (flag & QF_WAIT_FOR_LOCK) -#define GET_KEY_HASH(flag) (flag & QF_KEY_IS_HASH) - -#define NUM_BUFFERS 10 -#define MAX_BUFFER_SIZE 100 -#define LOCK_DIST 1 - -#define CYCLES_PER_SECOND 1601000000 - -#define MAX_DEPTH 16 -#define SELECT_BOUND 32 - -#define DISTANCE_FROM_HOME_SLOT_CUTOFF 1000 -#define BILLION 1000000000L -#define CUDA_CHECK(ans) gpuAssert((ans), __FILE__, __LINE__); -inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { - if (code != cudaSuccess) { - printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} - -#ifdef DEBUG -#define PRINT_DEBUG 1 -#else -#define PRINT_DEBUG 0 -#endif - -#define DEBUG_CQF(fmt, ...) 
\ - do { \ - if (PRINT_DEBUG) printf(fmt, __VA_ARGS__); \ - } while (0) - -#define DEBUG_DUMP(qf) \ - do { \ - if (PRINT_DEBUG) qf_dump_metadata(qf); \ - } while (0) - -#if QF_BITS_PER_SLOT > 0 -__host__ __device__ static inline qfblock *get_block(const QF *qf, uint64_t block_index) { return &qf->blocks[block_index]; } -#else -__host__ __device__ static inline qfblock *get_block(const QF *qf, uint64_t block_index) { - return (qfblock *)(((char *)qf->blocks) + block_index * (sizeof(qfblock) + QF_SLOTS_PER_BLOCK * qf->metadata->bits_per_slot / 8)); -} -#endif -/* -__device__ static __inline__ unsigned long long rdtsc(void) -{ - unsigned hi, lo; - __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); - return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); -} -*/ -/* -__host__ __device__ static void modify_metadata(pc_t *metadata, int cnt) -{ - pc_add(metadata, cnt); - return; -} -*/ -/*changing sizes of register based on https://docs.nvidia.com/cuda/inline-ptx-assembly/index.html -l is for "l" = .u64 reg -*/ - -#ifdef __CUDA_ARCH__ -__constant__ char kmer_vals[6] = {'F', 'A', 'C', 'T', 'G', '0'}; -#else -const char kmer_vals[6] = {'F', 'A', 'C', 'T', 'G', '0'}; -#endif - -__host__ __device__ static inline int popcnt(uint64_t val) { -#ifdef __CUDA_ARCH__ - val = __popcll(val); -#else - -#ifndef __x86_64 - val = __builtin_popcount(val); - -#else - - asm("popcnt %[val], %[val]" : [val] "+r"(val) : : "cc"); - -#endif - -#endif - return val; -} - -/* -__device__ static inline int64_t bitscanreverse(uint64_t val) { - if (val == 0) { - return -1; - } else { - asm("bsr %[val], %[val]" : [val] "+l"(val) : :); - return val; - } -} -*/ - -__host__ __device__ static inline int popcntv(const uint64_t val, int ignore) { - if (ignore % 64) - return popcnt(val & ~BITMASK(ignore % 64)); - else - return popcnt(val); -} - -// Returns the number of 1s up to (and including) the pos'th bit -// Bits are numbered from 0 -__host__ __device__ static inline int bitrank(uint64_t val, 
int pos) { - val = val & ((2ULL << pos) - 1); -#ifdef __CUDA_ARCH__ - val = __popcll(val); -#else - - // quick fix for summit - -#ifndef __x86_64 - - val = __builtin_popcount(val); - -#else - - asm("popcnt %[val], %[val]" : [val] "+r"(val) : : "cc"); - -#endif - -#endif - return val; -} - -// moved dump functions -__host__ __device__ static inline void qf_dump_block(const QF *qf, uint64_t i) { - uint64_t j; - - // printf("Block %llu Runs from %llu to %llu\n", i, i * QF_SLOTS_PER_BLOCK, (i + 1) * QF_SLOTS_PER_BLOCK); - printf("Offset: %-192d", get_block(qf, i)->offset); - printf("\n"); - - for (j = 0; j < QF_SLOTS_PER_BLOCK; j++) printf("%02lx ", j); - printf("\n"); - - for (j = 0; j < QF_SLOTS_PER_BLOCK; j++) printf(" %d ", (get_block(qf, i)->occupieds[j / 64] & (1ULL << (j % 64))) ? 1 : 0); - printf("\n"); - - for (j = 0; j < QF_SLOTS_PER_BLOCK; j++) printf(" %d ", (get_block(qf, i)->runends[j / 64] & (1ULL << (j % 64))) ? 1 : 0); - printf("\n"); - -#if QF_BITS_PER_SLOT == 8 || QF_BITS_PER_SLOT == 16 || QF_BITS_PER_SLOT == 32 - for (j = 0; j < QF_SLOTS_PER_BLOCK; j++) printf("%02x ", get_block(qf, i)->slots[j]); -#elif QF_BITS_PER_SLOT == 64 - for (j = 0; j < QF_SLOTS_PER_BLOCK; j++) printf("%02lx ", get_block(qf, i)->slots[j]); -#else - for (j = 0; j < QF_SLOTS_PER_BLOCK * qf->metadata->bits_per_slot / 8; j++) printf("%02x ", get_block(qf, i)->slots[j]); -#endif - - printf("\n"); - - printf("\n"); -} - -__host__ __device__ void qf_dump_metadata(const QF *qf) { - printf("Slots: %lu Occupied: %lu Elements: %lu Distinct: %lu\n", qf->metadata->nslots, qf->metadata->noccupied_slots, - qf->metadata->nelts, qf->metadata->ndistinct_elts); - printf("Key_bits: %lu Value_bits: %lu Remainder_bits: %lu Bits_per_slot: %lu\n", qf->metadata->key_bits, qf->metadata->value_bits, - qf->metadata->key_remainder_bits, qf->metadata->bits_per_slot); -} - -__host__ __device__ void qf_dump(const QF *qf) { - uint64_t i; - - printf("%lu %lu %lu\n", qf->metadata->nblocks, 
qf->metadata->ndistinct_elts, qf->metadata->nelts); - - for (i = 0; i < qf->metadata->nblocks; i++) { - qf_dump_block(qf, i); - } -} - -/** - * Returns the position of the k-th 1 in the 64-bit word x. - * k is 0-based, so k=0 returns the position of the first 1. - * - * Uses the broadword selection algorithm by Vigna [1], improved by Gog - * and Petri [2] and Vigna [3]. - * - * [1] Sebastiano Vigna. Broadword Implementation of Rank/Select - * Queries. WEA, 2008 - * - * [2] Simon Gog, Matthias Petri. Optimized succinct data - * structures for massive data. Softw. Pract. Exper., 2014 - * - * [3] Sebastiano Vigna. MG4J 5.2.1. http://mg4j.di.unimi.it/ - * The following code is taken from - * https://github.com/facebook/folly/blob/b28186247104f8b90cfbe094d289c91f9e413317/folly/experimental/Select64.h - */ -__device__ __constant__ uint8_t gpukSelectInByte[2048] = { - 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, - 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, - 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, - 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, - 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, - 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 8, 8, - 8, 1, 8, 2, 2, 1, 8, 3, 3, 1, 3, 2, 2, 1, 8, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, - 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, - 4, 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 
4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 7, 7, - 1, 7, 2, 2, 1, 7, 3, 3, 1, 3, 2, 2, 1, 7, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, - 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, 4, - 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 8, 8, 8, - 8, 8, 8, 2, 8, 8, 8, 3, 8, 3, 3, 2, 8, 8, 8, 4, 8, 4, 4, 2, 8, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 5, 8, 5, 5, 2, 8, 5, 5, 3, 5, 3, 3, - 2, 8, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 6, 8, 6, 6, 2, 8, 6, 6, 3, 6, 3, 3, 2, 8, 6, 6, 4, 6, 4, 4, 2, 6, 4, - 4, 3, 4, 3, 3, 2, 8, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 7, 8, - 7, 7, 2, 8, 7, 7, 3, 7, 3, 3, 2, 8, 7, 7, 4, 7, 4, 4, 2, 7, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 5, 7, 5, 5, 2, 7, 5, 5, 3, 5, 3, 3, 2, - 7, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 6, 7, 6, 6, 2, 7, 6, 6, 3, 6, 3, 3, 2, 7, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4, - 3, 4, 3, 3, 2, 7, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 3, 8, - 8, 8, 5, 8, 5, 5, 4, 8, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 3, 8, 8, 8, 6, 8, 6, 6, 4, 8, 6, 6, 4, - 6, 4, 4, 3, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 3, 8, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, - 7, 8, 8, 8, 7, 8, 7, 7, 3, 8, 8, 8, 7, 8, 7, 7, 4, 8, 7, 7, 4, 7, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 3, 8, 7, - 7, 5, 7, 5, 5, 4, 7, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 3, 8, 7, 7, 6, 7, 6, 6, 4, 7, 6, 6, 4, 6, - 4, 4, 3, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 3, 7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 
8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, - 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, - 6, 4, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 7, - 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, - 4, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, - 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, - 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7}; - -#ifndef __CUDA_ARCH__ -const uint8_t hostkSelectInByte[2048] = { - 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, - 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, - 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, - 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, - 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, - 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 8, 8, - 8, 1, 8, 2, 2, 1, 8, 3, 3, 
1, 3, 2, 2, 1, 8, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, - 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, - 4, 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 7, 7, - 1, 7, 2, 2, 1, 7, 3, 3, 1, 3, 2, 2, 1, 7, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, - 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 7, 6, 6, 1, 6, 2, 2, 1, 6, 3, 3, 1, 3, 2, 2, 1, 6, 4, 4, 1, 4, 2, 2, 1, 4, - 3, 3, 1, 3, 2, 2, 1, 6, 5, 5, 1, 5, 2, 2, 1, 5, 3, 3, 1, 3, 2, 2, 1, 5, 4, 4, 1, 4, 2, 2, 1, 4, 3, 3, 1, 3, 2, 2, 1, 8, 8, 8, 8, - 8, 8, 8, 2, 8, 8, 8, 3, 8, 3, 3, 2, 8, 8, 8, 4, 8, 4, 4, 2, 8, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 5, 8, 5, 5, 2, 8, 5, 5, 3, 5, 3, 3, - 2, 8, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 6, 8, 6, 6, 2, 8, 6, 6, 3, 6, 3, 3, 2, 8, 6, 6, 4, 6, 4, 4, 2, 6, 4, - 4, 3, 4, 3, 3, 2, 8, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 7, 8, - 7, 7, 2, 8, 7, 7, 3, 7, 3, 3, 2, 8, 7, 7, 4, 7, 4, 4, 2, 7, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 5, 7, 5, 5, 2, 7, 5, 5, 3, 5, 3, 3, 2, - 7, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 7, 7, 6, 7, 6, 6, 2, 7, 6, 6, 3, 6, 3, 3, 2, 7, 6, 6, 4, 6, 4, 4, 2, 6, 4, 4, - 3, 4, 3, 3, 2, 7, 6, 6, 5, 6, 5, 5, 2, 6, 5, 5, 3, 5, 3, 3, 2, 6, 5, 5, 4, 5, 4, 4, 2, 5, 4, 4, 3, 4, 3, 3, 2, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 3, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 4, 8, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 3, 8, - 8, 8, 5, 8, 5, 5, 4, 8, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 3, 8, 8, 8, 6, 8, 6, 6, 4, 8, 6, 6, 4, - 6, 4, 4, 3, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 3, 8, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, - 7, 8, 8, 8, 7, 8, 7, 7, 3, 8, 8, 8, 7, 8, 7, 7, 4, 8, 7, 7, 
4, 7, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 3, 8, 7, - 7, 5, 7, 5, 5, 4, 7, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 3, 8, 7, 7, 6, 7, 6, 6, 4, 7, 6, 6, 4, 6, - 4, 4, 3, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 3, 7, 6, 6, 5, 6, 5, 5, 4, 6, 5, 5, 4, 5, 4, 4, 3, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, - 8, 8, 8, 8, 5, 8, 8, 8, 5, 8, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, - 6, 4, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, 8, 8, 8, 6, 8, 6, 6, 5, 8, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 7, - 8, 7, 7, 5, 8, 7, 7, 5, 7, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, - 4, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 7, 7, 6, 7, 6, 6, 5, 7, 6, 6, 5, 6, 5, 5, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 6, 8, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, - 8, 7, 8, 8, 8, 7, 8, 7, 7, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, - 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 7, 8, 7, 7, 6, 8, 7, 7, 6, 7, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 8, 8, 8, 8, 7, 8, 8, 8, 7, 8, 7, 7, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7}; -#endif - -__host__ __device__ static inline uint64_t _select64(uint64_t x, int k) { - if (k >= popcnt(x)) { - return 64; - } - - const uint64_t kOnesStep4 = 0x1111111111111111ULL; - const uint64_t kOnesStep8 = 0x0101010101010101ULL; - const uint64_t kMSBsStep8 = 0x80ULL * kOnesStep8; - - uint64_t s = x; - s = s - ((s & 0xA * kOnesStep4) >> 1); - s = (s & 0x3 * kOnesStep4) + ((s >> 2) & 0x3 * kOnesStep4); - s = (s + (s >> 4)) & 0xF * kOnesStep8; - uint64_t byteSums = s * kOnesStep8; - - uint64_t kStep8 = k * kOnesStep8; - uint64_t geqKStep8 = (((kStep8 | 
kMSBsStep8) - byteSums) & kMSBsStep8); - uint64_t place = popcnt(geqKStep8) * 8; - uint64_t byteRank = k - (((byteSums << 8) >> place) & (uint64_t)(0xFF)); -#ifdef __CUDA_ARCH__ - return place + gpukSelectInByte[((x >> place) & 0xFF) | (byteRank << 8)]; -#else - return place + hostkSelectInByte[((x >> place) & 0xFF) | (byteRank << 8)]; -#endif // __CUDA_ARCH__ -} - -// Returns the position of the rank'th 1. (rank = 0 returns the 1st 1) -// Returns 64 if there are fewer than rank+1 1s. -__host__ __device__ static inline uint64_t bitselect(uint64_t val, int rank) { -#ifdef __SSE4_2_ - uint64_t i = 1ULL << rank; - asm("pdep %[val], %[mask], %[val]" : [val] "+r"(val) : [mask] "r"(i)); - asm("tzcnt %[bit], %[index]" : [index] "=r"(i) : [bit] "g"(val) : "cc"); - return i; -#endif - return _select64(val, rank); -} - -__host__ __device__ static inline uint64_t bitselectv(const uint64_t val, int ignore, int rank) { - return bitselect(val & ~BITMASK(ignore % 64), rank); -} - -__host__ __device__ static inline int is_runend(const QF *qf, uint64_t index) { - return (METADATA_WORD(qf, runends, index) >> ((index % QF_SLOTS_PER_BLOCK) % 64)) & 1ULL; -} - -__host__ __device__ static inline int is_occupied(const QF *qf, uint64_t index) { - return (METADATA_WORD(qf, occupieds, index) >> ((index % QF_SLOTS_PER_BLOCK) % 64)) & 1ULL; -} - -#if QF_BITS_PER_SLOT == 8 || QF_BITS_PER_SLOT == 16 || QF_BITS_PER_SLOT == 32 || QF_BITS_PER_SLOT == 64 - -__host__ __device__ static inline uint64_t get_slot(const QF *qf, uint64_t index) { - // ERR: Index passed in is incorrect - // printf("slots %lu, index %lu\n", qf->metadata->nslots, index); - assert(index < qf->metadata->xnslots); - return get_block(qf, index / QF_SLOTS_PER_BLOCK)->slots[index % QF_SLOTS_PER_BLOCK]; -} - -__host__ __device__ static inline void set_slot(const QF *qf, uint64_t index, uint64_t value) { - assert(index < qf->metadata->xnslots); - get_block(qf, index / QF_SLOTS_PER_BLOCK)->slots[index % QF_SLOTS_PER_BLOCK] = value & 
BITMASK(qf->metadata->bits_per_slot); -} - -#elif QF_BITS_PER_SLOT > 0 - -/* Little-endian code .... Big-endian is TODO */ - -__host__ __device__ static inline uint64_t get_slot(const QF *qf, uint64_t index) { - /* Should use __uint128_t to support up to 64-bit remainders, but gcc seems - * to generate buggy code. :/ */ - // printf("Other get slot: slots %lu, index %lu\n", qf->metadata->nslots, index); - assert(index < qf->metadata->xnslots); - uint64_t *p = (uint64_t *)&get_block(qf, index / QF_SLOTS_PER_BLOCK)->slots[(index % QF_SLOTS_PER_BLOCK) * QF_BITS_PER_SLOT / 8]; - return (uint64_t)(((*p) >> (((index % QF_SLOTS_PER_BLOCK) * QF_BITS_PER_SLOT) % 8)) & BITMASK(QF_BITS_PER_SLOT)); -} - -__host__ __device__ static inline void set_slot(const QF *qf, uint64_t index, uint64_t value) { - /* Should use __uint128_t to support up to 64-bit remainders, but gcc seems - * to generate buggy code. :/ */ - assert(index < qf->metadata->xnslots); - uint64_t *p = (uint64_t *)&get_block(qf, index / QF_SLOTS_PER_BLOCK)->slots[(index % QF_SLOTS_PER_BLOCK) * QF_BITS_PER_SLOT / 8]; - uint64_t t = *p; - uint64_t mask = BITMASK(QF_BITS_PER_SLOT); - uint64_t v = value; - int shift = ((index % QF_SLOTS_PER_BLOCK) * QF_BITS_PER_SLOT) % 8; - mask <<= shift; - v <<= shift; - t &= ~mask; - t |= v; - *p = t; -} - -#else - -/* Little-endian code .... Big-endian is TODO */ - -__host__ __device__ static inline uint64_t get_slot(const QF *qf, uint64_t index) { - // rintf("Third get slot?!? slots %lu, index %lu\n", qf->metadata->nslots, index); - assert(index < qf->metadata->xnslots); - /* Should use __uint128_t to support up to 64-bit remainders, but gcc seems - * to generate buggy code. 
:/ */ - uint64_t *p = - (uint64_t *)&get_block(qf, index / QF_SLOTS_PER_BLOCK)->slots[(index % QF_SLOTS_PER_BLOCK) * qf->metadata->bits_per_slot / 8]; - return (uint64_t)(((*p) >> (((index % QF_SLOTS_PER_BLOCK) * qf->metadata->bits_per_slot) % 8)) & - BITMASK(qf->metadata->bits_per_slot)); -} - -__host__ __device__ static inline void set_slot(const QF *qf, uint64_t index, uint64_t value) { - assert(index < qf->metadata->xnslots); - /* Should use __uint128_t to support up to 64-bit remainders, but gcc seems - * to generate buggy code. :/ */ - uint64_t *p = - (uint64_t *)&get_block(qf, index / QF_SLOTS_PER_BLOCK)->slots[(index % QF_SLOTS_PER_BLOCK) * qf->metadata->bits_per_slot / 8]; - uint64_t t = *p; - uint64_t mask = BITMASK(qf->metadata->bits_per_slot); - uint64_t v = value; - int shift = ((index % QF_SLOTS_PER_BLOCK) * qf->metadata->bits_per_slot) % 8; - mask <<= shift; - v <<= shift; - t &= ~mask; - t |= v; - *p = t; -} - -#endif - -__host__ __device__ static inline uint64_t run_end(const QF *qf, uint64_t hash_bucket_index); - -__host__ __device__ static inline uint64_t block_offset(const QF *qf, uint64_t blockidx) { - /* If we have extended counters and a 16-bit (or larger) offset - field, then we can safely ignore the possibility of overflowing - that field. 
*/ - if (sizeof(qf->blocks[0].offset) > 1 || get_block(qf, blockidx)->offset < BITMASK(8 * sizeof(qf->blocks[0].offset))) - return get_block(qf, blockidx)->offset; - - return run_end(qf, QF_SLOTS_PER_BLOCK * blockidx - 1) - QF_SLOTS_PER_BLOCK * blockidx + 1; -} - -__host__ __device__ static inline uint64_t run_end(const QF *qf, uint64_t hash_bucket_index) { - uint64_t bucket_block_index = hash_bucket_index / QF_SLOTS_PER_BLOCK; - uint64_t bucket_intrablock_offset = hash_bucket_index % QF_SLOTS_PER_BLOCK; - uint64_t bucket_blocks_offset = block_offset(qf, bucket_block_index); - - uint64_t bucket_intrablock_rank = bitrank(get_block(qf, bucket_block_index)->occupieds[0], bucket_intrablock_offset); - - if (bucket_intrablock_rank == 0) { - if (bucket_blocks_offset <= bucket_intrablock_offset) - return hash_bucket_index; - else - return QF_SLOTS_PER_BLOCK * bucket_block_index + bucket_blocks_offset - 1; - } - - uint64_t runend_block_index = bucket_block_index + bucket_blocks_offset / QF_SLOTS_PER_BLOCK; - uint64_t runend_ignore_bits = bucket_blocks_offset % QF_SLOTS_PER_BLOCK; - uint64_t runend_rank = bucket_intrablock_rank - 1; - uint64_t runend_block_offset = bitselectv(get_block(qf, runend_block_index)->runends[0], runend_ignore_bits, runend_rank); - if (runend_block_offset == QF_SLOTS_PER_BLOCK) { - if (bucket_blocks_offset == 0 && bucket_intrablock_rank == 0) { - /* The block begins in empty space, and this bucket is in that region of - * empty space */ - return hash_bucket_index; - } else { - do { - runend_rank -= popcntv(get_block(qf, runend_block_index)->runends[0], runend_ignore_bits); - runend_block_index++; - runend_ignore_bits = 0; - runend_block_offset = bitselectv(get_block(qf, runend_block_index)->runends[0], runend_ignore_bits, runend_rank); - } while (runend_block_offset == QF_SLOTS_PER_BLOCK); - } - } - - uint64_t runend_index = QF_SLOTS_PER_BLOCK * runend_block_index + runend_block_offset; - if (runend_index < hash_bucket_index) - return 
hash_bucket_index; - else - return runend_index; -} - -__host__ __device__ static inline int offset_lower_bound(const QF *qf, uint64_t slot_index) { - const qfblock *b = get_block(qf, slot_index / QF_SLOTS_PER_BLOCK); - const uint64_t slot_offset = slot_index % QF_SLOTS_PER_BLOCK; - const uint64_t boffset = b->offset; - const uint64_t occupieds = b->occupieds[0] & BITMASK(slot_offset + 1); - - // printf("slot %llu, slot_offset %02lx, block offset %llu, occupieds: %d ", slot_index, slot_offset, boffset, popcnt(occupieds)); - assert(QF_SLOTS_PER_BLOCK == 64); - - // if (boffset < slot_offset) { - if (boffset <= slot_offset) { - const uint64_t runends = (b->runends[0] & BITMASK(slot_offset)) >> boffset; - // printf(" runends %d\n", popcnt(runends)); - // printf("boffset < slot_offset, runends %llu, popcnt(occupieds) %d, popcnt(runends) %d\n", runends, popcnt(occupieds), - // popcnt(runends)); printf("returning %d\n", popcnt(occupieds)-popcnt(runends)); - return popcnt(occupieds) - popcnt(runends); - } - // printf("\n"); - // printf("boffset > slot_offset, boffset-slotoffset %llu, popcnt(occupieds) %d\n", boffset-slot_offset, popcnt(occupieds)); - // printf("returning %d\n", boffset-slot_offset+popcnt(occupieds)); - return boffset - slot_offset + popcnt(occupieds); -} - -/* -__host__ __device__ static inline int offset_lower_bound_verbose(const QF *qf, uint64_t slot_index) { - const qfblock *b = get_block(qf, slot_index / QF_SLOTS_PER_BLOCK); - const uint64_t slot_offset = slot_index % QF_SLOTS_PER_BLOCK; - const uint64_t boffset = b->offset; - const uint64_t occupieds = b->occupieds[0] & BITMASK(slot_offset + 1); - - printf("slot %llu, slot_offset %02lx, block offset %llu, occupieds: %d ", slot_index, slot_offset, boffset, popcnt(occupieds)); - assert(QF_SLOTS_PER_BLOCK == 64); - if (boffset <= slot_offset) { - const uint64_t runends = (b->runends[0] & BITMASK(slot_offset)) >> boffset; - printf(" runends %d\n", popcnt(runends)); - // printf("boffset < slot_offset, 
runends %llu, popcnt(occupieds) %d, popcnt(runends) %d\n", runends, popcnt(occupieds), - // popcnt(runends)); - printf("returning %d\n", popcnt(occupieds) - popcnt(runends)); - return popcnt(occupieds) - popcnt(runends); - } - printf("\n"); - // printf("boffset > slot_offset, boffset-slotoffset %llu, popcnt(occupieds) %d\n", boffset-slot_offset, popcnt(occupieds)); - printf("returning %ld\n", boffset - slot_offset + popcnt(occupieds)); - return boffset - slot_offset + popcnt(occupieds); -} -*/ - -__host__ __device__ static inline int is_empty(const QF *qf, uint64_t slot_index) { - return offset_lower_bound(qf, slot_index) == 0; -} - -__host__ __device__ static inline int might_be_empty(const QF *qf, uint64_t slot_index) { - return !is_occupied(qf, slot_index) && !is_runend(qf, slot_index); -} - -/* -__device__ static inline int probably_is_empty(const QF *qf, uint64_t slot_index) { - return get_slot(qf, slot_index) == 0 && !is_occupied(qf, slot_index) && !is_runend(qf, slot_index); -}*/ - -/* -__host__ __device__ static inline uint64_t find_first_empty_slot_verbose(QF *qf, uint64_t from) { - printf("Starting find first - this will terminate in -1\n"); - qf_dump_block(qf, from / QF_SLOTS_PER_BLOCK); - do { - int t = offset_lower_bound_verbose(qf, from); - // get block of from - - if (t < 0) { - printf("Finding first empty slot. 
T: %d, from: %llu\n - block %llu", t, from, from / QF_SLOTS_PER_BLOCK); - qf_dump(qf); - } - assert(t >= 0); - if (t == 0) break; - from = from + t; - } while (1); - printf("Next empty slot: %llu", from); - return from; -} -*/ - -__host__ __device__ static inline uint64_t find_first_empty_slot(QF *qf, uint64_t from) { - uint64_t start_from = from; - - do { - int t = offset_lower_bound(qf, from); - // get block of from - - // if (t < 0){ - - // //this implies a failure in the code - you are going to - // find_first_empty_slot_verbose(qf, start_from); - - // } - - assert(t >= 0); - if (t == 0) break; - from = from + t; - } while (1); - - uint64_t bucket_start_from = start_from / NUM_SLOTS_TO_LOCK; - uint64_t end_start_from = from / NUM_SLOTS_TO_LOCK; - - // testing without this gate to check if we see speed improvements - if (end_start_from > bucket_start_from + 1) { - printf("Find first empty ran over a bucket: %lu\n", end_start_from - bucket_start_from); - } - - return from; -} - -__host__ __device__ static inline uint64_t shift_into_b(const uint64_t a, const uint64_t b, const int bstart, const int bend, - const int amount) { - const uint64_t a_component = bstart == 0 ? (a >> (64 - amount)) : 0; - const uint64_t b_shifted_mask = BITMASK(bend - bstart) << bstart; - const uint64_t b_shifted = ((b_shifted_mask & b) << amount) & b_shifted_mask; - const uint64_t b_mask = ~b_shifted_mask; - return a_component | b_shifted | (b & b_mask); -} - -// __device__ void* gpu_memmove(void* dst, const void* src, size_t n) -// { -// //printf("Launching memmove\n"); -// //todo: allocate space per thread for this buffer before launching the kernel -// void* temp_buffer = malloc(n); -// //maybe stack allocation? 
-// //void* temp_buffer = void* char[n]; -// // cudaMemcpyAsync(temp_buffer, src, n, cudaMemcpyDeviceToDevice); -// // cudaMemcpyAsync(dst, temp_buffer, n, cudaMemcpyDeviceToDevice); -// // //cudaFree(temp_buffer); -// // return dst; -// memcpy(temp_buffer, src, n); -// memcpy(dst, temp_buffer, n); - -// free(temp_buffer); - -// } - -// a variant of memmove that compares the two pointers -__device__ void gpu_memmove(void *dst, const void *src, size_t n) { - // printf("Launching memmove\n"); - // todo: allocate space per thread for this buffer before launching the kernel - - char *char_dst = (char *)dst; - char *char_src = (char *)src; - - // double check this, - // think it is just > since dst+n does not get copied - if (char_src + n > char_dst) { - // copy backwards - for (int i = n - 1; i >= 0; i--) { - char_dst[i] = char_src[i]; - } - - } else { - // copy regular - for (int i = 0; i < n; i++) { - char_dst[i] = char_src[i]; - } - } - - // free(temp_buffer); -} - -#if QF_BITS_PER_SLOT == 8 || QF_BITS_PER_SLOT == 16 || QF_BITS_PER_SLOT == 32 || QF_BITS_PER_SLOT == 64 - -__host__ __device__ static inline void shift_remainders(QF *qf, uint64_t start_index, uint64_t empty_index) { - uint64_t start_block = start_index / QF_SLOTS_PER_BLOCK; - uint64_t start_offset = start_index % QF_SLOTS_PER_BLOCK; - uint64_t empty_block = empty_index / QF_SLOTS_PER_BLOCK; - uint64_t empty_offset = empty_index % QF_SLOTS_PER_BLOCK; - - assert(start_index <= empty_index); - assert(empty_index < qf->metadata->xnslots); - - while (start_block < empty_block) { -#ifdef __CUDA_ARCH__ - gpu_memmove(&get_block(qf, empty_block)->slots[1], &get_block(qf, empty_block)->slots[0], - empty_offset * sizeof(qf->blocks[0].slots[0])); -#else - memmove(&get_block(qf, empty_block)->slots[1], &get_block(qf, empty_block)->slots[0], - empty_offset * sizeof(qf->blocks[0].slots[0])); -#endif - - get_block(qf, empty_block)->slots[0] = get_block(qf, empty_block - 1)->slots[QF_SLOTS_PER_BLOCK - 1]; - 
empty_block--; - empty_offset = QF_SLOTS_PER_BLOCK - 1; - } -#ifdef __CUDA_ARCH__ - gpu_memmove(&get_block(qf, empty_block)->slots[start_offset + 1], &get_block(qf, empty_block)->slots[start_offset], - (empty_offset - start_offset) * sizeof(qf->blocks[0].slots[0])); -#else - memmove(&get_block(qf, empty_block)->slots[start_offset + 1], &get_block(qf, empty_block)->slots[start_offset], - (empty_offset - start_offset) * sizeof(qf->blocks[0].slots[0])); -#endif -} - -#else - -#define REMAINDER_WORD(qf, i) \ - ((uint64_t *)&(get_block(qf, (i) / qf->metadata->bits_per_slot)->slots[8 * ((i) % qf->metadata->bits_per_slot)])) - -__host__ __device__ static inline void shift_remainders(QF *qf, const uint64_t start_index, const uint64_t empty_index) { - uint64_t last_word = (empty_index + 1) * qf->metadata->bits_per_slot / 64; - const uint64_t first_word = start_index * qf->metadata->bits_per_slot / 64; - int bend = ((empty_index + 1) * qf->metadata->bits_per_slot) % 64; - const int bstart = (start_index * qf->metadata->bits_per_slot) % 64; - - while (last_word != first_word) { - *REMAINDER_WORD(qf, last_word) = - shift_into_b(*REMAINDER_WORD(qf, last_word - 1), *REMAINDER_WORD(qf, last_word), 0, bend, qf->metadata->bits_per_slot); - last_word--; - bend = 64; - } - *REMAINDER_WORD(qf, last_word) = shift_into_b(0, *REMAINDER_WORD(qf, last_word), bstart, bend, qf->metadata->bits_per_slot); -} - -#endif - -__host__ __device__ static inline void find_next_n_empty_slots(QF *qf, uint64_t from, uint64_t n, uint64_t *indices) { - while (n) { - indices[--n] = find_first_empty_slot(qf, from); - from = indices[n] + 1; - } -} - -__host__ __device__ static inline void shift_slots(QF *qf, int64_t first, uint64_t last, uint64_t distance) { - int64_t i; - if (distance == 1) - shift_remainders(qf, first, last + 1); - else - for (i = last; i >= first; i--) set_slot(qf, i + distance, get_slot(qf, i)); -} - -__host__ __device__ static inline void shift_runends(QF *qf, int64_t first, uint64_t 
last, uint64_t distance) { - assert(last < qf->metadata->xnslots && distance < 64); - uint64_t first_word = first / 64; - uint64_t bstart = first % 64; - uint64_t last_word = (last + distance + 1) / 64; - uint64_t bend = (last + distance + 1) % 64; - - if (last_word != first_word) { - METADATA_WORD(qf, runends, 64 * last_word) = shift_into_b(METADATA_WORD(qf, runends, 64 * (last_word - 1)), - METADATA_WORD(qf, runends, 64 * last_word), 0, bend, distance); - bend = 64; - last_word--; - while (last_word != first_word) { - METADATA_WORD(qf, runends, 64 * last_word) = shift_into_b(METADATA_WORD(qf, runends, 64 * (last_word - 1)), - METADATA_WORD(qf, runends, 64 * last_word), 0, bend, distance); - last_word--; - } - } - METADATA_WORD(qf, runends, 64 * last_word) = shift_into_b(0, METADATA_WORD(qf, runends, 64 * last_word), bstart, bend, distance); -} - -__host__ __device__ static inline bool insert_replace_slots_and_shift_remainders_and_runends_and_offsets( - QF *qf, int operation, uint64_t bucket_index, uint64_t overwrite_index, const uint64_t *remainders, uint64_t total_remainders, - uint64_t noverwrites) { - uint64_t empties[67]; - uint64_t i; - int64_t j; - int64_t ninserts = total_remainders - noverwrites; - uint64_t insert_index = overwrite_index + noverwrites; - - if (ninserts > 0) { - /* First, shift things to create n empty spaces where we need them. 
*/ - find_next_n_empty_slots(qf, insert_index, ninserts, empties); - if (empties[0] >= qf->metadata->xnslots) { - return false; - } - for (j = 0; j < ninserts - 1; j++) shift_slots(qf, empties[j + 1] + 1, empties[j] - 1, j + 1); - shift_slots(qf, insert_index, empties[ninserts - 1] - 1, ninserts); - - for (j = 0; j < ninserts - 1; j++) shift_runends(qf, empties[j + 1] + 1, empties[j] - 1, j + 1); - shift_runends(qf, insert_index, empties[ninserts - 1] - 1, ninserts); - - for (i = noverwrites; i < total_remainders - 1; i++) - METADATA_WORD(qf, runends, overwrite_index + i) &= ~(1ULL << (((overwrite_index + i) % QF_SLOTS_PER_BLOCK) % 64)); - - switch (operation) { - case 0: /* insert into empty bucket */ - assert(noverwrites == 0); - METADATA_WORD(qf, runends, overwrite_index + total_remainders - 1) |= - 1ULL << (((overwrite_index + total_remainders - 1) % QF_SLOTS_PER_BLOCK) % 64); - break; - case 1: /* append to bucket */ - METADATA_WORD(qf, runends, overwrite_index + noverwrites - 1) &= - ~(1ULL << (((overwrite_index + noverwrites - 1) % QF_SLOTS_PER_BLOCK) % 64)); - METADATA_WORD(qf, runends, overwrite_index + total_remainders - 1) |= - 1ULL << (((overwrite_index + total_remainders - 1) % QF_SLOTS_PER_BLOCK) % 64); - break; - case 2: /* insert into bucket */ - METADATA_WORD(qf, runends, overwrite_index + total_remainders - 1) &= - ~(1ULL << (((overwrite_index + total_remainders - 1) % QF_SLOTS_PER_BLOCK) % 64)); - break; - default: printf("Invalid operation %d\n", operation); -#ifdef __CUDA_ARCH__ - __threadfence(); // ensure store issued before trap - asm("trap;"); -#else - abort(); -#endif - } - - uint64_t npreceding_empties = 0; - for (i = bucket_index / QF_SLOTS_PER_BLOCK + 1; i <= empties[0] / QF_SLOTS_PER_BLOCK; i++) { - while ((int64_t)npreceding_empties < ninserts && empties[ninserts - 1 - npreceding_empties] / QF_SLOTS_PER_BLOCK < i) - npreceding_empties++; - - if (get_block(qf, i)->offset + ninserts - npreceding_empties < BITMASK(8 * 
sizeof(qf->blocks[0].offset))) - get_block(qf, i)->offset += ninserts - npreceding_empties; - else - get_block(qf, i)->offset = (uint8_t)BITMASK(8 * sizeof(qf->blocks[0].offset)); - } - } - - for (i = 0; i < total_remainders; i++) set_slot(qf, overwrite_index + i, remainders[i]); - - // modify_metadata(&qf->runtimedata->pc_noccupied_slots, ninserts); - - return true; -} - -__host__ __device__ static inline int remove_replace_slots_and_shift_remainders_and_runends_and_offsets( - QF *qf, int operation, uint64_t bucket_index, uint64_t overwrite_index, const uint64_t *remainders, uint64_t total_remainders, - uint64_t old_length) { - uint64_t i; - - // Update the slots - for (i = 0; i < total_remainders; i++) set_slot(qf, overwrite_index + i, remainders[i]); - - // If this is the last thing in its run, then we may need to set a new runend bit - if (is_runend(qf, overwrite_index + old_length - 1)) { - if (total_remainders > 0) { - // If we're not deleting this entry entirely, then it will still the last entry in this run - METADATA_WORD(qf, runends, overwrite_index + total_remainders - 1) |= 1ULL << ((overwrite_index + total_remainders - 1) % 64); - } else if (overwrite_index > bucket_index && !is_runend(qf, overwrite_index - 1)) { - // If we're deleting this entry entirely, but it is not the first entry in this run, - // then set the preceding entry to be the runend - METADATA_WORD(qf, runends, overwrite_index - 1) |= 1ULL << ((overwrite_index - 1) % 64); - } - } - - // shift slots back one run at a time - uint64_t original_bucket = bucket_index; - uint64_t current_bucket = bucket_index; - uint64_t current_slot = overwrite_index + total_remainders; - uint64_t current_distance = old_length - total_remainders; - int ret_current_distance = current_distance; - - while (current_distance > 0) { - if (is_runend(qf, current_slot + current_distance - 1)) { - do { - current_bucket++; - } while (current_bucket < current_slot + current_distance && !is_occupied(qf, current_bucket)); 
- } - - if (current_bucket <= current_slot) { - set_slot(qf, current_slot, get_slot(qf, current_slot + current_distance)); - if (is_runend(qf, current_slot) != is_runend(qf, current_slot + current_distance)) - METADATA_WORD(qf, runends, current_slot) ^= 1ULL << (current_slot % 64); - current_slot++; - - } else if (current_bucket <= current_slot + current_distance) { - uint64_t i; - for (i = current_slot; i < current_slot + current_distance; i++) { - set_slot(qf, i, 0); - METADATA_WORD(qf, runends, i) &= ~(1ULL << (i % 64)); - } - - current_distance = current_slot + current_distance - current_bucket; - current_slot = current_bucket; - } else { - current_distance = 0; - } - } - - // reset the occupied bit of the hash bucket index if the hash is the - // only item in the run and is removed completely. - if (operation && !total_remainders) METADATA_WORD(qf, occupieds, bucket_index) &= ~(1ULL << (bucket_index % 64)); - - // update the offset bits. - // find the number of occupied slots in the original_bucket block. - // Then find the runend slot corresponding to the last run in the - // original_bucket block. - // Update the offset of the block to which it belongs. 
- uint64_t original_block = original_bucket / QF_SLOTS_PER_BLOCK; - if (old_length > total_remainders) { // we only update offsets if we shift/delete anything - while (1) { - uint64_t last_occupieds_hash_index = QF_SLOTS_PER_BLOCK * original_block + (QF_SLOTS_PER_BLOCK - 1); - uint64_t runend_index = run_end(qf, last_occupieds_hash_index); - // runend spans across the block - // update the offset of the next block - if (runend_index / QF_SLOTS_PER_BLOCK == original_block) { // if the run ends in the same block - if (get_block(qf, original_block + 1)->offset == 0) break; - get_block(qf, original_block + 1)->offset = 0; - } else { // if the last run spans across the block - if (get_block(qf, original_block + 1)->offset == (runend_index - last_occupieds_hash_index)) break; - get_block(qf, original_block + 1)->offset = (runend_index - last_occupieds_hash_index); - } - original_block++; - } - } - - // int num_slots_freed = old_length - total_remainders; - // modify_metadata(&qf->runtimedata->pc_noccupied_slots, -num_slots_freed); - /*qf->metadata->noccupied_slots -= (old_length - total_remainders);*/ - if (!total_remainders) { - // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, -1); - /*qf->metadata->ndistinct_elts--;*/ - } - - return ret_current_distance; -} - -/***************************************************************************** - * Code that uses the above to implement a QF with keys and inline counters. 
* - *****************************************************************************/ - -/* - Counter format: - 0 xs: - 1 x: x - 2 xs: xx - 3 0s: 000 - >2 xs: xbc...cx for x != 0, b < x, c != 0, x - >3 0s: 0c...c00 for c != 0 - */ -__host__ __device__ static inline uint64_t *encode_counter(QF *qf, uint64_t remainder, uint64_t counter, uint64_t *slots) { - uint64_t digit = remainder; - uint64_t base = (1ULL << qf->metadata->bits_per_slot) - 1; - uint64_t *p = slots; - - if (counter == 0) return p; - - *--p = remainder; - - if (counter == 1) return p; - - if (counter == 2) { - *--p = remainder; - return p; - } - - if (counter == 3 && remainder == 0) { - *--p = remainder; - *--p = remainder; - return p; - } - - if (counter == 3 && remainder > 0) { - *--p = 0; - *--p = remainder; - return p; - } - - if (remainder == 0) - *--p = remainder; - else - base--; - - if (remainder) - counter -= 3; - else - counter -= 4; - do { - digit = counter % base; - digit++; /* Zero not allowed */ - if (remainder && digit >= remainder) digit++; /* Cannot overflow since digit is mod 2^r-2 */ - *--p = digit; - counter /= base; - } while (counter); - - if (remainder && digit >= remainder) *--p = 0; - - *--p = remainder; - - return p; -} - -/* Returns the length of the encoding. -REQUIRES: index points to first slot of a counter. */ -__host__ __device__ static inline uint64_t decode_counter(const QF *qf, uint64_t index, uint64_t *remainder, uint64_t *count) { - uint64_t base; - uint64_t rem; - uint64_t cnt; - uint64_t digit; - uint64_t end; - - *remainder = rem = get_slot(qf, index); - - if (is_runend(qf, index)) { /* Entire run is "0" */ - *count = 1; - return index; - } - - digit = get_slot(qf, index + 1); - - if (is_runend(qf, index + 1)) { - *count = digit == rem ? 2 : 1; - return index + (digit == rem ? 1 : 0); - } - - if (rem > 0 && digit >= rem) { - *count = digit == rem ? 2 : 1; - return index + (digit == rem ? 
1 : 0); - } - - if (rem > 0 && digit == 0 && get_slot(qf, index + 2) == rem) { - *count = 3; - return index + 2; - } - - if (rem == 0 && digit == 0) { - if (get_slot(qf, index + 2) == 0) { - *count = 3; - return index + 2; - } else { - *count = 2; - return index + 1; - } - } - - cnt = 0; - base = (1ULL << qf->metadata->bits_per_slot) - (rem ? 2 : 1); - - end = index + 1; - while (digit != rem && !is_runend(qf, end)) { - if (digit > rem) digit--; - if (digit && rem) digit--; - cnt = cnt * base + digit; - - end++; - digit = get_slot(qf, end); - } - - if (rem) { - *count = cnt + 3; - return end; - } - - if (is_runend(qf, end) || get_slot(qf, end + 1) != 0) { - *count = 1; - return index; - } - - *count = cnt + 4; - return end + 1; -} - -/* return the next slot which corresponds to a - * different element - * */ -/* -__device__ static inline uint64_t next_slot(QF *qf, uint64_t current) { - uint64_t rem = get_slot(qf, current); - current++; - - while (get_slot(qf, current) == rem && current <= qf->metadata->nslots) { - current++; - } - return current; -} -*/ - -__host__ __device__ static inline qf_returns insert1_if_not_exists(QF *qf, __uint64_t hash, uint64_t *value) { - uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot); - uint64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot; - uint64_t hash_bucket_block_offset = hash_bucket_index % QF_SLOTS_PER_BLOCK; - - // printf("In insert1, Index is %llu, block_offset is %llu, remainder is %llu \n", hash_bucket_index, hash_bucket_block_offset, - // hash_remainder); - - if (is_empty(qf, hash_bucket_index) /* might_be_empty(qf, hash_bucket_index) && runend_index == hash_bucket_index */) { - METADATA_WORD(qf, runends, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64); - set_slot(qf, hash_bucket_index, hash_remainder); - METADATA_WORD(qf, occupieds, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64); - } else { - uint64_t runend_index = run_end(qf, hash_bucket_index); - int 
operation = 0; /* Insert into empty bucket */ - uint64_t insert_index = runend_index + 1; - uint64_t new_value = hash_remainder; - - /* printf("RUNSTART: %02lx RUNEND: %02lx\n", runstart_index, runend_index); */ - - uint64_t runstart_index = hash_bucket_index == 0 ? 0 : run_end(qf, hash_bucket_index - 1) + 1; - - if (is_occupied(qf, hash_bucket_index)) { - /* Find the counter for this remainder if it exists. */ - uint64_t current_remainder = get_slot(qf, runstart_index); - - // printf("Current remainder above: %llu\n", current_remainder); - *value = current_remainder & BITMASK(qf->metadata->value_bits); - // printf("Clipped remainder: %llu\n", *value); - - // return here? - // maybe qf_returns::QF_ITEM_FOUND - return QF_ITEM_FOUND; - - } // else { - // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1); - //} - - // Here is where we modify? - if (operation != 0) { - // extract - uint64_t current_remainder = get_slot(qf, runstart_index); - printf("Expecting an extraction, Current remainder: %llu\n", (unsigned long long)current_remainder); - } else { - printf("Expecting regular insert. \n"); - } - - if (operation >= 0) { - uint64_t empty_slot_index = find_first_empty_slot(qf, runend_index + 1); - if (empty_slot_index >= qf->metadata->xnslots) { - printf("Ran out of space. 
Total xnslots is %lu, first empty slot is %lu\n", qf->metadata->xnslots, empty_slot_index); - return QF_FULL; - } - shift_remainders(qf, insert_index, empty_slot_index); - - set_slot(qf, insert_index, new_value); - - shift_runends(qf, insert_index, empty_slot_index - 1, 1); - switch (operation) { - case 0: METADATA_WORD(qf, runends, insert_index) |= 1ULL << ((insert_index % QF_SLOTS_PER_BLOCK) % 64); break; - case 1: - METADATA_WORD(qf, runends, insert_index - 1) &= ~(1ULL << (((insert_index - 1) % QF_SLOTS_PER_BLOCK) % 64)); - METADATA_WORD(qf, runends, insert_index) |= 1ULL << ((insert_index % QF_SLOTS_PER_BLOCK) % 64); - break; - case 2: METADATA_WORD(qf, runends, insert_index) &= ~(1ULL << ((insert_index % QF_SLOTS_PER_BLOCK) % 64)); break; - default: printf("Invalid operation %d\n", operation); -#ifdef __CUDA_ARCH__ - __threadfence(); // ensure store issued before trap - asm("trap;"); -#else - abort(); -#endif - } - /* - * Increment the offset for each block between the hash bucket index - * and block of the empty slot - * */ - uint64_t i; - for (i = hash_bucket_index / QF_SLOTS_PER_BLOCK + 1; i <= empty_slot_index / QF_SLOTS_PER_BLOCK; i++) { - if (get_block(qf, i)->offset < BITMASK(8 * sizeof(qf->blocks[0].offset))) get_block(qf, i)->offset++; - assert(get_block(qf, i)->offset != 0); - } - } - METADATA_WORD(qf, occupieds, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64); - } - - // change here? 
- return QF_ITEM_INSERTED; -} - -__host__ __device__ static inline int insert1(QF *qf, __uint64_t hash, uint8_t runtime_lock) { - int ret_distance = 0; - uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot); - uint64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot; - uint64_t hash_bucket_block_offset = hash_bucket_index % QF_SLOTS_PER_BLOCK; - /* - if (GET_NO_LOCK(runtime_lock) != QF_NO_LOCK) { - if (!qf_lock(qf, hash_bucket_index, true, runtime_lock)) - return QF_COULDNT_LOCK; - } - */ - // printf("In insert1, Index is %llu, block_offset is %llu, remainder is %llu \n", hash_bucket_index, hash_bucket_block_offset, - // hash_remainder); - -#ifdef __CUDA_ARCH__ - atomicAdd((unsigned long long *)&qf->metadata->noccupied_slots, 1ULL); -#else - abort(); -#endif - - if (is_empty(qf, hash_bucket_index) /* might_be_empty(qf, hash_bucket_index) && runend_index == hash_bucket_index */) { - METADATA_WORD(qf, runends, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64); - set_slot(qf, hash_bucket_index, hash_remainder); - METADATA_WORD(qf, occupieds, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64); - - ret_distance = 0; - // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1); - // modify_metadata(&qf->runtimedata->pc_noccupied_slots, 1); - // modify_metadata(&qf->runtimedata->pc_nelts, 1); - } else { - uint64_t runend_index = run_end(qf, hash_bucket_index); - int operation = 0; /* Insert into empty bucket */ - uint64_t insert_index = runend_index + 1; - uint64_t new_value = hash_remainder; - - /* printf("RUNSTART: %02lx RUNEND: %02lx\n", runstart_index, runend_index); */ - - uint64_t runstart_index = hash_bucket_index == 0 ? 0 : run_end(qf, hash_bucket_index - 1) + 1; - - if (is_occupied(qf, hash_bucket_index)) { - /* Find the counter for this remainder if it exists. */ - uint64_t current_remainder = get_slot(qf, runstart_index); - uint64_t zero_terminator = runstart_index; - - /* The counter for 0 is special. 
*/ - if (current_remainder == 0) { - uint64_t t = runstart_index + 1; - while (t < runend_index && get_slot(qf, t) != 0) t++; - if (t < runend_index && get_slot(qf, t + 1) == 0) - zero_terminator = t + 1; /* Three or more 0s */ - else if (runstart_index < runend_index && get_slot(qf, runstart_index + 1) == 0) - zero_terminator = runstart_index + 1; /* Exactly two 0s */ - /* Otherwise, exactly one 0 (i.e. zero_terminator == runstart_index) */ - - /* May read past end of run, but that's OK because loop below - can handle that */ - if (hash_remainder != 0) { - runstart_index = zero_terminator + 1; - current_remainder = get_slot(qf, runstart_index); - } - } - - /* Skip over counters for other remainders. */ - while (current_remainder < hash_remainder && runstart_index <= runend_index) { - /* If this remainder has an extended counter, skip over it. */ - if (runstart_index < runend_index && get_slot(qf, runstart_index + 1) < current_remainder) { - runstart_index = runstart_index + 2; - while (runstart_index < runend_index && get_slot(qf, runstart_index) != current_remainder) runstart_index++; - runstart_index++; - - /* This remainder has a simple counter. */ - } else { - runstart_index++; - } - - /* This may read past the end of the run, but the while loop - condition will prevent us from using the invalid result in - that case. */ - current_remainder = get_slot(qf, runstart_index); - } - - /* If this is the first time we've inserted the new remainder, - and it is larger than any remainder in the run. */ - if (runstart_index > runend_index) { - operation = 1; - insert_index = runstart_index; - new_value = hash_remainder; - // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1); - - /* This is the first time we're inserting this remainder, but - there are larger remainders already in the run. 
*/ - } else if (current_remainder != hash_remainder) { - operation = 2; /* Inserting */ - insert_index = runstart_index; - new_value = hash_remainder; - // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1); - - /* Cases below here: we're incrementing the (simple or - extended) counter for this remainder. */ - - /* If there's exactly one instance of this remainder. */ - } else if (runstart_index == runend_index || (hash_remainder > 0 && get_slot(qf, runstart_index + 1) > hash_remainder) || - (hash_remainder == 0 && zero_terminator == runstart_index)) { - operation = 2; /* Insert */ - insert_index = runstart_index; - new_value = hash_remainder; - - /* If there are exactly two instances of this remainder. */ - } else if ((hash_remainder > 0 && get_slot(qf, runstart_index + 1) == hash_remainder) || - (hash_remainder == 0 && zero_terminator == runstart_index + 1)) { - operation = 2; /* Insert */ - insert_index = runstart_index + 1; - new_value = 0; - - /* Special case for three 0s */ - } else if (hash_remainder == 0 && zero_terminator == runstart_index + 2) { - operation = 2; /* Insert */ - insert_index = runstart_index + 1; - new_value = 1; - - /* There is an extended counter for this remainder. */ - } else { - /* Move to the LSD of the counter. */ - insert_index = runstart_index + 1; - while (get_slot(qf, insert_index + 1) != hash_remainder) insert_index++; - - /* Increment the counter. 
*/ - uint64_t digit, carry; - do { - carry = 0; - digit = get_slot(qf, insert_index); - // Convert a leading 0 (which is special) to a normal encoded digit - if (digit == 0) { - digit++; - if (digit == current_remainder) digit++; - } - - // Increment the digit - digit = (digit + 1) & BITMASK(qf->metadata->bits_per_slot); - - // Ensure digit meets our encoding requirements - if (digit == 0) { - digit++; - carry = 1; - } - if (digit == current_remainder) digit = (digit + 1) & BITMASK(qf->metadata->bits_per_slot); - if (digit == 0) { - digit++; - carry = 1; - } - - set_slot(qf, insert_index, digit); - insert_index--; - } while (insert_index > runstart_index && carry); - - /* If the counter needs to be expanded. */ - if (insert_index == runstart_index && (carry > 0 || (current_remainder != 0 && digit >= current_remainder))) { - operation = 2; /* insert */ - insert_index = runstart_index + 1; - if (!carry) /* To prepend a 0 before the counter if the MSD is greater than the rem */ - new_value = 0; - else if (carry) { /* Increment the new value because we don't use 0 to encode counters */ - new_value = 2; - /* If the rem is greater than or equal to the new_value then fail*/ - if (current_remainder > 0) assert(new_value < current_remainder); - } - } else { - operation = -1; - } - } - } // else { - // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1); - //} - - if (operation >= 0) { - uint64_t empty_slot_index = find_first_empty_slot(qf, runend_index + 1); - if (empty_slot_index >= qf->metadata->xnslots) { - printf("Ran out of space. 
Total xnslots is %lu, first empty slot is %lu\n", qf->metadata->xnslots, empty_slot_index); - return QF_NO_SPACE; - } - shift_remainders(qf, insert_index, empty_slot_index); - - set_slot(qf, insert_index, new_value); - ret_distance = insert_index - hash_bucket_index; - - shift_runends(qf, insert_index, empty_slot_index - 1, 1); - switch (operation) { - case 0: METADATA_WORD(qf, runends, insert_index) |= 1ULL << ((insert_index % QF_SLOTS_PER_BLOCK) % 64); break; - case 1: - METADATA_WORD(qf, runends, insert_index - 1) &= ~(1ULL << (((insert_index - 1) % QF_SLOTS_PER_BLOCK) % 64)); - METADATA_WORD(qf, runends, insert_index) |= 1ULL << ((insert_index % QF_SLOTS_PER_BLOCK) % 64); - break; - case 2: METADATA_WORD(qf, runends, insert_index) &= ~(1ULL << ((insert_index % QF_SLOTS_PER_BLOCK) % 64)); break; - default: printf("Invalid operation %d\n", operation); -#ifdef __CUDA_ARCH__ - __threadfence(); // ensure store issued before trap - asm("trap;"); -#else - abort(); -#endif - } - /* - * Increment the offset for each block between the hash bucket index - * and block of the empty slot - * */ - uint64_t i; - for (i = hash_bucket_index / QF_SLOTS_PER_BLOCK + 1; i <= empty_slot_index / QF_SLOTS_PER_BLOCK; i++) { - if (get_block(qf, i)->offset < BITMASK(8 * sizeof(qf->blocks[0].offset))) get_block(qf, i)->offset++; - assert(get_block(qf, i)->offset != 0); - } - // modify_metadata(&qf->runtimedata->pc_noccupied_slots, 1); - } - // modify_metadata(&qf->runtimedata->pc_nelts, 1); - METADATA_WORD(qf, occupieds, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64); - } - /* - if (GET_NO_LOCK(runtime_lock) != QF_NO_LOCK) { - qf_unlock(qf, hash_bucket_index, true); - } - */ - return ret_distance; -} - -__host__ __device__ static inline int insert(QF *qf, __uint64_t hash, uint64_t count, uint8_t runtime_lock) { - int ret_distance = 0; - uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot); - uint64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot; - 
uint64_t hash_bucket_block_offset = hash_bucket_index % QF_SLOTS_PER_BLOCK; - /*uint64_t hash_bucket_lock_offset = hash_bucket_index % NUM_SLOTS_TO_LOCK;*/ - /* - if (GET_NO_LOCK(runtime_lock) != QF_NO_LOCK) { - if (!qf_lock(qf, hash_bucket_index, false, runtime_lock)) - return QF_COULDNT_LOCK; - } - */ - uint64_t runend_index = run_end(qf, hash_bucket_index); - - /* Empty slot */ - if (might_be_empty(qf, hash_bucket_index) && runend_index == hash_bucket_index) { - METADATA_WORD(qf, runends, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64); - set_slot(qf, hash_bucket_index, hash_remainder); - METADATA_WORD(qf, occupieds, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64); - - // ERIC TODO: see if this metadata is needed--probably isn't compatible with GPU - // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1); - // modify_metadata(&qf->runtimedata->pc_noccupied_slots, 1); - // modify_metadata(&qf->runtimedata->pc_nelts, 1); - /* This trick will, I hope, keep the fast case fast. */ - if (count > 1) { - insert(qf, hash, count - 1, QF_NO_LOCK); - } - } else { /* Non-empty slot */ - uint64_t new_values[67]; - int64_t runstart_index = hash_bucket_index == 0 ? 0 : run_end(qf, hash_bucket_index - 1) + 1; - - bool ret; - if (!is_occupied(qf, hash_bucket_index)) { /* Empty bucket, but its slot is occupied. */ - uint64_t *p = encode_counter(qf, hash_remainder, count, &new_values[67]); - ret = insert_replace_slots_and_shift_remainders_and_runends_and_offsets(qf, 0, hash_bucket_index, runstart_index, p, - &new_values[67] - p, 0); - if (!ret) return QF_NO_SPACE; - // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1); - ret_distance = runstart_index - hash_bucket_index; - } else { /* Non-empty bucket */ - - uint64_t current_remainder, current_count, current_end; - - /* Find the counter for this remainder, if one exists. 
*/ - current_end = decode_counter(qf, runstart_index, ¤t_remainder, ¤t_count); - while (current_remainder < hash_remainder && !is_runend(qf, current_end)) { - runstart_index = current_end + 1; - current_end = decode_counter(qf, runstart_index, ¤t_remainder, ¤t_count); - } - - /* If we reached the end of the run w/o finding a counter for this remainder, - then append a counter for this remainder to the run. */ - if (current_remainder < hash_remainder) { - uint64_t *p = encode_counter(qf, hash_remainder, count, &new_values[67]); - ret = insert_replace_slots_and_shift_remainders_and_runends_and_offsets(qf, 1, /* Append to bucket */ hash_bucket_index, - current_end + 1, p, &new_values[67] - p, 0); - if (!ret) return QF_NO_SPACE; - // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1); - ret_distance = (current_end + 1) - hash_bucket_index; - /* Found a counter for this remainder. Add in the new count. */ - } else if (current_remainder == hash_remainder) { - uint64_t *p = encode_counter(qf, hash_remainder, current_count + count, &new_values[67]); - ret = insert_replace_slots_and_shift_remainders_and_runends_and_offsets( - qf, is_runend(qf, current_end) ? 1 : 2, hash_bucket_index, runstart_index, p, &new_values[67] - p, - current_end - runstart_index + 1); - if (!ret) return QF_NO_SPACE; - ret_distance = runstart_index - hash_bucket_index; - /* No counter for this remainder, but there are larger - remainders, so we're not appending to the bucket. 
*/ - } else { - uint64_t *p = encode_counter(qf, hash_remainder, count, &new_values[67]); - ret = insert_replace_slots_and_shift_remainders_and_runends_and_offsets(qf, 2, /* Insert to bucket */ - hash_bucket_index, runstart_index, p, - &new_values[67] - p, 0); - if (!ret) return QF_NO_SPACE; - // modify_metadata(&qf->runtimedata->pc_ndistinct_elts, 1); - ret_distance = runstart_index - hash_bucket_index; - } - } - METADATA_WORD(qf, occupieds, hash_bucket_index) |= 1ULL << (hash_bucket_block_offset % 64); - - // modify_metadata(&qf->runtimedata->pc_nelts, count); - } - /* - if (GET_NO_LOCK(runtime_lock) != QF_NO_LOCK) { - qf_unlock(qf, hash_bucket_index, false); - } - */ - return ret_distance; -} - -__host__ __device__ inline static int _remove(QF *qf, __uint64_t hash, uint64_t count, uint8_t runtime_lock) { - int ret_numfreedslots = 0; - uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot); - uint64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot; - uint64_t current_remainder, current_count, current_end; - uint64_t new_values[67]; - /* - if (GET_NO_LOCK(runtime_lock) != QF_NO_LOCK) { - if (!qf_lock(qf, hash_bucket_index, false, runtime_lock)) - return -2; - } - */ - - /* Empty bucket */ - if (!is_occupied(qf, hash_bucket_index)) return -1; - - uint64_t runstart_index = hash_bucket_index == 0 ? 
0 : run_end(qf, hash_bucket_index - 1) + 1; - uint64_t original_runstart_index = runstart_index; - int only_item_in_the_run = 0; - - /*Find the counter for this remainder, if one exists.*/ - current_end = decode_counter(qf, runstart_index, ¤t_remainder, ¤t_count); - while (current_remainder < hash_remainder && !is_runend(qf, current_end)) { - runstart_index = current_end + 1; - current_end = decode_counter(qf, runstart_index, ¤t_remainder, ¤t_count); - } - /* remainder not found in the given run */ - if (current_remainder != hash_remainder) return -1; - - if (original_runstart_index == runstart_index && is_runend(qf, current_end)) only_item_in_the_run = 1; - - /* endode the new counter */ - uint64_t *p = encode_counter(qf, hash_remainder, count > current_count ? 0 : current_count - count, &new_values[67]); - ret_numfreedslots = remove_replace_slots_and_shift_remainders_and_runends_and_offsets( - qf, only_item_in_the_run, hash_bucket_index, runstart_index, p, &new_values[67] - p, current_end - runstart_index + 1); - - // update the nelements. - // modify_metadata(&qf->runtimedata->pc_nelts, -count); - /*qf->metadata->nelts -= count;*/ - /* - if (GET_NO_LOCK(runtime_lock) != QF_NO_LOCK) { - qf_unlock(qf, hash_bucket_index, false); - } - */ - return ret_numfreedslots; -} - -/*********************************************************************** - * Code that uses the above to implement key-value-counter operations. 
* - ***********************************************************************/ - -__host__ uint64_t qf_estimate_memory(int nbits) { - uint64_t nslots = 1ULL << nbits; - -#ifdef DEBUG - uint64_t key_remainder_bits = QF_BITS_PER_REMAINDER; - assert(key_remainder_bits >= 2); - uint64_t value_bits = QF_BITS_PER_VALUE; - uint64_t bits_per_slot; -#endif - - // uint64_t num_slots, xnslots, nblocks; - uint64_t xnslots, nblocks; - uint64_t size; - uint64_t total_num_bytes; - - assert(popcnt(nslots) == 1); /* nslots must be a power of 2 */ - // num_slots = nslots; - xnslots = nslots + 10 * sqrt((double)nslots); - nblocks = (xnslots + QF_SLOTS_PER_BLOCK - 1) / QF_SLOTS_PER_BLOCK; - -#ifdef DEBUG - bits_per_slot = key_remainder_bits + value_bits; - assert(QF_BITS_PER_SLOT == 0 || QF_BITS_PER_SLOT == bits_per_slot); - assert(bits_per_slot > 1); -#endif - -#if QF_BITS_PER_SLOT == 8 || QF_BITS_PER_SLOT == 16 || QF_BITS_PER_SLOT == 32 || QF_BITS_PER_SLOT == 64 - size = nblocks * sizeof(qfblock); -#else - size = nblocks * (sizeof(qfblock) + QF_SLOTS_PER_BLOCK * bits_per_slot / 8); -#endif - - total_num_bytes = sizeof(qfmetadata) + size; - - return total_num_bytes; -} - -__host__ uint64_t qf_init(QF *qf, uint64_t nslots, uint64_t key_bits, uint64_t value_bits, enum qf_hashmode hash, uint32_t seed, - void *buffer, uint64_t buffer_len) { - uint64_t num_slots, xnslots, nblocks; - uint64_t key_remainder_bits, bits_per_slot; - uint64_t size; - uint64_t total_num_bytes; - - assert(popcnt(nslots) == 1); /* nslots must be a power of 2 */ - num_slots = nslots; - xnslots = nslots + 10 * sqrt((double)nslots); - nblocks = (xnslots + QF_SLOTS_PER_BLOCK - 1) / QF_SLOTS_PER_BLOCK; - key_remainder_bits = key_bits; - while (nslots > 1 && key_remainder_bits > 0) { - key_remainder_bits--; - nslots >>= 1; - } - assert(key_remainder_bits >= 2); - - bits_per_slot = key_remainder_bits + value_bits; - assert(QF_BITS_PER_SLOT == 0 || QF_BITS_PER_SLOT == bits_per_slot); - assert(bits_per_slot > 1); -#if 
QF_BITS_PER_SLOT == 8 || QF_BITS_PER_SLOT == 16 || QF_BITS_PER_SLOT == 32 || QF_BITS_PER_SLOT == 64 - size = nblocks * sizeof(qfblock); -#else - size = nblocks * (sizeof(qfblock) + QF_SLOTS_PER_BLOCK * bits_per_slot / 8); -#endif - - total_num_bytes = sizeof(qfmetadata) + size; - if (buffer == NULL || total_num_bytes > buffer_len) return total_num_bytes; - - // memset(buffer, 0, total_num_bytes); - qf->metadata = (qfmetadata *)(buffer); - qf->blocks = (qfblock *)(qf->metadata + 1); - - qf->metadata->magic_endian_number = MAGIC_NUMBER; - qf->metadata->reserved = 0; - qf->metadata->hash_mode = hash; - qf->metadata->total_size_in_bytes = size; - qf->metadata->seed = seed; - qf->metadata->nslots = num_slots; - qf->metadata->xnslots = xnslots; - qf->metadata->key_bits = key_bits; - qf->metadata->value_bits = value_bits; - qf->metadata->key_remainder_bits = key_remainder_bits; - qf->metadata->bits_per_slot = bits_per_slot; - - qf->metadata->range = qf->metadata->nslots; - qf->metadata->range <<= qf->metadata->key_remainder_bits; - qf->metadata->nblocks = (qf->metadata->xnslots + QF_SLOTS_PER_BLOCK - 1) / QF_SLOTS_PER_BLOCK; - qf->metadata->nelts = 0; - qf->metadata->ndistinct_elts = 0; - qf->metadata->noccupied_slots = 0; - - qf->runtimedata->num_locks = ((qf->metadata->xnslots / NUM_SLOTS_TO_LOCK) + 10) * LOCK_DIST; - - pc_init(&qf->runtimedata->pc_nelts, (int64_t *)&qf->metadata->nelts, 8, 100); - pc_init(&qf->runtimedata->pc_ndistinct_elts, (int64_t *)&qf->metadata->ndistinct_elts, 8, 100); - pc_init(&qf->runtimedata->pc_noccupied_slots, (int64_t *)&qf->metadata->noccupied_slots, 8, 100); - /* initialize container resize */ - qf->runtimedata->auto_resize = 0; - qf->runtimedata->container_resize = qf_resize_malloc; - /* initialize all the locks to 0 */ - qf->runtimedata->metadata_lock = 0; - // todo: copy this to GPU - - qf->runtimedata->locks = (uint16_t *)calloc(qf->runtimedata->num_locks, sizeof(uint16_t)); - if (qf->runtimedata->locks == NULL) { - perror("Couldn't 
allocate memory for runtime locks."); - exit(EXIT_FAILURE); - } -#ifdef LOG_WAIT_TIME - qf->runtimedata->wait_times = (wait_time_data *)calloc(qf->runtimedata->num_locks + 1, sizeof(wait_time_data)); - if (qf->runtimedata->wait_times == NULL) { - perror("Couldn't allocate memory for runtime wait_times."); - exit(EXIT_FAILURE); - } -#endif - - return total_num_bytes; -} - -__host__ uint64_t qf_use(QF *qf, void *buffer, uint64_t buffer_len) { - qf->metadata = (qfmetadata *)(buffer); - if (qf->metadata->total_size_in_bytes + sizeof(qfmetadata) > buffer_len) { - return qf->metadata->total_size_in_bytes + sizeof(qfmetadata); - } - qf->blocks = (qfblock *)(qf->metadata + 1); - - qf->runtimedata = (qfruntime *)calloc(sizeof(qfruntime), 1); - if (qf->runtimedata == NULL) { - perror("Couldn't allocate memory for runtime data."); - exit(EXIT_FAILURE); - } - /* initialize all the locks to 0 */ - qf->runtimedata->metadata_lock = 0; - qf->runtimedata->locks = (uint16_t *)calloc(qf->runtimedata->num_locks, sizeof(uint16_t)); - if (qf->runtimedata->locks == NULL) { - perror("Couldn't allocate memory for runtime locks."); - exit(EXIT_FAILURE); - } -#ifdef LOG_WAIT_TIME - qf->runtimedata->wait_times = (wait_time_data *)calloc(qf->runtimedata->num_locks + 1, sizeof(wait_time_data)); - if (qf->runtimedata->wait_times == NULL) { - perror("Couldn't allocate memory for runtime wait_times."); - exit(EXIT_FAILURE); - } -#endif - - return sizeof(qfmetadata) + qf->metadata->total_size_in_bytes; -} - -__host__ void *qf_destroy(QF *qf) { - assert(qf != NULL && "QF is NULL"); - assert(qf->runtimedata != NULL && "runtimedata for QF is NULL"); - if (qf->runtimedata->locks != NULL) free((void *)qf->runtimedata->locks); - if (qf->runtimedata->wait_times != NULL) free(qf->runtimedata->wait_times); - if (qf->runtimedata->f_info.filepath != NULL) free(qf->runtimedata->f_info.filepath); - free(qf->runtimedata); - - return (void *)qf->metadata; -} - -__host__ bool qf_malloc(QF *qf, uint64_t nslots, 
uint64_t key_bits, uint64_t value_bits, enum qf_hashmode hash, bool on_device, - uint32_t seed) { - uint64_t total_num_bytes = qf_init(qf, nslots, key_bits, value_bits, hash, seed, NULL, 0); - - // buffer malloc bad? - void *buffer = malloc(total_num_bytes); - memset(buffer, 0, total_num_bytes); - - if (buffer == NULL) { - perror("Couldn't allocate memory for the CQF."); - exit(EXIT_FAILURE); - } - - qf->runtimedata = (qfruntime *)calloc(sizeof(qfruntime), 1); - - if (qf->runtimedata == NULL) { - perror("Couldn't allocate memory for runtime data."); - exit(EXIT_FAILURE); - } - - uint64_t init_size = qf_init(qf, nslots, key_bits, value_bits, hash, seed, buffer, total_num_bytes); - - if (init_size == total_num_bytes) - return total_num_bytes; - else - return -1; -} - -__host__ bool qf_free(QF *qf) { - assert(qf->metadata != NULL); - void *buffer = qf_destroy(qf); - if (buffer != NULL) { - free(buffer); - return true; - } - - return false; -} - -__host__ void qf_free_gpu(QF *qf) { - QF hostQF; - - // cudaMallocHost((void **)&hostQF, sizeof(QF)); - - cudaMemcpy(&hostQF, qf, sizeof(QF), cudaMemcpyDeviceToHost); - - cudaFree(hostQF.runtimedata); - cudaFree(hostQF.metadata); - cudaFree(hostQF.blocks); - - cudaFree(qf); -} - -__host__ void qf_copy(QF *dest, const QF *src) { - DEBUG_CQF("%s\n", "Source CQF"); - DEBUG_DUMP(src); - memcpy(dest->runtimedata, src->runtimedata, sizeof(qfruntime)); - memcpy(dest->metadata, src->metadata, sizeof(qfmetadata)); - memcpy(dest->blocks, src->blocks, src->metadata->total_size_in_bytes); - DEBUG_CQF("%s\n", "Destination CQF after copy."); - DEBUG_DUMP(dest); -} - -__host__ void qf_reset(QF *qf) { - qf->metadata->nelts = 0; - qf->metadata->ndistinct_elts = 0; - qf->metadata->noccupied_slots = 0; - -#ifdef LOG_WAIT_TIME - memset(qf->wait_times, 0, (qf->runtimedata->num_locks + 1) * sizeof(wait_time_data)); -#endif -#if QF_BITS_PER_SLOT == 8 || QF_BITS_PER_SLOT == 16 || QF_BITS_PER_SLOT == 32 || QF_BITS_PER_SLOT == 64 - memset(qf->blocks, 
0, qf->metadata->nblocks * sizeof(qfblock)); -#else - memset(qf->blocks, 0, qf->metadata->nblocks * (sizeof(qfblock) + QF_SLOTS_PER_BLOCK * qf->metadata->bits_per_slot / 8)); -#endif -} - -__host__ int64_t qf_resize_malloc(QF *qf, uint64_t nslots) { - QF new_qf; - if (!qf_malloc(&new_qf, nslots, qf->metadata->key_bits, qf->metadata->value_bits, qf->metadata->hash_mode, false, - qf->metadata->seed)) - return -1; - if (qf->runtimedata->auto_resize) qf_set_auto_resize(&new_qf, true); - - // copy keys from qf into new_qf - QFi qfi; - qf_iterator_from_position(qf, &qfi, 0); - int64_t ret_numkeys = 0; - do { - uint64_t key, value, count; - qfi_get_hash(&qfi, &key, &value, &count); - qfi_next(&qfi); - int ret = qf_insert(&new_qf, key, value, count, QF_NO_LOCK | QF_KEY_IS_HASH); - if (ret < 0) { - printf("Failed to insert key: %ld into the new CQF.\n", key); - return ret; - } - ret_numkeys++; - } while (!qfi_end(&qfi)); - - qf_free(qf); - memcpy(qf, &new_qf, sizeof(QF)); - - return ret_numkeys; -} - -uint64_t qf_resize(QF *qf, uint64_t nslots, void *buffer, uint64_t buffer_len) { - printf("QF attempting resize - This will fail\n"); - QF new_qf; - new_qf.runtimedata = (qfruntime *)calloc(sizeof(qfruntime), 1); - if (new_qf.runtimedata == NULL) { - perror("Couldn't allocate memory for runtime data.\n"); - exit(EXIT_FAILURE); - } - - uint64_t init_size = qf_init(&new_qf, nslots, qf->metadata->key_bits, qf->metadata->value_bits, qf->metadata->hash_mode, - qf->metadata->seed, buffer, buffer_len); - - if (init_size > buffer_len) return init_size; - - if (qf->runtimedata->auto_resize) qf_set_auto_resize(&new_qf, true); - - // copy keys from qf into new_qf - QFi qfi; - qf_iterator_from_position(qf, &qfi, 0); - do { - uint64_t key, value, count; - qfi_get_hash(&qfi, &key, &value, &count); - qfi_next(&qfi); - int ret = qf_insert(&new_qf, key, value, count, QF_NO_LOCK | QF_KEY_IS_HASH); - if (ret < 0) { - printf("Failed to insert key: %ld into the new CQF.\n", key); - abort(); // 
kill kernel with error - } - } while (!qfi_end(&qfi)); - - qf_free(qf); - memcpy(qf, &new_qf, sizeof(QF)); - - return init_size; -} - -__host__ void qf_set_auto_resize(QF *qf, bool enabled) { - if (enabled) - qf->runtimedata->auto_resize = 1; - else - qf->runtimedata->auto_resize = 0; -} - -__host__ __device__ qf_returns qf_insert_not_exists(QF *qf, uint64_t key, uint64_t value, uint64_t count, uint8_t flags, - uint64_t *retvalue) { - // We fill up the CQF up to 95% load factor. - // This is a very conservative check. - - // TODO: GPU resizing - /* - if (qf_get_num_occupied_slots(qf) >= qf->metadata->nslots * 0.95) { - if (qf->runtimedata->auto_resize) { - fprintf(stdout, "Resizing the CQF.\n"); - if (qf->runtimedata->container_resize(qf, qf->metadata->nslots * 2) < 0) - { - fprintf(stderr, "Resizing the failed.\n"); - return QF_NO_SPACE; - } - } else - return QF_NO_SPACE; - } - */ - // if (count == 0) - // return 0; - - if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) { - if (qf->metadata->hash_mode == QF_HASH_DEFAULT) - key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range; - else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE) - key = hash_64(key, BITMASK(qf->metadata->key_bits)); - } - - uint64_t hash = (key << qf->metadata->value_bits) | (value & BITMASK(qf->metadata->value_bits)); - // printf("Inside insert, new hash is recorded as %llu\n", hash); - qf_returns ret = QF_FULL; - - if (count == 1) ret = insert1_if_not_exists(qf, hash, retvalue); - assert(count == 1); - return ret; -} - -__host__ __device__ int qf_insert(QF *qf, uint64_t key, uint64_t value, uint64_t count, uint8_t flags) { - // We fill up the CQF up to 95% load factor. - // This is a very conservative check. 
- - // TODO: GPU resizing - /* - if (qf_get_num_occupied_slots(qf) >= qf->metadata->nslots * 0.95) { - if (qf->runtimedata->auto_resize) { - fprintf(stdout, "Resizing the CQF.\n"); - if (qf->runtimedata->container_resize(qf, qf->metadata->nslots * 2) < 0) - { - fprintf(stderr, "Resizing the failed.\n"); - return QF_NO_SPACE; - } - } else - return QF_NO_SPACE; - } - */ - if (count == 0) return 0; - - if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) { - if (qf->metadata->hash_mode == QF_HASH_DEFAULT) - key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range; - else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE) - key = hash_64(key, BITMASK(qf->metadata->key_bits)); - } - - uint64_t hash = (key << qf->metadata->value_bits) | (value & BITMASK(qf->metadata->value_bits)); - // printf("Inside insert, new hash is recorded as %llu\n", hash); - int ret = QF_NO_SPACE; - if (count == 1) ret = insert1(qf, hash, flags); - assert(count == 1); - return ret; -} -/*------------------------ -GPU Modifications ---------------------------*/ - -// locking implementation for the 16 bit locks -// undefined behavior if you try to unlock a not locked lock -__device__ void lock_16(uint16_t *lock, uint64_t index) { - uint16_t zero = 0; - uint16_t one = 1; - - while (atomicCAS((uint16_t *)&lock[index * LOCK_DIST], zero, one) != zero) - ; -} - -__device__ void unlock_16(uint16_t *lock, uint64_t index) { - uint16_t zero = 0; - uint16_t one = 1; - - atomicCAS((uint16_t *)&lock[index * LOCK_DIST], one, zero); -} - -// lock_16 but built to be included as a piece of a while loop -// this is more in line with traditional cuda processing, may increase throughput -__device__ bool try_lock_16(uint16_t *lock, uint64_t index) { - uint16_t zero = 0; - uint16_t one = 1; - - if (atomicCAS((uint16_t *)&lock[index * LOCK_DIST], zero, one) == zero) return true; - return false; -} - -// TODO: it might expect a short int instead of uint16_t -// TODO: needs to be 32 bits (whoops) 
-__device__ uint16_t get_lock(volatile uint32_t *lock, int index) { - // set lock to 1 to claim - // returns 0 if success - uint32_t zero = 0; - uint32_t one = 1; - return atomicCAS((uint32_t *)&lock[index], zero, one); -} - -// synchronous lock so that we can acquire multiple locks - -// __device__ uint16_t get_lock_wait(uint32_t * locks, int index){ - -// uint16_t result = 1; - -// do { - -// result = get_lock(locks, index); - -// } while (result !=0); - -// return result; - -// } - -/* -__device__ uint16_t unlock(volatile uint32_t *lock, int index) { - // set lock to 0 to release - uint32_t zero = 0; - uint32_t one = 1; - // TODO: might need a __threadfence(); - lock[index] = 0; -} -*/ - -//__device__ void __bitonic_sort(uint64_t * array, uint64_t low, uint64_t n, uint64_t idx, bool dir); - -// consolidate all of the device construction into one convenient func! -__host__ void qf_malloc_device(QF **qf, int nbits) { - // bring in compile #define - int rbits = 8; - int vbits = 8; - - QF host_qf; - QF temp_device_qf; - - QF *temp_dev_ptr; - - uint64_t nslots = 1ULL << nbits; - int num_hash_bits = nbits + rbits; - - qf_malloc(&host_qf, nslots, num_hash_bits, vbits, QF_HASH_INVERTIBLE, false, 0); - qf_set_auto_resize(&host_qf, false); - - qfruntime *_runtime; - qfmetadata *_metadata; - qfblock *_blocks; - - uint16_t *dev_locks; - - cudaMalloc((void **)&dev_locks, host_qf.runtimedata->num_locks * sizeof(uint16_t)); - - cudaMemset(dev_locks, 0, host_qf.runtimedata->num_locks * sizeof(uint16_t)); - - // wipe and replace - free(host_qf.runtimedata->locks); - host_qf.runtimedata->locks = dev_locks; - - cudaMalloc((void **)&_runtime, sizeof(qfruntime)); - cudaMalloc((void **)&_metadata, sizeof(qfmetadata)); - cudaMalloc((void **)&_blocks, qf_get_total_size_in_bytes(&host_qf)); - - cudaMemcpy(_runtime, host_qf.runtimedata, sizeof(qfruntime), cudaMemcpyHostToDevice); - cudaMemcpy(_metadata, host_qf.metadata, sizeof(qfmetadata), cudaMemcpyHostToDevice); - cudaMemcpy(_blocks, 
host_qf.blocks, qf_get_total_size_in_bytes(&host_qf), cudaMemcpyHostToDevice); - - temp_device_qf.runtimedata = _runtime; - temp_device_qf.metadata = _metadata; - temp_device_qf.blocks = _blocks; - - // this might be buggy - // request to fill the dev ptr with a QF, then copy over, then copy that to qf - cudaMalloc((void **)&temp_dev_ptr, sizeof(QF)); - - cudaMemcpy(temp_dev_ptr, &temp_device_qf, sizeof(QF), cudaMemcpyHostToDevice); - - *qf = temp_dev_ptr; -} - -__host__ void qf_destroy_device(QF *qf) { - QF *host_qf; - cudaMallocHost((void **)&host_qf, sizeof(QF)); - - cudaMemcpy(host_qf, qf, sizeof(QF), cudaMemcpyDeviceToHost); - - qfruntime *_runtime; - - cudaMallocHost((void **)&_runtime, sizeof(qfruntime)); - - cudaMemcpy(_runtime, host_qf->runtimedata, sizeof(qfruntime), cudaMemcpyDeviceToHost); - - // may need to have _runtimedata shunted into another host object - // ill synchronize before this to double check - assert(_runtime != NULL); - if (_runtime->locks != NULL) cudaFree(_runtime->locks); - - if (_runtime->wait_times != NULL) cudaFree(_runtime->wait_times); - - // this one may break - if (_runtime->f_info.filepath != NULL) cudaFree(host_qf->runtimedata->f_info.filepath); - - cudaFree(host_qf->runtimedata); - - cudaFree(host_qf->metadata); - cudaFree(host_qf->blocks); - - cudaFreeHost(host_qf); - cudaFreeHost(_runtime); -} - -// __host__ void init_device_locks(uint16_t ** locks, uint64_t nbits){ - -// uint16_t * temp_locks; - -// uint64_t nslots = 1ULL << nbits; - -// uint64_t xnslots = nslots+10*sqrt((double)nslots); - -// cudaMalloc((void **)&temp_locks, (((xnslots-1)/NUM_SLOTS_TO_LOCK+1)+10)*sizeof(uint16_t)); - -// cudaMemset(temp_locks, 0, (((xnslots-1)/NUM_SLOTS_TO_LOCK+1)+10)*sizeof(uint16_t)); - -// *locks = temp_locks; - -// } - -// convert a counter with -__host__ __device__ uint8_t encode_kmer_counter(uint8_t *counter) { - uint8_t base = 0; - - // A is 000 0 - // C is 001 1 - // T is 010 2 - // G is 011 3 - // F is 100 4 - - for (uint8_t i = 
0; i < 5; i++) { - if (counter[i]) { - // printf("Front %d: %0x", i, i<<5); - base += i << 5; - } - - if (counter[i + 5]) { - base += i << 2; - } - } - - return base; -} - -// convert a counter with -__host__ __device__ uint8_t encode_chars(char fwd, char back) { - uint8_t base = 0; - - // encodings of kmers relative to inputs, - // if you want to change this modify the const array - // kmer_vals in gqf.cu. F is unused and only exists to prevent crashes - - // F is 000 0 - // A is 001 1 - // C is 010 2 - // T is 011 3 - // G is 100 4 - // 0/NULL is 101 5 - - for (uint8_t i = 0; i < 5; i++) { - if (kmer_vals[i] == fwd) { - // printf("Front %d: %0x", i, i<<5); - base += i << 3; - } - - if (kmer_vals[i] == back) { - base += i; - } - } - - return base; -} - -// convert a counter with -__host__ __device__ void decode_chars(uint8_t stored, char &fwd, char &back) { - // NULL is 000 0 - // A is 001 1 - // C is 010 2 - // T is 011 3 - // G is 100 4 - // 0 is 101 5 - - uint8_t upper = stored >> 3; - uint8_t lower = stored & 7; - - fwd = kmer_vals[upper]; - back = kmer_vals[lower]; - - if (fwd == 'F') fwd = '0'; - if (back == 'F') back = '0'; -} - -__host__ __device__ void decode_kmer_counter(uint8_t *counter, uint8_t stored) { - uint8_t upper = stored >> 5; - - uint8_t lower = (stored & (15)) >> 2; - - // printf("Upper %x, lower %x\n", upper, lower); - - counter[upper] += 1; - counter[lower + 5] += 1; -} - -__host__ __device__ bool is_encodable(uint8_t *counter) { - int count = 0; - - for (int i = 0; i < 5; i++) { - count += counter[i]; - } - - if (count > 1) return false; - - return true; -} - -// finalized version of locking kmer insert -// uses 10 bits 6 bits remainder/val pairings -__device__ qf_returns insert_kmer(QF *qf, uint64_t hash, char forward, char backward, char &returnedfwd, char &returnedback) { - uint8_t encoded = encode_chars(forward, backward); - uint8_t query; - uint64_t bigquery; - - hash = hash % qf->metadata->range; - - uint64_t hash_bucket_index = hash 
>> qf->metadata->key_remainder_bits; - uint64_t lock_index = hash_bucket_index / NUM_SLOTS_TO_LOCK; - - // encode extensions outside of the lock - - lock_16(qf->runtimedata->locks, lock_index); - lock_16(qf->runtimedata->locks, lock_index + 1); - - int found = qf_query(qf, hash, &bigquery, QF_NO_LOCK | QF_KEY_IS_HASH); - - query = bigquery; - - if (found == 0) - qf_insert(qf, hash, encoded, 1, QF_NO_LOCK | QF_KEY_IS_HASH); - else - decode_chars(query, returnedfwd, returnedback); - - __threadfence(); - unlock_16(qf->runtimedata->locks, lock_index + 1); - unlock_16(qf->runtimedata->locks, lock_index); - - if (found == 1) return QF_ITEM_FOUND; - return QF_ITEM_INSERTED; -} - -__device__ qf_returns insert_kmer_not_exists(QF *qf, uint64_t hash, char forward, char backward, char &returnedfwd, - char &returnedback) { - uint8_t encoded = encode_chars(forward, backward); - uint8_t query; - uint64_t bigquery; - - hash = hash % qf->metadata->range; - - uint64_t hash_bucket_index = hash >> qf->metadata->key_remainder_bits; - uint64_t lock_index = hash_bucket_index / NUM_SLOTS_TO_LOCK; - - // encode extensions outside of the lock - - lock_16(qf->runtimedata->locks, lock_index); - lock_16(qf->runtimedata->locks, lock_index + 1); - - // uint64_t query; - // int found = qf_query(qf, hash, &bigquery, QF_NO_LOCK | QF_KEY_IS_HASH); - // printf("being inserted/checked: %d\n", encoded); - qf_returns ret = qf_insert_not_exists(qf, hash, encoded, 1, QF_NO_LOCK | QF_KEY_IS_HASH, &bigquery); - - __threadfence(); - unlock_16(qf->runtimedata->locks, lock_index + 1); - unlock_16(qf->runtimedata->locks, lock_index); - - // cast down - query = bigquery; - if (ret == QF_ITEM_FOUND) { - decode_chars(query, returnedfwd, returnedback); - } - // obvious cast for clarity - return ret; -} - -// given a kmer we want to look for, and an encoded char, insert it and retreive a copy if it exists -// returns 1 if not found since they won't interfere with any unique combos -__device__ uint8_t 
insert_kmer_with_lock(QF *qf, uint64_t hash, uint8_t val) { - uint8_t query; - - // ha hire me pls google - uint64_t bigquery; - - // uint64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot; - uint64_t hash_bucket_index = hash >> qf->metadata->key_remainder_bits; - uint64_t lock_index = hash_bucket_index / NUM_SLOTS_TO_LOCK; - - lock_16(qf->runtimedata->locks, lock_index); - lock_16(qf->runtimedata->locks, lock_index + 1); - - // figure out flags here - // QF NO lock and QF KEY_IS_HASH - int found = qf_query(qf, hash, &bigquery, QF_NO_LOCK | QF_KEY_IS_HASH); - - // implicit casts, data from bigquery should always fit in uint8_t - query = bigquery; - - if (found == 0) { - qf_insert(qf, hash, val, 1, QF_NO_LOCK | QF_KEY_IS_HASH); - query = 1U; - } - - __threadfence(); - - unlock_16(qf->runtimedata->locks, lock_index + 1); - unlock_16(qf->runtimedata->locks, lock_index); - - return query; -} - -// perform a bitwise operatiojn, check if query has been seen at least once already -// this is indicated by the 2 to last bit being set to 1 -__device__ bool seen_once(uint8_t query) { - // looking for bit 0000 0010 - - uint8_t lower = (query & (2)) >> 1; - - printf("query val %x\n", lower); - - // implicit cast - return lower; -} - -__device__ uint8_t set_seen(uint8_t query) { return (query | 2); } - -// __global__ void insert_one_kmer_kernel(QF* qf, uint64_t hash, uint8_t val, uint16_t * locks){ - -// uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x; - -// if (tid != 0) return; - -// printf("Returned: %x\n", insert_kmer_with_lock(qf, hash, val, locks)); - -// } -/* -__global__ void insert_multi_kmer_kernel(QF *qf, uint64_t *hashes, uint8_t *firsts, uint8_t *seconds, uint64_t nitems, - uint64_t *counter) { - uint64_t tid = threadIdx.x + blockIdx.x * blockDim.x; - - if (tid >= nitems) return; - - uint8_t one = firsts[tid]; - uint8_t two = seconds[tid]; - - // if this fails the random gen is messed up - char fwd; - char back; - - if (insert_kmer(qf, hashes[tid], 
kmer_vals[one], kmer_vals[two - 5], fwd, back)) { - atomicAdd((unsigned long long *)counter, (unsigned long long)1); - } -} -*/ - -__host__ __device__ int qf_set_count(QF *qf, uint64_t key, uint64_t value, uint64_t count, uint8_t flags) { - if (count == 0) return 0; - - uint64_t cur_count = qf_count_key_value(qf, key, value, flags); - int64_t delta = count - cur_count; - - int ret; - if (delta == 0) - ret = 0; - else if (delta > 0) - ret = qf_insert(qf, key, value, delta, flags); - else - ret = qf_remove(qf, key, value, labs(delta), flags); - - return ret; -} - -__host__ __device__ int qf_remove(QF *qf, uint64_t key, uint64_t value, uint64_t count, uint8_t flags) { - if (count == 0) return true; - - if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) { - if (qf->metadata->hash_mode == QF_HASH_DEFAULT) - key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range; - else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE) - key = hash_64(key, BITMASK(qf->metadata->key_bits)); - } - uint64_t hash = (key << qf->metadata->value_bits) | (value & BITMASK(qf->metadata->value_bits)); - return _remove(qf, hash, count, flags); -} - -__host__ __device__ int qf_delete_key_value(QF *qf, uint64_t key, uint64_t value, uint8_t flags) { - uint64_t count = qf_count_key_value(qf, key, value, flags); - if (count == 0) return true; - - if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) { - if (qf->metadata->hash_mode == QF_HASH_DEFAULT) - key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range; - else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE) - key = hash_64(key, BITMASK(qf->metadata->key_bits)); - } - uint64_t hash = (key << qf->metadata->value_bits) | (value & BITMASK(qf->metadata->value_bits)); - return _remove(qf, hash, count, flags); -} - -__host__ __device__ uint64_t qf_count_key_value(const QF *qf, uint64_t key, uint64_t value, uint8_t flags) { - if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) { - if (qf->metadata->hash_mode == 
QF_HASH_DEFAULT) - key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range; - else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE) - key = hash_64(key, BITMASK(qf->metadata->key_bits)); - } - - uint64_t hash = (key << qf->metadata->value_bits) | (value & BITMASK(qf->metadata->value_bits)); - uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot); - int64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot; - - if (!is_occupied(qf, hash_bucket_index)) return 0; - - int64_t runstart_index = hash_bucket_index == 0 ? 0 : run_end(qf, hash_bucket_index - 1) + 1; - if (runstart_index < hash_bucket_index) runstart_index = hash_bucket_index; - - /* printf("MC RUNSTART: %02lx RUNEND: %02lx\n", runstart_index, runend_index); */ - - uint64_t current_remainder, current_count, current_end; - do { - current_end = decode_counter(qf, runstart_index, ¤t_remainder, ¤t_count); - if (current_remainder == hash_remainder) return current_count; - runstart_index = current_end + 1; - } while (!is_runend(qf, current_end)); - - return 0; -} - -__host__ __device__ uint64_t qf_query(const QF *qf, uint64_t key, uint64_t *value, uint8_t flags) { - if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) { - if (qf->metadata->hash_mode == QF_HASH_DEFAULT) - key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range; - else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE) - key = hash_64(key, BITMASK(qf->metadata->key_bits)); - } - uint64_t hash = key; - uint64_t hash_remainder = hash & BITMASK(qf->metadata->key_remainder_bits); - int64_t hash_bucket_index = hash >> qf->metadata->key_remainder_bits; - - if (!is_occupied(qf, hash_bucket_index)) return 0; - - int64_t runstart_index = hash_bucket_index == 0 ? 
0 : run_end(qf, hash_bucket_index - 1) + 1; - if (runstart_index < hash_bucket_index) runstart_index = hash_bucket_index; - - /* printf("MC RUNSTART: %02lx RUNEND: %02lx\n", runstart_index, runend_index); */ - - uint64_t current_remainder, current_count, current_end; - do { - current_end = decode_counter(qf, runstart_index, ¤t_remainder, ¤t_count); - *value = current_remainder & BITMASK(qf->metadata->value_bits); - current_remainder = current_remainder >> qf->metadata->value_bits; - if (current_remainder == hash_remainder) { - return current_count; - } - runstart_index = current_end + 1; - } while (!is_runend(qf, current_end)); - - return 0; -} - -__host__ __device__ int64_t qf_get_unique_index(const QF *qf, uint64_t key, uint64_t value, uint8_t flags) { - if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) { - if (qf->metadata->hash_mode == QF_HASH_DEFAULT) - key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range; - else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE) - key = hash_64(key, BITMASK(qf->metadata->key_bits)); - } - uint64_t hash = (key << qf->metadata->value_bits) | (value & BITMASK(qf->metadata->value_bits)); - uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot); - int64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot; - - if (!is_occupied(qf, hash_bucket_index)) return QF_DOESNT_EXIST; - - int64_t runstart_index = hash_bucket_index == 0 ? 
0 : run_end(qf, hash_bucket_index - 1) + 1; - if (runstart_index < hash_bucket_index) runstart_index = hash_bucket_index; - - /* printf("MC RUNSTART: %02lx RUNEND: %02lx\n", runstart_index, runend_index); */ - - uint64_t current_remainder, current_count, current_end; - do { - current_end = decode_counter(qf, runstart_index, ¤t_remainder, ¤t_count); - if (current_remainder == hash_remainder) return runstart_index; - - runstart_index = current_end + 1; - } while (!is_runend(qf, current_end)); - - return QF_DOESNT_EXIST; -} - -enum qf_hashmode qf_get_hashmode(const QF *qf) { return qf->metadata->hash_mode; } -uint64_t qf_get_hash_seed(const QF *qf) { return qf->metadata->seed; } -__uint64_t qf_get_hash_range(const QF *qf) { return qf->metadata->range; } - -bool qf_is_auto_resize_enabled(const QF *qf) { - if (qf->runtimedata->auto_resize == 1) return true; - return false; -} -uint64_t qf_get_total_size_in_bytes(const QF *qf) { return qf->metadata->total_size_in_bytes; } -uint64_t qf_get_nslots(const QF *qf) { return qf->metadata->nslots; } -uint64_t qf_get_num_occupied_slots(const QF *qf) { - pc_sync(&qf->runtimedata->pc_noccupied_slots); - return qf->metadata->noccupied_slots; -} - -// need to pull metadata from qf, and nslots from metadata -__host__ uint64_t host_qf_get_nslots(const QF *qf) { - QF *host_qf; - CUDA_CHECK(cudaMallocHost((void **)&host_qf, sizeof(QF))); - CUDA_CHECK(cudaMemcpy(host_qf, qf, sizeof(QF), cudaMemcpyDeviceToHost)); - qfmetadata *_metadata; - CUDA_CHECK(cudaMallocHost((void **)&_metadata, sizeof(qfmetadata))); - CUDA_CHECK(cudaMemcpy(_metadata, host_qf->metadata, sizeof(qfmetadata), cudaMemcpyDeviceToHost)); - uint64_t toReturn = _metadata->nslots; - CUDA_CHECK(cudaFreeHost(_metadata)); - CUDA_CHECK(cudaFreeHost(host_qf)); - return toReturn; -} - -__host__ uint64_t host_qf_get_num_occupied_slots(const QF *qf) { - QF *host_qf; - CUDA_CHECK(cudaMallocHost((void **)&host_qf, sizeof(QF))); - CUDA_CHECK(cudaMemcpy(host_qf, qf, sizeof(QF), 
cudaMemcpyDeviceToHost)); - qfmetadata *_metadata; - CUDA_CHECK(cudaMallocHost((void **)&_metadata, sizeof(qfmetadata))); - CUDA_CHECK(cudaMemcpy(_metadata, host_qf->metadata, sizeof(qfmetadata), cudaMemcpyDeviceToHost)); - uint64_t toReturn = _metadata->noccupied_slots; - CUDA_CHECK(cudaFreeHost(_metadata)); - CUDA_CHECK(cudaFreeHost(host_qf)); - return toReturn; -} - -uint64_t qf_get_num_key_bits(const QF *qf) { return qf->metadata->key_bits; } -uint64_t qf_get_num_value_bits(const QF *qf) { return qf->metadata->value_bits; } -uint64_t qf_get_num_key_remainder_bits(const QF *qf) { return qf->metadata->key_remainder_bits; } -uint64_t qf_get_bits_per_slot(const QF *qf) { return qf->metadata->bits_per_slot; } - -uint64_t qf_get_sum_of_counts(const QF *qf) { - pc_sync(&qf->runtimedata->pc_nelts); - return qf->metadata->nelts; -} -uint64_t qf_get_num_distinct_key_value_pairs(const QF *qf) { - pc_sync(&qf->runtimedata->pc_ndistinct_elts); - return qf->metadata->ndistinct_elts; -} - -void qf_sync_counters(const QF *qf) { - pc_sync(&qf->runtimedata->pc_ndistinct_elts); - pc_sync(&qf->runtimedata->pc_nelts); - pc_sync(&qf->runtimedata->pc_noccupied_slots); -} - -/* initialize the iterator at the run corresponding - * to the position index - */ -int64_t qf_iterator_from_position(const QF *qf, QFi *qfi, uint64_t position) { - if (position == 0xffffffffffffffff) { - qfi->current = 0xffffffffffffffff; - qfi->qf = qf; - return QFI_INVALID; - } - assert(position < qf->metadata->nslots); - if (!is_occupied(qf, position)) { - uint64_t block_index = position; - uint64_t idx = bitselect(get_block(qf, block_index)->occupieds[0], 0); - if (idx == 64) { - while (idx == 64 && block_index < qf->metadata->nblocks) { - block_index++; - idx = bitselect(get_block(qf, block_index)->occupieds[0], 0); - } - } - position = block_index * QF_SLOTS_PER_BLOCK + idx; - } - - qfi->qf = qf; - qfi->num_clusters = 0; - qfi->run = position; - qfi->current = position == 0 ? 
0 : run_end(qfi->qf, position - 1) + 1; - if (qfi->current < position) qfi->current = position; - -#ifdef LOG_CLUSTER_LENGTH - qfi->c_info = (cluster_data *)calloc(qf->metadata->nslots / 32, sizeof(cluster_data)); - if (qfi->c_info == NULL) { - perror("Couldn't allocate memory for c_info."); - exit(EXIT_FAILURE); - } - qfi->cur_start_index = position; - qfi->cur_length = 1; -#endif - - if (qfi->current >= qf->metadata->nslots) return QFI_INVALID; - return qfi->current; -} - -int64_t qf_iterator_from_key_value(const QF *qf, QFi *qfi, uint64_t key, uint64_t value, uint8_t flags) { - if (key >= qf->metadata->range) { - qfi->current = 0xffffffffffffffff; - qfi->qf = qf; - return QFI_INVALID; - } - - qfi->qf = qf; - qfi->num_clusters = 0; - - if (GET_KEY_HASH(flags) != QF_KEY_IS_HASH) { - if (qf->metadata->hash_mode == QF_HASH_DEFAULT) - key = MurmurHash64A(((void *)&key), sizeof(key), qf->metadata->seed) % qf->metadata->range; - else if (qf->metadata->hash_mode == QF_HASH_INVERTIBLE) - key = hash_64(key, BITMASK(qf->metadata->key_bits)); - } - uint64_t hash = (key << qf->metadata->value_bits) | (value & BITMASK(qf->metadata->value_bits)); - - uint64_t hash_remainder = hash & BITMASK(qf->metadata->bits_per_slot); - uint64_t hash_bucket_index = hash >> qf->metadata->bits_per_slot; - bool flag = false; - - // If a run starts at "position" move the iterator to point it to the - // smallest key greater than or equal to "hash". - if (is_occupied(qf, hash_bucket_index)) { - uint64_t runstart_index = hash_bucket_index == 0 ? 
0 : run_end(qf, hash_bucket_index - 1) + 1; - if (runstart_index < hash_bucket_index) runstart_index = hash_bucket_index; - uint64_t current_remainder, current_count, current_end; - do { - current_end = decode_counter(qf, runstart_index, ¤t_remainder, ¤t_count); - if (current_remainder >= hash_remainder) { - flag = true; - break; - } - runstart_index = current_end + 1; - } while (!is_runend(qf, current_end)); - // found "hash" or smallest key greater than "hash" in this run. - if (flag) { - qfi->run = hash_bucket_index; - qfi->current = runstart_index; - } - } - // If a run doesn't start at "position" or the largest key in the run - // starting at "position" is smaller than "hash" then find the start of the - // next run. - if (!is_occupied(qf, hash_bucket_index) || !flag) { - uint64_t position = hash_bucket_index; - assert(position < qf->metadata->nslots); - uint64_t block_index = position / QF_SLOTS_PER_BLOCK; - uint64_t idx = bitselect(get_block(qf, block_index)->occupieds[0], 0); - if (idx == 64) { - while (idx == 64 && block_index < qf->metadata->nblocks) { - block_index++; - idx = bitselect(get_block(qf, block_index)->occupieds[0], 0); - } - } - position = block_index * QF_SLOTS_PER_BLOCK + idx; - qfi->run = position; - qfi->current = position == 0 ? 
0 : run_end(qfi->qf, position - 1) + 1; - if (qfi->current < position) qfi->current = position; - } - - if (qfi->current >= qf->metadata->nslots) return QFI_INVALID; - return qfi->current; -} - -static int qfi_get(const QFi *qfi, uint64_t *key, uint64_t *value, uint64_t *count) { - if (qfi_end(qfi)) return QFI_INVALID; - - uint64_t current_remainder, current_count; - decode_counter(qfi->qf, qfi->current, ¤t_remainder, ¤t_count); - - *value = current_remainder & BITMASK(qfi->qf->metadata->value_bits); - current_remainder = current_remainder >> qfi->qf->metadata->value_bits; - *key = (qfi->run << qfi->qf->metadata->key_remainder_bits) | current_remainder; - *count = current_count; - - return 0; -} - -int qfi_get_key(const QFi *qfi, uint64_t *key, uint64_t *value, uint64_t *count) { - *key = *value = *count = 0; - int ret = qfi_get(qfi, key, value, count); - if (ret == 0) { - if (qfi->qf->metadata->hash_mode == QF_HASH_DEFAULT) { - *key = 0; - *value = 0; - *count = 0; - return QF_INVALID; - } else if (qfi->qf->metadata->hash_mode == QF_HASH_INVERTIBLE) - *key = hash_64i(*key, BITMASK(qfi->qf->metadata->key_bits)); - } - - return ret; -} - -int qfi_get_hash(const QFi *qfi, uint64_t *key, uint64_t *value, uint64_t *count) { - *key = *value = *count = 0; - return qfi_get(qfi, key, value, count); -} - -int qfi_next(QFi *qfi) { - if (qfi_end(qfi)) - return QFI_INVALID; - else { - /* move to the end of the current counter*/ - uint64_t current_remainder, current_count; - qfi->current = decode_counter(qfi->qf, qfi->current, ¤t_remainder, ¤t_count); - - if (!is_runend(qfi->qf, qfi->current)) { - qfi->current++; -#ifdef LOG_CLUSTER_LENGTH - qfi->cur_length++; -#endif - if (qfi_end(qfi)) return QFI_INVALID; - return 0; - } else { -#ifdef LOG_CLUSTER_LENGTH - /* save to check if the new current is the new cluster. 
*/ - uint64_t old_current = qfi->current; -#endif - uint64_t block_index = qfi->run / QF_SLOTS_PER_BLOCK; - uint64_t rank = bitrank(get_block(qfi->qf, block_index)->occupieds[0], qfi->run % QF_SLOTS_PER_BLOCK); - uint64_t next_run = bitselect(get_block(qfi->qf, block_index)->occupieds[0], rank); - if (next_run == 64) { - rank = 0; - while (next_run == 64 && block_index < qfi->qf->metadata->nblocks) { - block_index++; - next_run = bitselect(get_block(qfi->qf, block_index)->occupieds[0], rank); - } - } - if (block_index == qfi->qf->metadata->nblocks) { - /* set the index values to max. */ - qfi->run = qfi->current = qfi->qf->metadata->xnslots; - return QFI_INVALID; - } - qfi->run = block_index * QF_SLOTS_PER_BLOCK + next_run; - qfi->current++; - if (qfi->current < qfi->run) qfi->current = qfi->run; -#ifdef LOG_CLUSTER_LENGTH - if (qfi->current > old_current + 1) { /* new cluster. */ - if (qfi->cur_length > 10) { - qfi->c_info[qfi->num_clusters].start_index = qfi->cur_start_index; - qfi->c_info[qfi->num_clusters].length = qfi->cur_length; - qfi->num_clusters++; - } - qfi->cur_start_index = qfi->run; - qfi->cur_length = 1; - } else { - qfi->cur_length++; - } -#endif - return 0; - } - } -} - -bool qfi_end(const QFi *qfi) { - if (qfi->current >= qfi->qf->metadata->xnslots /*&& is_runend(qfi->qf, qfi->current)*/) return true; - return false; -} - -/* - * Merge qfa and qfb into qfc - */ -/* - * iterate over both qf (qfa and qfb) - * simultaneously - * for each index i - * min(get_value(qfa, ia) < get_value(qfb, ib)) - * insert(min, ic) - * increment either ia or ib, whichever is minimum. 
- */ -void qf_merge(const QF *qfa, const QF *qfb, QF *qfc) { - QFi qfia, qfib; - qf_iterator_from_position(qfa, &qfia, 0); - qf_iterator_from_position(qfb, &qfib, 0); - - if (qfa->metadata->hash_mode != qfc->metadata->hash_mode && qfa->metadata->seed != qfc->metadata->seed && - qfb->metadata->hash_mode != qfc->metadata->hash_mode && qfb->metadata->seed != qfc->metadata->seed) { - fprintf(stderr, "Output QF and input QFs do not have the same hash mode or seed.\n"); - exit(1); - } - - uint64_t keya, valuea, counta, keyb, valueb, countb; - qfi_get_hash(&qfia, &keya, &valuea, &counta); - qfi_get_hash(&qfib, &keyb, &valueb, &countb); - do { - if (keya < keyb) { - qf_insert(qfc, keya, valuea, counta, QF_NO_LOCK | QF_KEY_IS_HASH); - qfi_next(&qfia); - qfi_get_hash(&qfia, &keya, &valuea, &counta); - } else { - qf_insert(qfc, keyb, valueb, countb, QF_NO_LOCK | QF_KEY_IS_HASH); - qfi_next(&qfib); - qfi_get_hash(&qfib, &keyb, &valueb, &countb); - } - } while (!qfi_end(&qfia) && !qfi_end(&qfib)); - - if (!qfi_end(&qfia)) { - do { - qfi_get_hash(&qfia, &keya, &valuea, &counta); - qf_insert(qfc, keya, valuea, counta, QF_NO_LOCK | QF_KEY_IS_HASH); - } while (!qfi_next(&qfia)); - } - if (!qfi_end(&qfib)) { - do { - qfi_get_hash(&qfib, &keyb, &valueb, &countb); - qf_insert(qfc, keyb, valueb, countb, QF_NO_LOCK | QF_KEY_IS_HASH); - } while (!qfi_next(&qfib)); - } -} - -/* - * Merge an array of qfs into the resultant QF - */ -void qf_multi_merge(const QF *qf_arr[], int nqf, QF *qfr) { - int i; - QFi qfi_arr[nqf]; - int smallest_idx = 0; - uint64_t smallest_key = UINT64_MAX; - for (i = 0; i < nqf; i++) { - if (qf_arr[i]->metadata->hash_mode != qfr->metadata->hash_mode && qf_arr[i]->metadata->seed != qfr->metadata->seed) { - fprintf(stderr, "Output QF and input QFs do not have the same hash mode or seed.\n"); - exit(1); - } - qf_iterator_from_position(qf_arr[i], &qfi_arr[i], 0); - } - - DEBUG_CQF("Merging %d CQFs\n", nqf); - for (i = 0; i < nqf; i++) { - DEBUG_CQF("CQF %d\n", i); - 
DEBUG_DUMP(qf_arr[i]); - } - - while (nqf > 1) { - uint64_t keys[nqf]; - uint64_t values[nqf]; - uint64_t counts[nqf]; - for (i = 0; i < nqf; i++) qfi_get_hash(&qfi_arr[i], &keys[i], &values[i], &counts[i]); - - do { - smallest_key = UINT64_MAX; - for (i = 0; i < nqf; i++) { - if (keys[i] < smallest_key) { - smallest_key = keys[i]; - smallest_idx = i; - } - } - qf_insert(qfr, keys[smallest_idx], values[smallest_idx], counts[smallest_idx], QF_NO_LOCK | QF_KEY_IS_HASH); - qfi_next(&qfi_arr[smallest_idx]); - qfi_get_hash(&qfi_arr[smallest_idx], &keys[smallest_idx], &values[smallest_idx], &counts[smallest_idx]); - } while (!qfi_end(&qfi_arr[smallest_idx])); - - /* remove the qf that is exhausted from the array */ - if (smallest_idx < nqf - 1) - memmove(&qfi_arr[smallest_idx], &qfi_arr[smallest_idx + 1], (nqf - smallest_idx - 1) * sizeof(qfi_arr[0])); - nqf--; - } - if (!qfi_end(&qfi_arr[0])) { - uint64_t iters = 0; - do { - uint64_t key, value, count; - qfi_get_hash(&qfi_arr[0], &key, &value, &count); - qf_insert(qfr, key, value, count, QF_NO_LOCK | QF_KEY_IS_HASH); - qfi_next(&qfi_arr[0]); - iters++; - } while (!qfi_end(&qfi_arr[0])); - DEBUG_CQF("Num of iterations: %lu\n", iters); - } - - DEBUG_CQF("%s", "Final CQF after merging.\n"); - DEBUG_DUMP(qfr); - - return; -} - -/* find cosine similarity between two QFs. */ -uint64_t qf_inner_product(const QF *qfa, const QF *qfb) { - uint64_t acc = 0; - QFi qfi; - const QF *qf_mem, *qf_disk; - - if (qfa->metadata->hash_mode != qfb->metadata->hash_mode && qfa->metadata->seed != qfb->metadata->seed) { - fprintf(stderr, "Input QFs do not have the same hash mode or seed.\n"); - exit(1); - } - - // create the iterator on the larger QF. 
- if (qfa->metadata->total_size_in_bytes > qfb->metadata->total_size_in_bytes) { - qf_mem = qfb; - qf_disk = qfa; - } else { - qf_mem = qfa; - qf_disk = qfb; - } - - qf_iterator_from_position(qf_disk, &qfi, 0); - do { - uint64_t key = 0, value = 0, count = 0; - uint64_t count_mem; - qfi_get_hash(&qfi, &key, &value, &count); - if ((count_mem = qf_count_key_value(qf_mem, key, 0, QF_KEY_IS_HASH)) > 0) { - acc += count * count_mem; - } - } while (!qfi_next(&qfi)); - - return acc; -} - -/* find cosine similarity between two QFs. */ -void qf_intersect(const QF *qfa, const QF *qfb, QF *qfr) { - QFi qfi; - const QF *qf_mem, *qf_disk; - - if (qfa->metadata->hash_mode != qfr->metadata->hash_mode && qfa->metadata->seed != qfr->metadata->seed && - qfb->metadata->hash_mode != qfr->metadata->hash_mode && qfb->metadata->seed != qfr->metadata->seed) { - fprintf(stderr, "Output QF and input QFs do not have the same hash mode or seed.\n"); - exit(1); - } - - // create the iterator on the larger QF. - if (qfa->metadata->total_size_in_bytes > qfb->metadata->total_size_in_bytes) { - qf_mem = qfb; - qf_disk = qfa; - } else { - qf_mem = qfa; - qf_disk = qfb; - } - - qf_iterator_from_position(qf_disk, &qfi, 0); - do { - uint64_t key = 0, value = 0, count = 0; - qfi_get_hash(&qfi, &key, &value, &count); - if (qf_count_key_value(qf_mem, key, 0, QF_KEY_IS_HASH) > 0) qf_insert(qfr, key, value, count, QF_NO_LOCK | QF_KEY_IS_HASH); - } while (!qfi_next(&qfi)); -} - -} // namespace quotient_filter diff --git a/src/kcount/kcount-gpu/gqf.hpp b/src/kcount/kcount-gpu/gqf.hpp index 6917385..e69de29 100644 --- a/src/kcount/kcount-gpu/gqf.hpp +++ b/src/kcount/kcount-gpu/gqf.hpp @@ -1,384 +0,0 @@ -#pragma once - -/* - * ============================================================================ - * - * Authors: Prashant Pandey - * Rob Johnson - * - * ============================================================================ - */ - -#include -#include -#include - -namespace quotient_filter { - 
-typedef struct quotient_filter quotient_filter; -typedef quotient_filter QF; - -/* CQFs support three hashing modes: - - - DEFAULT uses a hash that may introduce false positives, but -this can be useful when inserting large keys that need to be -hashed down to a small fingerprint. With this type of hash, -you can iterate over the hash values of all the keys in the -CQF, but you cannot iterate over the keys themselves. - - - INVERTIBLE has no false positives, but the size of the hash -output must be the same as the size of the hash input, -e.g. 17-bit keys hashed to 17-bit outputs. So this mode is -generally only useful when storing small keys in the CQF. With -this hashing mode, you can use iterators to enumerate both all -the hashes in the CQF, or all the keys. - - - NONE, for when you've done the hashing yourself. WARNING: the - CQF can exhibit very bad performance if you insert a skewed - distribution of intputs. -*/ - -enum qf_hashmode { QF_HASH_DEFAULT, QF_HASH_INVERTIBLE, QF_HASH_NONE }; - -enum qf_returns { QF_ITEM_INSERTED, QF_ITEM_FOUND, QF_FULL }; - -/* The CQF supports concurrent insertions and queries. Only the - portion of the CQF being examined or modified is locked, so it - supports high throughput even with many threads. - - The CQF operations support 3 locking modes: - - - NO_LOCK: for single-threaded applications or applications -that do their own concurrency management. - - - WAIT_FOR_LOCK: Spin until you get the lock, then do the query -or update. - - - TRY_ONCE_LOCK: If you can't grab the lock on the first try, -return with an error code. -*/ -#define QF_NO_LOCK (0x01) -#define QF_TRY_ONCE_LOCK (0x02) -#define QF_WAIT_FOR_LOCK (0x04) - -/* It is sometimes useful to insert a key that has already been - hashed. */ -#define QF_KEY_IS_HASH (0x08) - -/****************************************** - The CQF defines low-level constructor and destructor operations - that are designed to enable the application to manage the memory - used by the CQF. 
-*******************************************/ - -/* - * Create an empty CQF in "buffer". If there is not enough space at - * buffer then it will return the total size needed in bytes to - * initialize the CQF. This function takes ownership of buffer. - */ -uint64_t qf_init(QF *qf, uint64_t nslots, uint64_t key_bits, uint64_t value_bits, enum qf_hashmode hash, uint32_t seed, - void *buffer, uint64_t buffer_len); - -/* Create a CQF in "buffer". Note that this does not initialize the - contents of bufferss Use this function if you have read a CQF, e.g. - off of disk or network, and want to begin using that stream of - bytes as a CQF. The CQF takes ownership of buffer. */ -uint64_t qf_use(QF *qf, void *buffer, uint64_t buffer_len); - -/* Destroy this CQF. Returns a pointer to the memory that the CQF was - using (i.e. passed into qf_init or qf_use) so that the application - can release that memory. */ -void *qf_destroy(QF *qf); - -/* Allocate a new CQF using "nslots" at "buffer" and copy elements from "qf" - * into it. - * If there is not enough space at buffer then it will return the total size - * needed in bytes to initialize the new CQF. - * */ -uint64_t qf_resize(QF *qf, uint64_t nslots, void *buffer, uint64_t buffer_len); - -/*********************************** -The following convenience functions create and destroy CQFs by - using malloc/free to obtain and release the memory for the CQF. -************************************/ - -/* Initialize the CQF and allocate memory for the CQF. */ -__host__ bool qf_malloc(QF *qf, uint64_t nslots, uint64_t key_bits, uint64_t value_bits, enum qf_hashmode hash, bool on_device, - uint32_t seed); - -__host__ bool qf_free(QF *qf); - -__host__ void qf_free_gpu(QF *qf); - -/* Resize the QF to the specified number of slots. Uses malloc() to - * obtain the new memory, and calls free() on the old memory. - * Return value: - * >= 0: number of keys copied during resizing. 
- * */ -int64_t qf_resize_malloc(QF *qf, uint64_t nslots); - -/* Turn on automatic resizing. Resizing is performed by calling - qf_resize_malloc, so the CQF must meet the requirements of that - function. */ -__host__ void qf_set_auto_resize(QF *qf, bool enabled); - -/*********************************** -Functions for modifying the CQF. -***********************************/ - -#define QF_NO_SPACE (-1) -#define QF_COULDNT_LOCK (-2) -#define QF_DOESNT_EXIST (-3) - -/* Increment the counter for this key/value pair by count. - * Return value: - * >= 0: distance from the home slot to the slot in which the key is - * inserted (or 0 if count == 0). - * == QF_NO_SPACE: the CQF has reached capacity. - * == QF_COULDNT_LOCK: TRY_ONCE_LOCK has failed to acquire the lock. - */ -__host__ __device__ int qf_insert(QF *qf, uint64_t key, uint64_t value, uint64_t count, uint8_t flags); - -__host__ void bulk_insert_bucketing_premalloc(QF *qf, uint64_t *keys, uint64_t value, uint64_t count, uint64_t nvals, - uint64_t slots_per_lock, uint64_t num_locks, uint8_t flags); - -__host__ void bulk_insert_bucketing(QF *qf, uint64_t *keys, uint64_t value, uint64_t count, uint64_t nvals, uint64_t slots_per_lock, - uint64_t num_locks, uint8_t flags); - -// functions for controlling buffers -__host__ void bulk_insert_bucketing_buffer_provided(QF *qf, uint64_t *keys, uint64_t value, uint64_t count, uint64_t nvals, - uint64_t slots_per_lock, uint64_t num_locks, uint8_t flags, uint64_t **buffers, - uint64_t *buffer_backing, volatile uint64_t *buffer_sizes); - -__host__ void bulk_insert_bucketing_buffer_provided_timed(QF *qf, uint64_t *keys, uint64_t value, uint64_t count, uint64_t nvals, - uint64_t slots_per_lock, uint64_t num_locks, uint8_t flags, - uint64_t **buffers, uint64_t *buffer_backing, - volatile uint64_t *buffer_sizes); - -__host__ void bulk_insert_one_hash(QF *qf, uint64_t *keys, uint64_t value, uint64_t count, uint64_t nvals, uint64_t slots_per_lock, - uint64_t num_locks, uint8_t flags, 
uint64_t **buffers, uint64_t *buffer_backing, - volatile uint64_t *buffer_sizes); - -__host__ void bulk_insert_no_atomics(QF *qf, uint64_t *keys, uint64_t value, uint64_t count, uint64_t nvals, - uint64_t slots_per_lock, uint64_t num_locks, uint8_t flags, uint64_t **buffers, - volatile uint64_t *buffer_sizes); - -__host__ void free_buffers_premalloced(QF *qf, uint64_t **buffers, uint64_t *buffer_backing, volatile uint64_t *buffer_sizes, - uint64_t num_buffers); - -__host__ uint64_t bulk_get_wrapper(QF *qf, uint64_t *vals, uint64_t nvals); -/* Set the counter for this key/value pair to count. - Return value: Same as qf_insert. - Returns 0 if new count is equal to old count. -*/ -__host__ __device__ int qf_set_count(QF *qf, uint64_t key, uint64_t value, uint64_t count, uint8_t flags); - -/* Remove up to count instances of this key/value combination. - * If the CQF contains <= count instances, then they will all be - * removed, which is not an error. - * Return value: - * >= 0: number of slots freed. - * == QF_DOESNT_EXIST: Specified item did not exist. - * == QF_COULDNT_LOCK: TRY_ONCE_LOCK has failed to acquire the lock. - */ -__host__ __device__ int qf_remove(QF *qf, uint64_t key, uint64_t value, uint64_t count, uint8_t flags); - -/* Remove all instances of this key/value pair. */ -__host__ __device__ int qf_delete_key_value(QF *qf, uint64_t key, uint64_t value, uint8_t flags); - -/* Remove all instances of this key. */ -/* NOT IMPLEMENTED YET. */ -// void qf_delete_key(QF *qf, uint64_t key); - -/* Replace the association (key, oldvalue, count) with the association - (key, newvalue, count). If there is already an association (key, - newvalue, count'), then the two associations will be merged and - their counters will be summed, resulting in association (key, - newvalue, count' + count). */ -/* NOT IMPLEMENTED YET. 
*/ -// void qf_replace(QF *qf, uint64_t key, uint64_t oldvalue, uint64_t newvalue); - -/******************************************* -GPU functions -******************************/ -/* batch inserts using GPU*/ -__host__ void qf_gpu_launch(QF *qf, uint64_t *vals, uint64_t nvals, uint64_t key_count, uint64_t nhashbits, uint64_t nslots); - -__global__ void insert_one_kmer_kernel(QF *qf, uint64_t hash, uint8_t val, uint16_t *locks); - -/**************************************** -Query functions -****************************************/ - -/* Lookup the value associated with key. Returns the count of that - key/value pair in the QF. If it returns 0, then, the key is not - present in the QF. Only returns the first value associated with key - in the QF. If you want to see others, use an iterator. - May return QF_COULDNT_LOCK if called with QF_TRY_LOCK. */ -__host__ __device__ uint64_t qf_query(const QF *qf, uint64_t key, uint64_t *value, uint8_t flags); - -/* Return the number of times key has been inserted, with any value, - into qf. */ -/* NOT IMPLEMENTED YET. */ -// uint64_t qf_count_key(const QF *qf, uint64_t key); - -/* Return the number of times key has been inserted, with the given - value, into qf. - May return QF_COULDNT_LOCK if called with QF_TRY_LOCK. */ -__host__ __device__ uint64_t qf_count_key_value(const QF *qf, uint64_t key, uint64_t value, uint8_t flags); - -/* Returns a unique index corresponding to the key in the CQF. Note - that this can change if further modifications are made to the - CQF. - - If the key is not found then returns QF_DOESNT_EXIST. - May return QF_COULDNT_LOCK if called with QF_TRY_LOCK. - */ -__host__ __device__ int64_t qf_get_unique_index(const QF *qf, uint64_t key, uint64_t value, uint8_t flags); - -/**************************************** -Metadata accessors. 
-****************************************/ - -/* Hashing info */ -enum qf_hashmode qf_get_hashmode(const QF *qf); -uint64_t qf_get_hash_seed(const QF *qf); -__uint64_t qf_get_hash_range(const QF *qf); - -/* Space usage info. */ -bool qf_is_auto_resize_enabled(const QF *qf); -uint64_t qf_get_total_size_in_bytes(const QF *qf); -uint64_t qf_get_nslots(const QF *qf); -uint64_t qf_get_num_occupied_slots(const QF *qf); - -__host__ uint64_t host_qf_get_nslots(const QF *qf); -__host__ uint64_t host_qf_get_num_occupied_slots(const QF *qf); - -/* Bit-sizes info. */ -uint64_t qf_get_num_key_bits(const QF *qf); -uint64_t qf_get_num_value_bits(const QF *qf); -uint64_t qf_get_num_key_remainder_bits(const QF *qf); -uint64_t qf_get_bits_per_slot(const QF *qf); - -/* Number of (distinct) key-value pairs. */ -uint64_t qf_get_sum_of_counts(const QF *qf); -uint64_t qf_get_num_distinct_key_value_pairs(const QF *qf); - -void qf_sync_counters(const QF *qf); - -/**************************************** - Iterators -*****************************************/ - -typedef struct quotient_filter_iterator quotient_filter_iterator; -typedef quotient_filter_iterator QFi; - -#define QF_INVALID (-4) -#define QFI_INVALID (-5) - -/* Initialize an iterator starting at the given position. - * Return value: - * >= 0: iterator is initialized and positioned at the returned slot. - * = QFI_INVALID: iterator has reached end. - */ -int64_t qf_iterator_from_position(const QF *qf, QFi *qfi, uint64_t position); - -/* Initialize an iterator and position it at the smallest index - * containing a key-value pair whose hash is greater than or equal - * to the specified key-value pair. - * Return value: - * >= 0: iterator is initialized and position at the returned slot. - * = QFI_INVALID: iterator has reached end. - */ -int64_t qf_iterator_from_key_value(const QF *qf, QFi *qfi, uint64_t key, uint64_t value, uint8_t flags); - -/* Requires that the hash mode of the CQF is INVERTIBLE or NONE. 
- * If the hash mode is DEFAULT then returns QF_INVALID. - * Return value: - * = 0: Iterator is still valid. - * = QFI_INVALID: iterator has reached end. - * = QF_INVALID: hash mode is QF_DEFAULT_HASH - */ -int qfi_get_key(const QFi *qfi, uint64_t *key, uint64_t *value, uint64_t *count); - -/* Return value: - * = 0: Iterator is still valid. - * = QFI_INVALID: iterator has reached end. - */ -int qfi_get_hash(const QFi *qfi, uint64_t *hash, uint64_t *value, uint64_t *count); - -/* Advance to next entry. - * Return value: - * = 0: Iterator is still valid. - * = QFI_INVALID: iterator has reached end. - */ -int qfi_next(QFi *qfi); - -/* Check to see if the if the end of the QF */ -bool qfi_end(const QFi *qfi); - -/************************************ -Miscellaneous convenience functions. -*************************************/ - -/* Reset the CQF to an empty filter. */ -void qf_reset(QF *qf); - -/* The caller should call qf_init on the dest QF using the same - * parameters as the src QF before calling this function. Note: src - * and dest must be exactly the same, including number of slots. */ -void qf_copy(QF *dest, const QF *src); - -/* merge two QFs into the third one. Note: merges with any existing - values in qfc. */ -void qf_merge(const QF *qfa, const QF *qfb, QF *qfc); - -/* merge multiple QFs into the final QF one. */ -void qf_multi_merge(const QF *qf_arr[], int nqf, QF *qfr); - -/* find cosine similarity between two QFs. */ -uint64_t qf_inner_product(const QF *qfa, const QF *qfb); - -/* square of the L_2 norm of a QF (i.e. sum of squares of counts of - all items in the CQF). */ -uint64_t qf_magnitude(const QF *qf); - -/*********************************** - Debugging functions. 
-************************************/ - -__host__ __device__ void qf_dump(const QF *); -__host__ __device__ void qf_dump_metadata(const QF *qf); - -// TEMPORARILY EXPOSED FOR DEBUGGING - -// FUNCTIONS FOR MHM2 - -// construct a filter, takes in the address of a pointer -__host__ void qf_malloc_device(QF **qf, int nbits); - -// device_funcs for interacting with the filter -__device__ qf_returns insert_kmer(QF *qf, uint64_t hash, char forward, char backward, char &returnedfwd, char &returnedback); - -__device__ qf_returns insert_kmer_not_exists(QF *qf, uint64_t hash, char forward, char backward, char &returnedfwd, - char &returnedback); - -// destroys a filter -__host__ void qf_destroy_device(QF *qf); - -__host__ __device__ uint8_t encode_kmer_counter(uint8_t *counter); -__host__ __device__ void decode_kmer_counter(uint8_t *counter, uint8_t stored); - -__host__ __device__ bool is_encodable(uint8_t *counter); -//__global__ void insert_multi_kmer_kernel_first(QF* qf, uint16_t * locks, uint64_t * hashes, uint8_t * firsts, uint8_t * seconds, -// uint64_t nitems); -__global__ void insert_multi_kmer_kernel(QF *qf, uint64_t *hashes, uint8_t *firsts, uint8_t *seconds, uint64_t nitems, - uint64_t *counter); - -__host__ uint64_t qf_estimate_memory(int nbits); - -// get mem usage from dev side QF - -//__host__ uint64_t get_current_usage(QF* qf); - -} // namespace quotient_filter diff --git a/src/kcount/kcount-gpu/gqf_int.hpp b/src/kcount/kcount-gpu/gqf_int.hpp index a83962b..e69de29 100644 --- a/src/kcount/kcount-gpu/gqf_int.hpp +++ b/src/kcount/kcount-gpu/gqf_int.hpp @@ -1,138 +0,0 @@ -#pragma once - -/* - * ============================================================================ - * - * Authors: Prashant Pandey - * Rob Johnson - * - * ============================================================================ - */ - -#include -#include - -#include "gqf.hpp" -#include "partitioned_counter.hpp" - -namespace quotient_filter { - -#define MAGIC_NUMBER 1018874902021329732 - 
-/* Can be - 0 (choose size at run-time), - 8, 16, 32, or 64 (for optimized versions), - or other integer <= 56 (for compile-time-optimized bit-shifting-based versions) -*/ - -// move #define to gqf.cu -#define QF_BITS_PER_REMAINDER 10 -#define QF_BITS_PER_VALUE 6 - -#define QF_BITS_PER_SLOT 16 - -/* Must be >= 6. 6 seems fastest. */ -#define QF_BLOCK_OFFSET_BITS (6) - -#define QF_SLOTS_PER_BLOCK (1ULL << QF_BLOCK_OFFSET_BITS) -#define QF_METADATA_WORDS_PER_BLOCK ((QF_SLOTS_PER_BLOCK + 63) / 64) - -typedef struct { - /* Code works with uint16_t, uint32_t, etc, but uint8_t seems just as fast as - * anything else */ - uint8_t offset; - uint64_t occupieds[QF_METADATA_WORDS_PER_BLOCK]; - uint64_t runends[QF_METADATA_WORDS_PER_BLOCK]; -#if QF_BITS_PER_SLOT == 8 - uint8_t slots[QF_SLOTS_PER_BLOCK]; -#elif QF_BITS_PER_SLOT == 16 - uint16_t slots[QF_SLOTS_PER_BLOCK]; -#elif QF_BITS_PER_SLOT == 32 - uint32_t slots[QF_SLOTS_PER_BLOCK]; -#elif QF_BITS_PER_SLOT == 64 - uint64_t slots[QF_SLOTS_PER_BLOCK]; -#elif QF_BITS_PER_SLOT != 0 - uint8_t slots[QF_SLOTS_PER_BLOCK * QF_BITS_PER_SLOT / 8]; -#else - uint8_t slots[]; -#endif -} __attribute__((__packed__)) qfblock; - -// struct __attribute__ ((__packed__)) qfblock; -// typedef struct qfblock qfblock; - -typedef struct file_info { - int fd; - char *filepath; -} file_info; - -// The below struct is used to instrument the code. -// It is not used in normal operations of the CQF. 
-typedef struct { - uint64_t total_time_single; - uint64_t total_time_spinning; - uint64_t locks_taken; - uint64_t locks_acquired_single_attempt; -} wait_time_data; - -typedef struct quotient_filter_runtime_data { - file_info f_info; - uint32_t auto_resize; - int64_t (*container_resize)(QF *qf, uint64_t nslots); - pc_t pc_nelts; - pc_t pc_ndistinct_elts; - pc_t pc_noccupied_slots; - uint64_t num_locks; - volatile int metadata_lock; - uint16_t *locks; - wait_time_data *wait_times; -} quotient_filter_runtime_data; - -typedef quotient_filter_runtime_data qfruntime; - -typedef struct quotient_filter_metadata { - uint64_t magic_endian_number; - enum qf_hashmode hash_mode; - uint32_t reserved; - uint64_t total_size_in_bytes; - uint32_t seed; - uint64_t nslots; - uint64_t xnslots; - uint64_t key_bits; - uint64_t value_bits; - uint64_t key_remainder_bits; - uint64_t bits_per_slot; - uint64_t range; - uint64_t nblocks; - uint64_t nelts; - uint64_t ndistinct_elts; - uint64_t noccupied_slots; -} quotient_filter_metadata; - -typedef quotient_filter_metadata qfmetadata; - -typedef struct quotient_filter { - qfruntime *runtimedata; - qfmetadata *metadata; - qfblock *blocks; -} quotient_filter; - -typedef quotient_filter QF; - -// The below struct is used to instrument the code. -// It is not used in normal operations of the CQF. 
-typedef struct cluster_data { - uint64_t start_index; - uint16_t length; -} cluster_data; -typedef struct quotient_filter_iterator { - const QF *qf; - uint64_t run; - uint64_t current; - uint64_t cur_start_index; - uint16_t cur_length; - uint32_t num_clusters; - cluster_data *c_info; -} quotient_filter_iterator; - -} // namespace quotient_filter diff --git a/src/kcount/kcount-gpu/hashutil.cpp b/src/kcount/kcount-gpu/hashutil.cpp index 962bb58..e69de29 100644 --- a/src/kcount/kcount-gpu/hashutil.cpp +++ b/src/kcount/kcount-gpu/hashutil.cpp @@ -1,179 +0,0 @@ -/* - * ============================================================================ - * - * Authors: Prashant Pandey - * Rob Johnson - * - * ============================================================================ - */ - -#include "hashutil.hpp" - -//----------------------------------------------------------------------------- -// MurmurHash2, 64-bit versions, by Austin Appleby - -// The same caveats as 32-bit MurmurHash2 apply here - beware of alignment -// and endian-ness issues if used across multiple platforms. 
- -// 64-bit hash for 64-bit platforms - -__host__ __device__ uint64_t MurmurHash64A(const void *key, int len, unsigned int seed) { - const uint64_t m = 0xc6a4a7935bd1e995; - const int r = 47; - - uint64_t h = seed ^ (len * m); - - const uint64_t *data = (const uint64_t *)key; - const uint64_t *end = data + (len / 8); - - while (data != end) { - uint64_t k = *data++; - - k *= m; - k ^= k >> r; - k *= m; - - h ^= k; - h *= m; - } - - const unsigned char *data2 = (const unsigned char *)data; - - switch (len & 7) { - case 7: h ^= (uint64_t)data2[6] << 48; - case 6: h ^= (uint64_t)data2[5] << 40; - case 5: h ^= (uint64_t)data2[4] << 32; - case 4: h ^= (uint64_t)data2[3] << 24; - case 3: h ^= (uint64_t)data2[2] << 16; - case 2: h ^= (uint64_t)data2[1] << 8; - case 1: h ^= (uint64_t)data2[0]; h *= m; - }; - - h ^= h >> r; - h *= m; - h ^= h >> r; - - return h; -} - -// 64-bit hash for 32-bit platforms - -__host__ __device__ uint64_t MurmurHash64B(const void *key, int len, unsigned int seed) { - const unsigned int m = 0x5bd1e995; - const int r = 24; - - unsigned int h1 = seed ^ len; - unsigned int h2 = 0; - - const unsigned int *data = (const unsigned int *)key; - - while (len >= 8) { - unsigned int k1 = *data++; - k1 *= m; - k1 ^= k1 >> r; - k1 *= m; - h1 *= m; - h1 ^= k1; - len -= 4; - - unsigned int k2 = *data++; - k2 *= m; - k2 ^= k2 >> r; - k2 *= m; - h2 *= m; - h2 ^= k2; - len -= 4; - } - - if (len >= 4) { - unsigned int k1 = *data++; - k1 *= m; - k1 ^= k1 >> r; - k1 *= m; - h1 *= m; - h1 ^= k1; - len -= 4; - } - - switch (len) { - case 3: h2 ^= ((unsigned char *)data)[2] << 16; - case 2: h2 ^= ((unsigned char *)data)[1] << 8; - case 1: h2 ^= ((unsigned char *)data)[0]; h2 *= m; - }; - - h1 ^= h2 >> 18; - h1 *= m; - h2 ^= h1 >> 22; - h2 *= m; - h1 ^= h2 >> 17; - h1 *= m; - h2 ^= h1 >> 19; - h2 *= m; - - uint64_t h = h1; - - h = (h << 32) | h2; - - return h; -} - -/* - * For any 1 for a snapshot. 
- -__host__ __device__ uint64_t hash_64(uint64_t key, uint64_t mask) { - key = (~key + (key << 21)) & mask; // key = (key << 21) - key - 1; - key = key ^ key >> 24; - key = ((key + (key << 3)) + (key << 8)) & mask; // key * 265 - key = key ^ key >> 14; - key = ((key + (key << 2)) + (key << 4)) & mask; // key * 21 - key = key ^ key >> 28; - key = (key + (key << 31)) & mask; - return key; -} - -// The inversion of hash_64(). Modified from -// -__host__ __device__ uint64_t hash_64i(uint64_t key, uint64_t mask) { - uint64_t tmp; - - // Invert key = key + (key << 31) - tmp = (key - (key << 31)); - key = (key - (tmp << 31)) & mask; - - // Invert key = key ^ (key >> 28) - tmp = key ^ key >> 28; - key = key ^ tmp >> 28; - - // Invert key *= 21 - key = (key * 14933078535860113213ull) & mask; - - // Invert key = key ^ (key >> 14) - tmp = key ^ key >> 14; - tmp = key ^ tmp >> 14; - tmp = key ^ tmp >> 14; - key = key ^ tmp >> 14; - - // Invert key *= 265 - key = (key * 15244667743933553977ull) & mask; - - // Invert key = key ^ (key >> 24) - tmp = key ^ key >> 24; - key = key ^ tmp >> 24; - - // Invert key = (~key) + (key << 21) - tmp = ~key; - tmp = ~(key - (tmp << 21)); - tmp = ~(key - (tmp << 21)); - key = ~(key - (tmp << 21)) & mask; - - return key; -} diff --git a/src/kcount/kcount-gpu/hashutil.hpp b/src/kcount/kcount-gpu/hashutil.hpp index b09fb65..e69de29 100644 --- a/src/kcount/kcount-gpu/hashutil.hpp +++ b/src/kcount/kcount-gpu/hashutil.hpp @@ -1,25 +0,0 @@ -#pragma once - -/* - * ============================================================================ - * - * Authors: Prashant Pandey - * Rob Johnson - * - * ============================================================================ - */ - -#ifndef _HASHUTIL_CUH_ -#define _HASHUTIL_CUH_ - -#include -#include -#include - -__host__ __device__ uint64_t MurmurHash64B(const void* key, int len, unsigned int seed); -__host__ __device__ uint64_t MurmurHash64A(const void* key, int len, unsigned int seed); - -__host__ 
__device__ uint64_t hash_64(uint64_t key, uint64_t mask); -__host__ __device__ uint64_t hash_64i(uint64_t key, uint64_t mask); - -#endif // #ifndef _HASHUTIL_H_ diff --git a/src/kcount/kcount-gpu/parse_and_pack.cpp b/src/kcount/kcount-gpu/parse_and_pack.cpp index a3ecf17..e69de29 100644 --- a/src/kcount/kcount-gpu/parse_and_pack.cpp +++ b/src/kcount/kcount-gpu/parse_and_pack.cpp @@ -1,325 +0,0 @@ -/* - HipMer v 2.0, Copyright (c) 2020, The Regents of the University of California, - through Lawrence Berkeley National Laboratory (subject to receipt of any required - approvals from the U.S. Dept. of Energy). All rights reserved." - - Redistribution and use in source and binary forms, with or without modification, - are permitted provided that the following conditions are met: - - (1) Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - (2) Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation and/or - other materials provided with the distribution. - - (3) Neither the name of the University of California, Lawrence Berkeley National - Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to - endorse or promote products derived from this software without specific prior - written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY - EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT - SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH - DAMAGE. - - You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades - to the features, functionality or performance of the source code ("Enhancements") to - anyone; however, if you choose to make your Enhancements available either publicly, - or directly to Lawrence Berkeley National Laboratory, without imposing a separate - written license agreement for such Enhancements, then you hereby grant the following - license: a non-exclusive, royalty-free perpetual license to install, use, modify, - prepare derivative works, incorporate into other computer software, distribute, and - sublicense such enhancements or derivative works thereof, in binary and source code - form. 
-*/ - -#include -#include -#include -#include -#include -#include -#include - -#include "upcxx_utils/colors.h" -#include "gpu-utils/gpu_common.hpp" -#include "gpu-utils/gpu_utils.hpp" -#include "parse_and_pack.hpp" - -using namespace std; -using namespace gpu_common; - -__constant__ uint64_t GPU_0_MASK[32] = { - 0x0000000000000000, 0xC000000000000000, 0xF000000000000000, 0xFC00000000000000, 0xFF00000000000000, 0xFFC0000000000000, - 0xFFF0000000000000, 0xFFFC000000000000, 0xFFFF000000000000, 0xFFFFC00000000000, 0xFFFFF00000000000, 0xFFFFFC0000000000, - 0xFFFFFF0000000000, 0xFFFFFFC000000000, 0xFFFFFFF000000000, 0xFFFFFFFC00000000, 0xFFFFFFFF00000000, 0xFFFFFFFFC0000000, - 0xFFFFFFFFF0000000, 0xFFFFFFFFFC000000, 0xFFFFFFFFFF000000, 0xFFFFFFFFFFC00000, 0xFFFFFFFFFFF00000, 0xFFFFFFFFFFFC0000, - 0xFFFFFFFFFFFF0000, 0xFFFFFFFFFFFFC000, 0xFFFFFFFFFFFFF000, 0xFFFFFFFFFFFFFC00, 0xFFFFFFFFFFFFFF00, 0xFFFFFFFFFFFFFFC0, - 0xFFFFFFFFFFFFFFF0, 0xFFFFFFFFFFFFFFFC}; - -struct kcount_gpu::ParseAndPackDriverState { - cudaEvent_t event; - int rank_me; -}; - -inline __device__ uint64_t quick_hash(uint64_t v) { - v = v * 3935559000370003845 + 2691343689449507681; - v ^= v >> 21; - v ^= v << 37; - v ^= v >> 4; - v *= 4768777513237032717; - v ^= v << 20; - v ^= v >> 41; - v ^= v << 5; - return v; -} - -__device__ uint64_t gpu_minimizer_hash_fast(int m, int kmer_len, int num_longs, uint64_t *longs, uint64_t *rc_longs) { - const int chunk_step = 32 - ((m + 3) / 4) * 4; // chunk_step is a multiple of 4 - - int base; - int num_candidates = kmer_len - m + 1; - const int max_candidates = MAX_BUILD_KMER; - uint64_t rc_candidates[max_candidates]; - - // calculate and temporarily store all revcomp minimizer candidates on the stack - for (base = 0; base <= kmer_len - m; base += chunk_step) { - int shift = base % 32; - int l = base / 32; - uint64_t tmp = rc_longs[l]; - if (shift) { - tmp = (tmp << (shift * 2)); - if (l < num_longs - 1) tmp |= rc_longs[l + 1] >> (64 - shift * 2); - } - for (int j = 
0; j < chunk_step; j++) { - if (base + j + m > kmer_len) break; - rc_candidates[base + j] = ((tmp << (j * 2)) & GPU_0_MASK[m]); - } - } - - uint64_t minimizer = 0; - // calculate and compare minimizers from revcomp - for (base = 0; base <= kmer_len - m; base += chunk_step) { - int shift = base % 32; - int l = base / 32; - uint64_t tmp = longs[l]; - if (shift) { - tmp = (tmp << (shift * 2)); - if (l < num_longs - 1) tmp |= longs[l + 1] >> (64 - shift * 2); - } - for (int j = 0; j < chunk_step; j++) { - if (base + j + m > kmer_len) break; - uint64_t fwd_candidate = ((tmp << (j * 2)) & GPU_0_MASK[m]); - auto &rc_candidate = rc_candidates[num_candidates - base - j - 1]; - uint64_t &least_candidate = (fwd_candidate < rc_candidate) ? fwd_candidate : rc_candidate; - if (least_candidate > minimizer) minimizer = least_candidate; - } - } - return quick_hash(minimizer); -} - -__global__ void parse_and_pack(char *seqs, int minimizer_len, int kmer_len, int num_longs, int seqs_len, int *kmer_targets, - int num_ranks) { - int num_kmers = seqs_len - kmer_len + 1; - const int MAX_LONGS = (MAX_BUILD_KMER + 31) / 32; - uint64_t kmer[MAX_LONGS]; - unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; - if (threadid < num_kmers) { - if (pack_seq_to_kmer(&(seqs[threadid]), kmer_len, num_longs, kmer)) { - uint64_t kmer_rc[MAX_LONGS]; - revcomp(kmer, kmer_rc, kmer_len, num_longs); - kmer_targets[threadid] = gpu_minimizer_hash_fast(minimizer_len, kmer_len, num_longs, kmer, kmer_rc) % num_ranks; - } else { - // indicate invalid with -1 - kmer_targets[threadid] = -1; - } - } -} - -inline __device__ bool is_valid_base(char base) { return (base != '_' && base != 'N'); } - -__global__ void build_supermers(char *seqs, int *kmer_targets, int num_kmers, int kmer_len, int seqs_len, - kcount_gpu::SupermerInfo *supermers, unsigned int *num_supermers, unsigned int *num_valid_kmers, - int rank_me) { - // builds a single supermer starting at a given kmer, but only if the kmer is a valid start 
to a supermer - int my_valid_kmers = 0; - unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; - if (threadid == 0 && kmer_targets[threadid] != -1) my_valid_kmers++; - if (threadid > 0 && threadid < num_kmers) { - int target = kmer_targets[threadid]; - if (target != -1) { - my_valid_kmers++; - bool prev_target_ok = false; - if (threadid == 1) { - prev_target_ok = true; - } else { - if (kmer_targets[threadid - 1] != target) { - // prev kmer was a different or invalid target - prev_target_ok = true; - } else { - // prev kmer was the same target, but was not a valid start to a supermer - if (!is_valid_base(seqs[threadid - 2]) || !is_valid_base(seqs[threadid - 1 + kmer_len])) prev_target_ok = true; - } - } - // make sure this is the first kmer for this target - if (prev_target_ok && is_valid_base(seqs[threadid - 1]) && is_valid_base(seqs[threadid + kmer_len])) { - int supermer_start_i = threadid - 1; - int supermer_len = kmer_len + 2; - // build the supermer - for (int i = threadid + 1; i < num_kmers - 1; i++) { - auto next_target = kmer_targets[i]; - int end_pos = supermer_start_i + supermer_len; // i + kmer_len; - if (next_target == target && end_pos < seqs_len && is_valid_base(seqs[end_pos])) - supermer_len++; - else - break; - } - // get a slot for the supermer - int slot = atomicAdd(num_supermers, 1); - supermers[slot].target = target; - supermers[slot].offset = supermer_start_i; - supermers[slot].len = supermer_len; - } - } - } - reduce(my_valid_kmers, num_kmers, num_valid_kmers); -} - -inline __device__ uint8_t get_packed_val(char base) { - switch (base) { - case 'a': return 1; - case 'c': return 2; - case 'g': return 3; - case 't': return 4; - case 'A': return 5; - case 'C': return 6; - case 'G': return 7; - case 'T': return 8; - case 'N': - case 'n': return 9; - case '_': - case 0: return 0; - default: printf("Invalid value encountered when packing: %d\n", (int)base); - }; - return 0; -} - -__global__ void pack_seqs(char *dev_seqs, char 
*dev_packed_seqs, int seqs_len) { - unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; - int packed_seqs_len = seqs_len / 2 + seqs_len % 2; - if (threadid < packed_seqs_len) { - int seqs_i = threadid * 2; - char packed = (get_packed_val(dev_seqs[seqs_i]) << 4); - packed |= get_packed_val(dev_seqs[seqs_i + 1]); - dev_packed_seqs[threadid] = packed; - } -} - -inline int halve_up(int x) { return x / 2 + x % 2; } - -kcount_gpu::ParseAndPackGPUDriver::ParseAndPackGPUDriver(int upcxx_rank_me, int upcxx_rank_n, int qual_offset, int kmer_len, - int num_kmer_longs, int minimizer_len, double &init_time) - : upcxx_rank_me(upcxx_rank_me) - , upcxx_rank_n(upcxx_rank_n) - , kmer_len(kmer_len) - , qual_offset(qual_offset) - , num_kmer_longs(num_kmer_longs) - , minimizer_len(minimizer_len) - , t_func(0) - , t_kernel(0) { - QuickTimer init_timer; - init_timer.start(); - gpu_utils::set_gpu_device(upcxx_rank_me); - max_kmers = KCOUNT_SEQ_BLOCK_SIZE - kmer_len + 1; - - cudaErrchk(cudaMalloc((void **)&dev_seqs, KCOUNT_SEQ_BLOCK_SIZE)); - cudaErrchk(cudaMalloc((void **)&dev_kmer_targets, max_kmers * sizeof(int))); - - cudaErrchk(cudaMalloc((void **)&dev_supermers, max_kmers * sizeof(SupermerInfo))); - cudaErrchk(cudaMalloc((void **)&dev_packed_seqs, halve_up(KCOUNT_SEQ_BLOCK_SIZE))); - cudaErrchk(cudaMalloc((void **)&dev_num_supermers, sizeof(int))); - cudaErrchk(cudaMalloc((void **)&dev_num_valid_kmers, sizeof(int))); - - // total storage required is approx KCOUNT_SEQ_BLOCK_SIZE * (1 + num_kmers_longs * sizeof(uint64_t) + sizeof(int) + 1) - dstate = new ParseAndPackDriverState(); - dstate->rank_me = upcxx_rank_me; - init_timer.stop(); - init_time = init_timer.get_elapsed(); -} - -kcount_gpu::ParseAndPackGPUDriver::~ParseAndPackGPUDriver() { - cudaFree(dev_seqs); - cudaFree(dev_kmer_targets); - - cudaFree(dev_supermers); - cudaFree(dev_packed_seqs); - cudaFree(dev_num_supermers); - cudaFree(dev_num_valid_kmers); - - delete dstate; -} - -bool 
kcount_gpu::ParseAndPackGPUDriver::process_seq_block(const string &seqs, unsigned int &num_valid_kmers) { - QuickTimer func_timer, kernel_timer; - - if (seqs.length() >= KCOUNT_SEQ_BLOCK_SIZE) return false; - if (seqs.length() == 0) return false; - if (seqs.length() < (unsigned int)kmer_len) return false; - - func_timer.start(); - gpu_utils::set_gpu_device(dstate->rank_me); - cudaErrchk(cudaEventCreateWithFlags(&dstate->event, cudaEventDisableTiming | cudaEventBlockingSync)); - - int num_kmers = seqs.length() - kmer_len + 1; - cudaErrchk(cudaMemcpy(dev_seqs, &seqs[0], seqs.length(), cudaMemcpyHostToDevice)); - - int gridsize, threadblocksize; - get_kernel_config(seqs.length(), parse_and_pack, gridsize, threadblocksize); - kernel_timer.start(); - parse_and_pack<<>>(dev_seqs, minimizer_len, kmer_len, num_kmer_longs, seqs.length(), dev_kmer_targets, - upcxx_rank_n); - - cudaErrchk(cudaMemset(dev_num_supermers, 0, sizeof(int))); - cudaErrchk(cudaMemset(dev_num_valid_kmers, 0, sizeof(int))); - get_kernel_config(num_kmers, build_supermers, gridsize, threadblocksize); - build_supermers<<>>(dev_seqs, dev_kmer_targets, num_kmers, kmer_len, seqs.length(), dev_supermers, - dev_num_supermers, dev_num_valid_kmers, upcxx_rank_me); - cudaErrchk(cudaMemcpy(&num_valid_kmers, dev_num_valid_kmers, sizeof(unsigned int), cudaMemcpyDeviceToHost)); - unsigned int num_supermers; - cudaErrchk(cudaMemcpy(&num_supermers, dev_num_supermers, sizeof(unsigned int), cudaMemcpyDeviceToHost)); - supermers.resize(num_supermers); - cudaErrchk(cudaMemcpy(&(supermers[0]), dev_supermers, num_supermers * sizeof(SupermerInfo), cudaMemcpyDeviceToHost)); - cudaErrchk(cudaEventSynchronize(dstate->event)); - cudaErrchk(cudaEventDestroy(dstate->event)); - kernel_timer.stop(); - t_kernel += kernel_timer.get_elapsed(); - func_timer.stop(); - t_func += func_timer.get_elapsed(); - return true; -} - -void kcount_gpu::ParseAndPackGPUDriver::pack_seq_block(const string &seqs) { - 
gpu_utils::set_gpu_device(dstate->rank_me); - int packed_seqs_len = halve_up(seqs.length()); - cudaErrchk(cudaMemcpy(dev_seqs, &seqs[0], seqs.length(), cudaMemcpyHostToDevice)); - cudaErrchk(cudaMemset(dev_packed_seqs, 0, packed_seqs_len)); - int gridsize, threadblocksize; - get_kernel_config(packed_seqs_len, pack_seqs, gridsize, threadblocksize); - GPUTimer t; - t.start(); - pack_seqs<<>>(dev_seqs, dev_packed_seqs, seqs.length()); - // this GPUTimer forces a wait for the GPU kernel to complete - t.stop(); - t_kernel += t.get_elapsed(); - packed_seqs.resize(packed_seqs_len); - cudaErrchk(cudaMemcpy(&(packed_seqs[0]), dev_packed_seqs, packed_seqs_len, cudaMemcpyDeviceToHost)); -} - -tuple kcount_gpu::ParseAndPackGPUDriver::get_elapsed_times() { return {t_func, t_kernel}; } diff --git a/src/kcount/kcount-gpu/parse_and_pack.hpp b/src/kcount/kcount-gpu/parse_and_pack.hpp index 7f21c2f..e69de29 100644 --- a/src/kcount/kcount-gpu/parse_and_pack.hpp +++ b/src/kcount/kcount-gpu/parse_and_pack.hpp @@ -1,89 +0,0 @@ -/* - HipMer v 2.0, Copyright (c) 2020, The Regents of the University of California, - through Lawrence Berkeley National Laboratory (subject to receipt of any required - approvals from the U.S. Dept. of Energy). All rights reserved." - - Redistribution and use in source and binary forms, with or without modification, - are permitted provided that the following conditions are met: - - (1) Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - (2) Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation and/or - other materials provided with the distribution. - - (3) Neither the name of the University of California, Lawrence Berkeley National - Laboratory, U.S. Dept. 
of Energy nor the names of its contributors may be used to - endorse or promote products derived from this software without specific prior - written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY - EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES - OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT - SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED - TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH - DAMAGE. - - You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades - to the features, functionality or performance of the source code ("Enhancements") to - anyone; however, if you choose to make your Enhancements available either publicly, - or directly to Lawrence Berkeley National Laboratory, without imposing a separate - written license agreement for such Enhancements, then you hereby grant the following - license: a non-exclusive, royalty-free perpetual license to install, use, modify, - prepare derivative works, incorporate into other computer software, distribute, and - sublicense such enhancements or derivative works thereof, in binary and source code - form. 
-*/ - -#pragma once - -#include - -namespace kcount_gpu { - -struct ParseAndPackDriverState; - -struct SupermerInfo { - int target; - int offset; - uint16_t len; -}; - -class ParseAndPackGPUDriver { - // this opaque data type stores CUDA specific variables - ParseAndPackDriverState *dstate = nullptr; - - int upcxx_rank_me; - int upcxx_rank_n; - int max_kmers; - int kmer_len; - int qual_offset; - int num_kmer_longs; - int minimizer_len; - double t_func = 0, t_malloc = 0, t_cp = 0, t_kernel = 0; - char *dev_seqs; - int *dev_kmer_targets; - - SupermerInfo *dev_supermers; - char *dev_packed_seqs; - unsigned int *dev_num_supermers; - unsigned int *dev_num_valid_kmers; - - public: - std::vector supermers; - std::string packed_seqs; - - ParseAndPackGPUDriver(int upcxx_rank_me, int upcxx_rank_n, int qual_offset, int kmer_len, int num_kmer_longs, int minimizer_len, - double &init_time); - ~ParseAndPackGPUDriver(); - bool process_seq_block(const std::string &seqs, unsigned int &num_valid_kmers); - void pack_seq_block(const std::string &seqs); - std::tuple get_elapsed_times(); -}; - -} // namespace kcount_gpu diff --git a/src/kcount/kcount-gpu/partitioned_counter.cpp b/src/kcount/kcount-gpu/partitioned_counter.cpp index a219f00..e69de29 100644 --- a/src/kcount/kcount-gpu/partitioned_counter.cpp +++ b/src/kcount/kcount-gpu/partitioned_counter.cpp @@ -1,68 +0,0 @@ -/* - * ============================================================================ - * - * Author: Prashant Pandey (), ppandey@cs.stonybrook.edu - * Organization: Stony Brook University - * - * ============================================================================ - */ - -//#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "partitioned_counter.hpp" - -#define min(a, b) ((a) < (b) ? 
(a) : (b)) - -int pc_init(pc_t *pc, int64_t *global_counter, uint32_t num_counters, int32_t threshold) { - int num_cpus = (int)sysconf(_SC_NPROCESSORS_ONLN); - if (num_cpus < 0) { - perror("sysconf"); - return PC_ERROR; - } - pc->num_counters = num_counters == 0 ? num_cpus : min((unsigned)num_cpus, num_counters); - - pc->local_counters = (lctr_t *)calloc(pc->num_counters, sizeof(*pc->local_counters)); - if (pc->local_counters == NULL) { - perror("Couldn't allocate memory for local counters."); - return PC_ERROR; - } - /*printf("Padding check: 0: %p 1: %p\n", (void*)&pc->local_counters[0],*/ - /*(void*)&pc->local_counters[1]);*/ - pc->global_counter = global_counter; - pc->threshold = threshold; - - return 0; -} - -void pc_destructor(pc_t *pc) { - pc_sync(pc); - lctr_t *lc = pc->local_counters; - pc->local_counters = NULL; - free(lc); -} - -void pc_add(pc_t *pc, int64_t count) { - int cpuid = sched_getcpu(); - uint32_t counter_id = cpuid % pc->num_counters; - int64_t cur_count = __atomic_add_fetch(&pc->local_counters[counter_id].counter, count, __ATOMIC_SEQ_CST); - if (cur_count > pc->threshold || cur_count < -pc->threshold) { - int64_t new_count = __atomic_exchange_n(&pc->local_counters[counter_id].counter, 0, __ATOMIC_SEQ_CST); - __atomic_fetch_add(pc->global_counter, new_count, __ATOMIC_SEQ_CST); - } -} - -void pc_sync(pc_t *pc) { - for (uint32_t i = 0; i < pc->num_counters; i++) { - int64_t c = __atomic_exchange_n(&pc->local_counters[i].counter, 0, __ATOMIC_SEQ_CST); - __atomic_fetch_add(pc->global_counter, c, __ATOMIC_SEQ_CST); - } -} diff --git a/src/kcount/kcount-gpu/partitioned_counter.hpp b/src/kcount/kcount-gpu/partitioned_counter.hpp index b6b50e8..e69de29 100644 --- a/src/kcount/kcount-gpu/partitioned_counter.hpp +++ b/src/kcount/kcount-gpu/partitioned_counter.hpp @@ -1,56 +0,0 @@ -#pragma once - -/* - * ============================================================================ - * - * Author: Prashant Pandey (), ppandey@cs.stonybrook.edu - * 
Organization: Stony Brook University - * - * ============================================================================ - */ - -#ifndef _PARTITIONED_COUNTER_CUH_ -#define _PARTITIONED_COUNTER_CUH_ - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct local_counter { - int64_t counter; - int64_t padding[7]; -} local_counter; - -typedef struct local_counter lctr_t; - -typedef struct partitioned_counter { - lctr_t *local_counters; - int64_t *global_counter; - uint32_t num_counters; - int32_t threshold; -} partitioned_counter; - -typedef struct partitioned_counter pc_t; - -#define PC_ERROR -1 - -/* on success returns 0. - * If allocation fails returns PC_ERROR - */ -int pc_init(pc_t *pc, int64_t *global_counter, uint32_t num_counters, int32_t threshold); - -void pc_destructor(pc_t *pc); - -void pc_add(pc_t *pc, int64_t count); - -void pc_sync(pc_t *pc); - -#ifdef __cplusplus -} -#endif - -#endif /* _PARTITIONED_COUNTER_CUH_ */ diff --git a/src/kcount/kcount_gpu.cpp b/src/kcount/kcount_gpu.cpp index 4b31e30..718982a 100644 --- a/src/kcount/kcount_gpu.cpp +++ b/src/kcount/kcount_gpu.cpp @@ -101,7 +101,7 @@ static void process_block(SeqBlockInserter *seq_block_inserter, dist_obje state->num_block_calls++; future fut = execute_in_thread_pool( [&state, &num_valid_kmers] { return state->pnp_gpu_driver->process_seq_block(state->seq_block, num_valid_kmers); }); - while (!fut.is_ready()) { + while (!fut.ready()) { state->num_pnp_gpu_waits++; progress(); } From dd25703b7fdc08bc4907ba32f0bad89c14e19b00 Mon Sep 17 00:00:00 2001 From: Jan Ciesko Date: Wed, 20 Mar 2024 10:17:17 -0700 Subject: [PATCH 09/13] Update operator definition syntax to make clang happy --- src/kmer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kmer.cpp b/src/kmer.cpp index 1a9f7b2..721c794 100644 --- a/src/kmer.cpp +++ b/src/kmer.cpp @@ -633,7 +633,7 @@ ostream &operator<<(ostream &out, const Kmer &k) { return out << k.to_string(); } 
-#define KMER_K(KMER_LEN) template ostream &operator<<(ostream &out, const Kmer &k); +#define KMER_K(KMER_LEN) template ostream &operator<< (ostream &out, const Kmer &k); KMER_K(32); #if MAX_BUILD_KMER >= 64 From de1d7e4dd93e50517816603f4e29b5cb96daacbb Mon Sep 17 00:00:00 2001 From: Jan Ciesko Date: Wed, 20 Mar 2024 10:40:13 -0700 Subject: [PATCH 10/13] Add HIP to CmakeFile --- CMakeLists.txt | 24 ++++----- src/CMakeLists.txt | 80 ++++++++++++++-------------- src/gpu-utils/CMakeLists.txt | 10 ++-- src/kcount/kcount-gpu/CMakeLists.txt | 18 +++---- test/CMakeLists.txt | 2 +- 5 files changed, 67 insertions(+), 67 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 74355c5..53fe9cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,23 +85,23 @@ if(NOT CMAKE_BUILD_TYPE) endif() include(CheckLanguage) -check_language(CUDA) -if(CMAKE_CUDA_COMPILER) - option(ENABLE_CUDA "Enable CUDA" ON) - if(ENABLE_CUDA) - set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - set(CMAKE_CUDA_RESOLVE_DEVICE_SYMBOLS OFF) # only build device link objects +check_language(HIP) +if(CMAKE_HIP_COMPILER) + option(ENABLE_HIP "Enable HIP" ON) + if(ENABLE_HIP) + set(CMAKE_HIP_SEPARABLE_COMPILATION ON) + set(CMAKE_HIP_RESOLVE_DEVICE_SYMBOLS OFF) # only build device link objects # for GPU targets - enable_language(CUDA) + enable_language(HIP) endif() else() - message(STATUS "No CUDA environment detected") - set(ENABLE_CUDA + message(STATUS "No HIP environment detected") + set(ENABLE_HIP OFF - CACHE BOOL "Enable CUDA" FORCE) + CACHE BOOL "Enable HIP" FORCE) endif() -if(ENABLE_CUDA) - message(STATUS "Building for GPU with CUDA") +if(ENABLE_HIP) + message(STATUS "Building for GPU with HIP") else() message(STATUS "Building for CPU only") endif() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5e979b6..859805d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -74,95 +74,95 @@ message( "Adding ZLIB for -I${ZLIB_INCLUDE_DIRS} and -L${ZLIB_BASEPATH}/lib64 -L${ZLIB_BASEPATH}/lib)" ) 
-if(ENABLE_CUDA) - message(STATUS "Building for GPUs with CUDA") +if(ENABLE_HIP) + message(STATUS "Building for GPUs with HIP") - set(CMAKE_CUDA_STANDARD + set(CMAKE_HIP_STANDARD 14 CACHE STRING "") - set(CMAKE_CUDA_STANDARD_REQUIRED + set(CMAKE_HIP_STANDARD_REQUIRED ON CACHE BOOL "") - set(CMAKE_CUDA_EXTENSIONS + set(CMAKE_HIP_EXTENSIONS OFF CACHE BOOL "") - find_package(CUDA REQUIRED) + find_package(HIP REQUIRED) if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.12) - cmake_policy(SET CMP0074 NEW) # use the CUDA_ROOT variable + cmake_policy(SET CMP0074 NEW) # use the HIP_ROOT variable endif() - enable_language(CUDA) - set(MHM2_MIN_CUDA_ARCH 70) + enable_language(HIP) + set(MHM2_MIN_HIP_ARCH 70) if(${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.18) - if(NOT CMAKE_CUDA_ARCHITECTURES) - include(FindCUDA/select_compute_arch) - cuda_detect_installed_gpus(INSTALLED_GPU_CCS_1) + if(NOT CMAKE_HIP_ARCHITECTURES) + include(FindHIP/select_compute_arch) + hip_detect_installed_gpus(INSTALLED_GPU_CCS_1) string(STRIP "${INSTALLED_GPU_CCS_1}" INSTALLED_GPU_CCS_2) string(REPLACE " " ";" INSTALLED_GPU_CCS_3 "${INSTALLED_GPU_CCS_2}") - string(REPLACE "." "" CUDA_ARCH_LIST "${INSTALLED_GPU_CCS_3}") - set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_LIST}) + string(REPLACE "." 
"" HIP_ARCH_LIST "${INSTALLED_GPU_CCS_3}") + set(CMAKE_HIP_ARCHITECTURES ${HIP_ARCH_LIST}) message( - STATUS "Autodetect CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" + STATUS "Autodetect CMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}" ) endif() - message(STATUS "Using CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}") + message(STATUS "Using CMAKE_HIP_ARCHITECTURES=${CMAKE_HIP_ARCHITECTURES}") else() - find_package(CUDA REQUIRED) - cuda_select_nvcc_arch_flags(ARCH_FLAGS_LIST) + find_package(HIP REQUIRED) + hip_select_nvcc_arch_flags(ARCH_FLAGS_LIST) if(ARCH_FLAGS_LIST) string(REPLACE ";" " " ARCH_FLAGS "${ARCH_FLAGS_LIST}") message("Found ARCH_FLAGS_LIST: ${ARCH_FLAGS_LIST}") - message("Found CUDA_GPU_DETECT_OUTPUT: ${CUDA_GPU_DETECT_OUTPUT}") - if(NOT CMAKE_CUDA_ARCHITECTURES) - string(REPLACE "." "" CMAKE_CUDA_ARCHITECTURES - "${CUDA_GPU_DETECT_OUTPUT}") + message("Found HIP_GPU_DETECT_OUTPUT: ${HIP_GPU_DETECT_OUTPUT}") + if(NOT CMAKE_HIP_ARCHITECTURES) + string(REPLACE "." 
"" CMAKE_HIP_ARCHITECTURES + "${HIP_GPU_DETECT_OUTPUT}") endif() else() - foreach(_CMAKE_CUDA_COMPUTE_CAPABILITY 70 80) + foreach(_CMAKE_HIP_COMPUTE_CAPABILITY 70 80) set(ARCH_FLAGS - "${ARCH_FLAGS} -gencode arch=compute_${_CMAKE_CUDA_COMPUTE_CAPABILITY},code=sm_${_CMAKE_CUDA_COMPUTE_CAPABILITY}" + "${ARCH_FLAGS} -gencode arch=compute_${_CMAKE_HIP_COMPUTE_CAPABILITY},code=sm_${_CMAKE_HIP_COMPUTE_CAPABILITY}" ) endforeach() - if(NOT CMAKE_CUDA_COMPUTE_CAPABILITY) - set(CMAKE_CUDA_COMPUTE_CAPABILITY + if(NOT CMAKE_HIP_COMPUTE_CAPABILITY) + set(CMAKE_HIP_COMPUTE_CAPABILITY 70 CACHE STRING "") endif() message( WARNING - "Could not auto-detect the GPU arch flags building for all gpu compute capabilities 3 - 7 and PTX for ${CMAKE_CUDA_COMPUTE_CAPABILITY}" + "Could not auto-detect the GPU arch flags building for all gpu compute capabilities 3 - 7 and PTX for ${CMAKE_HIP_COMPUTE_CAPABILITY}" ) set(ARCH_FLAGS - "${ARCH_FLAGS} -gencode arch=compute_${CMAKE_CUDA_COMPUTE_CAPABILITY},code=sm_${CMAKE_CUDA_COMPUTE_CAPABILITY}" + "${ARCH_FLAGS} -gencode arch=compute_${CMAKE_HIP_COMPUTE_CAPABILITY},code=sm_${CMAKE_HIP_COMPUTE_CAPABILITY}" ) set(ARCH_FLAGS - "${ARCH_FLAGS} -gencode arch=compute_${CMAKE_CUDA_COMPUTE_CAPABILITY},code=compute_${CMAKE_CUDA_COMPUTE_CAPABILITY}" + "${ARCH_FLAGS} -gencode arch=compute_${CMAKE_HIP_COMPUTE_CAPABILITY},code=compute_${CMAKE_HIP_COMPUTE_CAPABILITY}" ) endif() message(STATUS "ARCH_FLAGS=${ARCH_FLAGS}") endif() - foreach(test_cuda_arch ${CMAKE_CUDA_ARCHITECTURES}) - if(${test_cuda_arch} LESS ${MHM2_MIN_CUDA_ARCH}) + foreach(test_hip_arch ${CMAKE_HIP_ARCHITECTURES}) + if(${test_hip_arch} LESS ${MHM2_MIN_HIP_ARCH}) message( FATAL_ERROR - "CUDA architecture ${test_cuda_arch} is incompatible with the minimum ${MHM2_MIN_CUDA_ARCH}. Try compiling without cuda: -DENABLE_CUDA=OFF" + "HIP architecture ${test_hip_arch} is incompatible with the minimum ${MHM2_MIN_HIP_ARCH}. 
Try compiling without hip: -DENABLE_HIP=OFF" ) endif() endforeach() set(CMAKE_POSITION_INDEPENDENT_CODE ON) - set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - set(CMAKE_CUDA_PTX_COMPILATION ON) + set(CMAKE_HIP_SEPARABLE_COMPILATION ON) + set(CMAKE_HIP_PTX_COMPILATION ON) - set(CMAKE_CUDA_FLAGS - "${CMAKE_CUDA_FLAGS} -Xcompiler=-Wall ${ARCH_FLAGS} -Wno-deprecated-gpu-targets" + set(CMAKE_HIP_FLAGS + "${CMAKE_HIP_FLAGS} ${ARCH_FLAGS} -Wno-deprecated-gpu-targets" ) - message(STATUS "CMAKE_CUDA_FLAGS=${CMAKE_CUDA_FLAGS}") + message(STATUS "CMAKE_HIP_FLAGS=${CMAKE_HIP_FLAGS}") else() message(STATUS "Building for CPUs") @@ -215,7 +215,7 @@ foreach( list(APPEND MHM2_TARGET_OBJECTS $) endforeach() -if(ENABLE_CUDA) +if(ENABLE_HIP) set(tgt devices_gpu) add_library(${tgt} OBJECT ${tgt}.cpp) if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.12) @@ -250,8 +250,8 @@ set(MHM2_LINK_LIBRARIES include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -if(ENABLE_CUDA) - set_property(TARGET mhm2 PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS OFF) +if(ENABLE_HIP) + set_property(TARGET mhm2 PROPERTY HIP_RESOLVE_DEVICE_SYMBOLS OFF) include_directories("gpu-utils") add_subdirectory(gpu-utils) set(MHM2_LINK_LIBRARIES ${MHM2_LINK_LIBRARIES} GPU_UTILS_LIBRARY_static) diff --git a/src/gpu-utils/CMakeLists.txt b/src/gpu-utils/CMakeLists.txt index faaef64..4700e06 100644 --- a/src/gpu-utils/CMakeLists.txt +++ b/src/gpu-utils/CMakeLists.txt @@ -1,5 +1,5 @@ -if(NOT ENABLE_CUDA) - message(FATAL_ERROR "Trying to build GPU-UTILS but CUDA is not enabled") +if(NOT ENABLE_HIP) + message(FATAL_ERROR "Trying to build GPU-UTILS but HIP is not enabled") endif() add_library(GPU_UTILS_LIBRARY_obj OBJECT gpu_utils.cpp gpu_common.cpp) @@ -8,7 +8,7 @@ if(${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.13) endif() set_source_files_properties(gpu_utils.cpp gpu_common.cpp - PROPERTIES LANGUAGE CUDA LINKER_LANGUAGE CUDA) + PROPERTIES LANGUAGE HIP LINKER_LANGUAGE HIP) option(GPU_UTILS_SHARED "GPU-utils shared library" OFF) option(GPU_UTILS_STATIC 
"GPU-utils static library" ON) @@ -16,7 +16,7 @@ if(GPU_UTILS_SHARED) add_library(GPU_UTILS_LIBRARY_shared SHARED $) set_property(TARGET GPU_UTILS_LIBRARY_shared - PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + PROPERTY HIP_RESOLVE_DEVICE_SYMBOLS ON) target_link_libraries(GPU_UTILS_LIBRARY_shared INTERFACE) install(TARGETS GPU_UTILS_LIBRARY_shared LIBRARY DESTINATION lib) endif() @@ -25,6 +25,6 @@ if(GPU_UTILS_STATIC) $) target_link_libraries(GPU_UTILS_LIBRARY_static INTERFACE) set_property(TARGET GPU_UTILS_LIBRARY_static - PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON) + PROPERTY HIP_RESOLVE_DEVICE_SYMBOLS ON) install(TARGETS GPU_UTILS_LIBRARY_static ARCHIVE DESTINATION lib) endif() diff --git a/src/kcount/kcount-gpu/CMakeLists.txt b/src/kcount/kcount-gpu/CMakeLists.txt index 0afbeae..dd84757 100644 --- a/src/kcount/kcount-gpu/CMakeLists.txt +++ b/src/kcount/kcount-gpu/CMakeLists.txt @@ -9,16 +9,16 @@ if(${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.13) -suppress-stack-size-warning>) endif() -set_source_files_properties(parse_and_pack.cpp PROPERTIES LANGUAGE CUDA - LINKER_LANGUAGE CUDA) -set_source_files_properties(gpu_hash_table.cpp PROPERTIES LANGUAGE CUDA - LINKER_LANGUAGE CUDA) -set_source_files_properties(gqf.cpp PROPERTIES LANGUAGE CUDA LINKER_LANGUAGE - CUDA) -set_source_files_properties(hashutil.cpp PROPERTIES LANGUAGE CUDA - LINKER_LANGUAGE CUDA) +set_source_files_properties(parse_and_pack.cpp PROPERTIES LANGUAGE HIP + LINKER_LANGUAGE HIP) +set_source_files_properties(gpu_hash_table.cpp PROPERTIES LANGUAGE HIP + LINKER_LANGUAGE HIP) +set_source_files_properties(gqf.cpp PROPERTIES LANGUAGE HIP LINKER_LANGUAGE + HIP) +set_source_files_properties(hashutil.cpp PROPERTIES LANGUAGE HIP + LINKER_LANGUAGE HIP) set_source_files_properties(partitioned_counter.cpp - PROPERTIES LANGUAGE CUDA LINKER_LANGUAGE CUDA) + PROPERTIES LANGUAGE HIP LINKER_LANGUAGE HIP) option(KCOUNT_GPU_SHARED "kcount-GPU shared library" OFF) option(KCOUNT_GPU_STATIC "kcount-GPU static library" ON) diff 
--git a/test/CMakeLists.txt b/test/CMakeLists.txt index a95e4ad..88a4d5c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -11,7 +11,7 @@ add_subdirectory(googletest) if(ENABLE_CUDA) add_definitions(-DENABLE_GPUS) - message(STATUS "Building tests for GPUs with CUDA") + message(STATUS "Building tests for GPUs with HIP") endif() set(BINARY mhm2_test) From e023280c967caa85c195f11da004ccd0e15e8495 Mon Sep 17 00:00:00 2001 From: Jan Ciesko Date: Wed, 20 Mar 2024 12:01:52 -0700 Subject: [PATCH 11/13] Hipification: round two --- src/gpu-utils/gpu_common.cpp | 22 +++++++++++----------- src/gpu-utils/gpu_common.hpp | 15 ++++++++------- src/gpu-utils/gpu_utils.cpp | 30 +++++++++++++++--------------- src/mhm2.py | 2 +- 4 files changed, 35 insertions(+), 34 deletions(-) diff --git a/src/gpu-utils/gpu_common.cpp b/src/gpu-utils/gpu_common.cpp index dc70a8a..df21bb3 100644 --- a/src/gpu-utils/gpu_common.cpp +++ b/src/gpu-utils/gpu_common.cpp @@ -48,9 +48,9 @@ namespace gpu_common { -void gpu_die(cudaError_t code, const char *file, int line, bool abort) { - if (code != cudaSuccess) { - std::cerr << KLRED << "<" << file << ":" << line << "> ERROR:" << KNORM << cudaGetErrorString(code) << "\n"; +void gpu_die(hipError_t code, const char *file, int line, bool abort) { + if (code != hipSuccess) { + std::cerr << KLRED << "<" << file << ":" << line << "> ERROR:" << KNORM << hipGetErrorString(code) << "\n"; std::abort(); // do not throw exceptions -- does not work properly within progress() throw std::runtime_error(outstr); } @@ -71,23 +71,23 @@ void QuickTimer::inc(double s) { secs += s; } double QuickTimer::get_elapsed() { return secs; } GPUTimer::GPUTimer() { - cudaErrchk(cudaEventCreate(&start_event)); - cudaErrchk(cudaEventCreate(&stop_event)); + cudaErrchk(hipEventCreate(&start_event)); + cudaErrchk(hipEventCreate(&stop_event)); elapsed_t_ms = 0; } GPUTimer::~GPUTimer() { - cudaErrchk(cudaEventDestroy(start_event)); - cudaErrchk(cudaEventDestroy(stop_event)); + 
cudaErrchk(hipEventDestroy(start_event)); + cudaErrchk(hipEventDestroy(stop_event)); } -void GPUTimer::start() { cudaErrchk(cudaEventRecord(start_event, 0)); } +void GPUTimer::start() { cudaErrchk(hipEventRecord(start_event, 0)); } void GPUTimer::stop() { - cudaErrchk(cudaEventRecord(stop_event, 0)); - cudaErrchk(cudaEventSynchronize(stop_event)); + cudaErrchk(hipEventRecord(stop_event, 0)); + cudaErrchk(hipEventSynchronize(stop_event)); float ms; - cudaErrchk(cudaEventElapsedTime(&ms, start_event, stop_event)); + cudaErrchk(hipEventElapsedTime(&ms, start_event, stop_event)); elapsed_t_ms += ms; } diff --git a/src/gpu-utils/gpu_common.hpp b/src/gpu-utils/gpu_common.hpp index b4b094c..2ebe6f3 100644 --- a/src/gpu-utils/gpu_common.hpp +++ b/src/gpu-utils/gpu_common.hpp @@ -1,3 +1,4 @@ +#include "hip/hip_runtime.h" /* HipMer v 2.0, Copyright (c) 2020, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required @@ -44,8 +45,8 @@ #include #include -#include -#include +//#include +//#include // Functions that are common to all cuda code; not to be used by upcxx code @@ -71,7 +72,7 @@ static __constant__ uint64_t GPU_TWINS[256] = { 0xC8, 0x88, 0x48, 0x08, 0xF4, 0xB4, 0x74, 0x34, 0xE4, 0xA4, 0x64, 0x24, 0xD4, 0x94, 0x54, 0x14, 0xC4, 0x84, 0x44, 0x04, 0xF0, 0xB0, 0x70, 0x30, 0xE0, 0xA0, 0x60, 0x20, 0xD0, 0x90, 0x50, 0x10, 0xC0, 0x80, 0x40, 0x00}; -void gpu_die(cudaError_t code, const char *file, int line, bool abort = true); +void gpu_die(hipError_t code, const char *file, int line, bool abort = true); using timepoint_t = std::chrono::time_point; @@ -88,7 +89,7 @@ class QuickTimer { }; class GPUTimer { - cudaEvent_t start_event, stop_event; + hipEvent_t start_event, stop_event; float elapsed_t_ms = 0; public: @@ -101,8 +102,8 @@ class GPUTimer { inline __device__ int warpReduceSum(int val, int n) { unsigned int threadid = blockIdx.x * blockDim.x + threadIdx.x; - unsigned mask = __ballot_sync(0xffffffff, 
threadid < n); - for (int offset = warpSize / 2; offset > 0; offset /= 2) val += __shfl_down_sync(mask, val, offset); + unsigned mask = __ballot(threadid < n); /*JC*/ + for (int offset = warpSize / 2; offset > 0; offset /= 2) val += __shfl_down(mask, val, offset); /*JC*/ return val; } @@ -134,7 +135,7 @@ template inline void get_kernel_config(unsigned max_val, T func, int &gridsize, int &threadblocksize) { int mingridsize = 0; threadblocksize = 0; // 1024 - cudaErrchk(cudaOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, func, 0, 0)); + cudaErrchk(hipOccupancyMaxPotentialBlockSize(&mingridsize, &threadblocksize, func, 0, 0)); gridsize = (max_val + threadblocksize - 1) / threadblocksize; } diff --git a/src/gpu-utils/gpu_utils.cpp b/src/gpu-utils/gpu_utils.cpp index 402997a..8cb35a2 100644 --- a/src/gpu-utils/gpu_utils.cpp +++ b/src/gpu-utils/gpu_utils.cpp @@ -45,8 +45,8 @@ #include #include #include -#include -#include +//#include +//#include #include "gpu_utils.hpp" #include "upcxx_utils/colors.h" @@ -59,8 +59,8 @@ static int _rank_me = -1; static int get_gpu_device_count() { if (!_device_count) { - auto res = cudaGetDeviceCount(&_device_count); - if (res != cudaSuccess) return 0; + auto res = hipGetDeviceCount(&_device_count); + if (res != hipSuccess) return 0; } return _device_count; } @@ -71,27 +71,27 @@ void gpu_utils::set_gpu_device(int rank_me) { exit(1); } int num_devs = get_gpu_device_count(); - cudaErrchk(cudaSetDevice(rank_me % num_devs)); + cudaErrchk(hipSetDevice(rank_me % num_devs)); } size_t gpu_utils::get_gpu_tot_mem() { set_gpu_device(_rank_me); - cudaDeviceProp prop; - cudaErrchk(cudaGetDeviceProperties(&prop, 0)); + hipDeviceProp_t prop; + cudaErrchk(hipGetDeviceProperties(&prop, 0)); return prop.totalGlobalMem; } size_t gpu_utils::get_gpu_avail_mem() { set_gpu_device(_rank_me); size_t free_mem, tot_mem; - cudaErrchk(cudaMemGetInfo(&free_mem, &tot_mem)); + cudaErrchk(hipMemGetInfo(&free_mem, &tot_mem)); return free_mem; } string 
gpu_utils::get_gpu_device_name() { set_gpu_device(_rank_me); - cudaDeviceProp prop; - cudaErrchk(cudaGetDeviceProperties(&prop, 0)); + hipDeviceProp_t prop; + cudaErrchk(hipGetDeviceProperties(&prop, 0)); return prop.name; } @@ -107,8 +107,8 @@ vector gpu_utils::get_gpu_uuids() { vector uuids; int num_devs = get_gpu_device_count(); for (int i = 0; i < num_devs; ++i) { - cudaDeviceProp prop; - cudaErrchk(cudaGetDeviceProperties(&prop, i)); + hipDeviceProp_t prop; + cudaErrchk(hipGetDeviceProperties(&prop, i)); #if (CUDA_VERSION >= 10000) uuids.push_back(get_uuid_str(prop.uuid.bytes)); #else @@ -136,18 +136,18 @@ void gpu_utils::initialize_gpu(double& time_to_initialize, int rank_me) { if (!gpus_present()) return; _rank_me = rank_me; set_gpu_device(_rank_me); - cudaErrchk(cudaDeviceReset()); + cudaErrchk(hipDeviceReset()); elapsed = chrono::high_resolution_clock::now() - t; time_to_initialize = elapsed.count(); } string gpu_utils::get_gpu_device_descriptions() { - cudaDeviceProp prop; + hipDeviceProp_t prop; int num_devs = get_gpu_device_count(); ostringstream os; os << "Number of GPU devices visible: " << num_devs << "\n"; for (int i = 0; i < num_devs; ++i) { - cudaErrchk(cudaGetDeviceProperties(&prop, i)); + cudaErrchk(hipGetDeviceProperties(&prop, i)); os << "GPU Device number: " << i << "\n"; os << " Device name: " << prop.name << "\n"; diff --git a/src/mhm2.py b/src/mhm2.py index 5b929c7..8386c3e 100755 --- a/src/mhm2.py +++ b/src/mhm2.py @@ -475,7 +475,7 @@ def main(): halfnoderanks += ',' + str(n*cores) + ',' + str(n*cores+cores/2) # set extra GASNET environments from build and/or options to mhm2.py - runtime_vars = """@MHM2PY_RUNTIME_ENV@""" + runtime_vars = """""" if runtime_vars == '@MHM2PY_RUNTIME' + '_ENV@': runtime_vars = '' runtime_output_vars = '' From 6ada10f4d140b611b80f289953d89681e6cd5c4b Mon Sep 17 00:00:00 2001 From: Jan Ciesko Date: Wed, 20 Mar 2024 12:12:24 -0700 Subject: [PATCH 12/13] Add hip specific cmake build flags --- src/CMakeLists.txt | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 859805d..f970d64 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -159,7 +159,7 @@ if(ENABLE_HIP) set(CMAKE_HIP_PTX_COMPILATION ON) set(CMAKE_HIP_FLAGS - "${CMAKE_HIP_FLAGS} ${ARCH_FLAGS} -Wno-deprecated-gpu-targets" + "${CMAKE_HIP_FLAGS} ${ARCH_FLAGS} -Wno-deprecated-gpu-targets -fgpu-rdc --hip-link" ) message(STATUS "CMAKE_HIP_FLAGS=${CMAKE_HIP_FLAGS}") From 51af4f83c5e37268a7f5d54c1bbb8fe091262387 Mon Sep 17 00:00:00 2001 From: Amy Powell Date: Wed, 20 Mar 2024 14:30:06 -0700 Subject: [PATCH 13/13] update is_ready() to ready(); --- .../include/upcxx_utils/flat_aggr_store.hpp | 2 +- .../include/upcxx_utils/three_tier_aggr_store.hpp | 2 +- upcxx-utils/src/limit_outstanding.cpp | 14 +++++++------- upcxx-utils/src/ofstream.cpp | 2 +- upcxx-utils/src/promise_collectives.cpp | 4 ++-- upcxx-utils/src/reduce_prefix.cpp | 8 ++++---- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp b/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp index b96e2ad..dc11e3a 100644 --- a/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp +++ b/upcxx-utils/include/upcxx_utils/flat_aggr_store.hpp @@ -466,7 +466,7 @@ class FlatAggrStore { do { fut = limit_outstanding_futures(fut); - } while (!fut.is_ready()); + } while (!fut.ready()); } CountType max_vals[2], sum_vals[2]; diff --git a/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp b/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp index c4f8b43..a60f89f 100644 --- a/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp +++ b/upcxx-utils/include/upcxx_utils/three_tier_aggr_store.hpp @@ -1073,7 +1073,7 @@ class ThreeTierAggrStore : public FlatAggrStore { do { fut = limit_outstanding_futures(fut); - } while (!fut.is_ready()); + } while (!fut.ready()); } } auto fut_done = flush_outstanding_futures_async(); diff --git 
a/upcxx-utils/src/limit_outstanding.cpp b/upcxx-utils/src/limit_outstanding.cpp index e65d063..ed08bf9 100644 --- a/upcxx-utils/src/limit_outstanding.cpp +++ b/upcxx-utils/src/limit_outstanding.cpp @@ -23,7 +23,7 @@ upcxx::future<> upcxx_utils::collapse_outstanding_futures(int limit, LimitedFutu while (outstanding_queue.size() > limit) { auto fut = outstanding_queue.front(); outstanding_queue.pop_front(); - if (!fut.is_ready()) returned_future = upcxx::when_all(fut, returned_future); + if (!fut.ready()) returned_future = upcxx::when_all(fut, returned_future); } DBG("limit=", limit, " outstanding=", outstanding_queue.size(), " max_check=", max_check, "\n"); if (limit == 0) { @@ -31,24 +31,24 @@ upcxx::future<> upcxx_utils::collapse_outstanding_futures(int limit, LimitedFutu } else { assert(outstanding_queue.size() <= limit); int i = 0; - while (i < max_check && !returned_future.is_ready() && i < outstanding_queue.size()) { + while (i < max_check && !returned_future.ready() && i < outstanding_queue.size()) { // find a ready future in the queue to swap with auto &test_fut = outstanding_queue[i++]; - if (test_fut.is_ready()) { + if (test_fut.ready()) { std::swap(test_fut, returned_future); - assert(returned_future.is_ready()); + assert(returned_future.ready()); break; } } } } - DBG("limit=", limit, " outstanding=", outstanding_queue.size(), " max_check=", max_check, ", ret=", returned_future.is_ready(), + DBG("limit=", limit, " outstanding=", outstanding_queue.size(), " max_check=", max_check, ", ret=", returned_future.ready(), "\n"); return returned_future; } void upcxx_utils::add_outstanding_future(upcxx::future<> fut, LimitedFutureQueue &outstanding_queue) { - if (!fut.is_ready()) outstanding_queue.push_back(fut); + if (!fut.ready()) outstanding_queue.push_back(fut); } upcxx::future<> upcxx_utils::limit_outstanding_futures(int limit, LimitedFutureQueue &outstanding_queue) { @@ -62,7 +62,7 @@ upcxx::future<> upcxx_utils::limit_outstanding_futures(upcxx::future<> fut, 
int if (outstanding_queue.empty()) return fut; return upcxx::when_all(collapse_outstanding_futures(limit, outstanding_queue), fut); } - if (fut.is_ready()) { + if (fut.ready()) { if (outstanding_queue.size() <= limit) return fut; } else { outstanding_queue.push_back(fut); diff --git a/upcxx-utils/src/ofstream.cpp b/upcxx-utils/src/ofstream.cpp index 784820f..cd311ac 100644 --- a/upcxx-utils/src/ofstream.cpp +++ b/upcxx-utils/src/ofstream.cpp @@ -841,7 +841,7 @@ dist_ofstream::~dist_ofstream() { if (!is_closed) close(); assert(is_closed); stringstream().swap(ss); - DBG_VERBOSE("close_fut=", close_fut.is_ready(), "\n"); + DBG_VERBOSE("close_fut=", close_fut.ready(), "\n"); } void dist_ofstream::close() { diff --git a/upcxx-utils/src/promise_collectives.cpp b/upcxx-utils/src/promise_collectives.cpp index 9f059cb..fd9ee7d 100644 --- a/upcxx-utils/src/promise_collectives.cpp +++ b/upcxx-utils/src/promise_collectives.cpp @@ -115,14 +115,14 @@ upcxx_utils::PromiseBarrier::~PromiseBarrier() { DBG_VERBOSE("Destroy this=", this, " move=", moved, "\n"); if (moved) return; // invalidated assert(upcxx::master_persona().active_with_caller()); - assert(dist_workflow->initiated_prom.get_future().is_ready()); + assert(dist_workflow->initiated_prom.get_future().ready()); get_future().wait(); } void upcxx_utils::PromiseBarrier::fulfill() const { DBG_VERBOSE("fulfill this=", this, "\n"); assert(upcxx::master_persona().active_with_caller()); - assert(!dist_workflow->initiated_prom.get_future().is_ready()); + assert(!dist_workflow->initiated_prom.get_future().ready()); dist_workflow->initiated_prom.fulfill_anonymous(1); } diff --git a/upcxx-utils/src/reduce_prefix.cpp b/upcxx-utils/src/reduce_prefix.cpp index 0ca447e..ecd0aa6 100644 --- a/upcxx-utils/src/reduce_prefix.cpp +++ b/upcxx-utils/src/reduce_prefix.cpp @@ -124,8 +124,8 @@ future<> binary_tree_steps::get_future() const { // up phase is done bool binary_tree_steps::up_ready() const { - return 
dst_is_partial_left_me.get_future().is_ready() && scratch_is_partial_right.get_future().is_ready() && - scratch_is_partial_to_parent.get_future().is_ready() && sent_partial_to_parent.get_future().is_ready(); + return dst_is_partial_left_me.get_future().ready() && scratch_is_partial_right.get_future().ready() && + scratch_is_partial_to_parent.get_future().ready() && sent_partial_to_parent.get_future().ready(); } future<> binary_tree_steps::get_up_future() const { @@ -135,8 +135,8 @@ future<> binary_tree_steps::get_up_future() const { // down phase is done bool binary_tree_steps::down_ready() const { - return scratch_is_partial_from_parent.get_future().is_ready() && sent_left_child.get_future().is_ready() && - sent_right_child.get_future().is_ready(); + return scratch_is_partial_from_parent.get_future().ready() && sent_left_child.get_future().ready() && + sent_right_child.get_future().ready(); } future<> binary_tree_steps::get_down_future() const {