diff --git a/CMakeLists.txt b/CMakeLists.txt index 63b1877..0f29f3a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,83 @@ if(ENABLE_ADDRESS_SANITIZER) message(STATUS "AddressSanitizer is ON") endif() +set(SDSL_ROOT "" CACHE PATH "Installation prefix of sdsl-lite (contains include/ and lib/)") + +find_path( + SDSL_INCLUDE_DIR + NAMES sdsl/config.hpp + HINTS + "${SDSL_ROOT}" + PATH_SUFFIXES + include + PATHS + "$ENV{HOME}" + /usr + /usr/local +) + +if(NOT SDSL_INCLUDE_DIR) + message(FATAL_ERROR "Could not find sdsl/config.hpp. " + "Set SDSL_ROOT or SDSL_INCLUDE_DIR manually.") +endif() + +find_library( + SDSL_LIBRARY + NAMES sdsl + HINTS + "${SDSL_ROOT}" + PATH_SUFFIXES + lib + lib64 + PATHS + "$ENV{HOME}" + /usr + /usr/local +) + +find_library( + DIVSUFSORT_LIBRARY + NAMES divsufsort + HINTS + "${SDSL_ROOT}" + PATH_SUFFIXES + lib + lib64 + PATHS + "$ENV{HOME}" + /usr + /usr/local +) + +find_library( + DIVSUFSORT64_LIBRARY + NAMES divsufsort64 + HINTS + "${SDSL_ROOT}" + PATH_SUFFIXES + lib + lib64 + PATHS + "$ENV{HOME}" + /usr + /usr/local +) + +if(NOT SDSL_LIBRARY OR NOT DIVSUFSORT_LIBRARY OR NOT DIVSUFSORT64_LIBRARY) + message(FATAL_ERROR "Could not find sdsl-lite libraries (sdsl, divsufsort, divsufsort64). " + "Set SDSL_ROOT or SDSL_LIBRARY*/DIVSUFSORT* manually.") +endif() + +add_library(sdsl_lite INTERFACE) +target_include_directories(sdsl_lite INTERFACE "${SDSL_INCLUDE_DIR}") +target_link_libraries(sdsl_lite INTERFACE + "${SDSL_LIBRARY}" + "${DIVSUFSORT_LIBRARY}" + "${DIVSUFSORT64_LIBRARY}" +) +message(STATUS "Found sdsl-lite includes in: ${SDSL_INCLUDE_DIR}") +message(STATUS "Found sdsl-lite libs: ${SDSL_LIBRARY}; ${DIVSUFSORT_LIBRARY}; ${DIVSUFSORT64_LIBRARY}") + include(FetchContent) FetchContent_Declare( googletest @@ -103,6 +180,17 @@ target_link_libraries(test_rmm gtest gtest_main) +add_executable(bench_rmm_sdsl + src/bench_rmm_sdsl.cpp +) +target_include_directories(bench_rmm_sdsl + PUBLIC include +) +target_link_libraries(bench_rmm_sdsl + PRIVATE + benchmark + sdsl_lite +) FetchContent_Declare( doxygen-awesome-css diff --git a/README.md b/README.md index f1a5096..db4ef89 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,14 @@ Benchmarks are random 50/50 0-1 bitvectors up to $2^{34}$ bits. ./bench_rmm ``` -For visualization, write the JSON output to a file using `--benchmark_out=` (e.g. `./bench_rmm --benchmark_out=rmm_bench.json`) and plot it with `misc/plot_rmm.py`. +For comparison with range min-max tree implementation from [sdsl-lite](https://github.com/simongog/sdsl-lite) (Release build required: `cmake .. -DCMAKE_BUILD_TYPE=Release`): + +```bash +sudo cpupower frequency-set --governor performance +./bench_rmm_sdsl --benchmark_out=rmm_bench_sdsl.json +``` + +For visualization, write the JSON output to a file using `--benchmark_out=` (e.g. `./bench_rmm --benchmark_out=rmm_bench.json`) and plot it with `misc/plot_rmm.py` (add `--sdsl-json rmm_bench_sdsl.json` for comparison). --- diff --git a/REPORT.md b/REPORT.md new file mode 100644 index 0000000..120fbd7 --- /dev/null +++ b/REPORT.md @@ -0,0 +1,258 @@ +# Отчёт по семестровому проекту: реализация Range Min–Max Tree (RmMTree) + +## 1. Введение + +В рамках семестрового проекта была реализована структура данных **Range Min–Max Tree (RmMTree)** над битовым вектором для операций над **правильными скобочными последовательностями (balanced parentheses, BP)**. Реализация основана на идеях из статьи (ScienceDirect): https://www.sciencedirect.com/science/article/pii/S0304397516300706 + +Цель проекта — получить +- **сжатую** ($o(n)$ доп. памяти) +- и **быструю** (все операции выполняются за $O(\log n)$ времени и используют различные оптимизации, существенно уменьшающие константы) +- структуру, поддерживающую типовой набор запросов: + - `rank` / `select` по битам, + - операции с **excess** (префиксная сумма с шагами +1/−1), + - `fwdsearch` / `bwdsearch`, + - диапазонные запросы минимума/максимума по префиксным суммам, + - навигацию по скобочной последовательности: `close`, `open`, `enclose`, + - дополнительные операции `rank10` / `select10` для шаблона `"10"`. + +В проекте присутствуют: +1) наивная эталонная реализация (для проверки на корректность); +2) основная, оптимизированная, реализация; +3) набор тестов на корректность (GoogleTest) с сравнением "наивная vs оптимизированная", +4) бенчмарки (Google Benchmark) и скрипт построения графиков для визуализации результатов. + +--- + +## 2. Постановка задачи и требования + +### 2.1. Входные данные +- Битовая строка `'0'/'1'` или массив 64-битных слов `uint64_t` с укладкой **LSB-first** (младшие биты — более ранние позиции). +- Число валидных бит `num_bits`. + +### 2.2. Основные поддерживаемые операции + +#### Ранг/выборка +- `rank1(i)`, `rank0(i)` — количество `1` или `0` на префиксе `[0, i)`. +- `select1(k)`, `select0(k)` — позиция k-й единицы/нуля (k **1-based**). + +#### Шаблон `"10"` +- `rank10(i)` — число позиций `p` в `[0, i)` таких что `bit[p]==1` и `bit[p+1]==0` и `p+1 bits;` +- укладка: **LSB-first** (бит `i` находится в слове `bits[i>>6]` на позиции `(i & 63)`). + +Это решение удобно, потому что: +- `rank` в блоке можно ускорять через `popcount`, +- `select` в блоке делается через перебор установленных битов (внутри 64 бит), +- удобно читать байты/16-битные фрагменты для LUT и SIMD. + +#### 3.2.2. Идея блокирования и дерева +Битовый вектор делится на **листья** фиксированного размера `block_bits` (по умолчанию минимум 64, далее может расти по степеням двойки). +Количество листьев: +- `leaf_count = ceil(num_bits / block_bits)` + +Далее строится **сбалансированное двоичное дерево** в heap-представлении, где: +- листья соответствуют блокам, +- внутренние вершины агрегируют статистики по своим поддеревьям. + +Индексация: +- `1` — корень, +- `first_leaf_index` — индекс первого листа (степень двойки ≥ `leaf_count`), +- лист `k` хранится в `first_leaf_index + k`. + +#### 3.2.3. Какие агрегаты хранятся в вершинах + +Для каждого узла дерева хранится набор агрегатов (но не в виде структуры, а в отдельных массивах, где индекс массива совпадает с номером узла): +- `segment_size_bits[node]` — длина сегмента (в битах), покрываемого узлом. +- `node_total_excess[node]` — суммарный excess на сегменте (итоговая сумма шагов +1/−1). +- `node_min_prefix_excess[node]` — минимум префиксной суммы внутри сегмента (относительно начала сегмента). +- `node_max_prefix_excess[node]` — максимум префиксной суммы внутри сегмента. +- `node_min_count[node]` — число позиций, где минимум достигается. +- `node_pattern10_count[node]` — число шаблонов `"10"` внутри сегмента. +- `node_first_bit[node]`, `node_last_bit[node]` — крайние биты сегмента (нужны для учёта `"10"` на границах между детьми/узлами). + +--- + +## 4. Ускорение сканирования листьев: LUT по байтам и SIMD + +### 4.1. LUT на 8 бит + +Для обработки хвостов и небольших диапазонов сделана lookup table на все 256 возможных значений байта. + +Структура `ByteAgg` хранит для байта: +- суммарный excess, +- min/max префикса, +- количество минимумов, +- число `"10"` внутри байта, +- первый/последний бит, +- позиции первого min/max. + +Также построены таблицы: +- `fwd_pos[byte][delta]` — первая позиция в байте, где префикс равен `delta` (delta ∈ [-8..8]), +- `bwd_pos[byte][delta]` — последняя позиция. + +Это позволяет: +- быстро находить минимум/максимум и их позиции на диапазонах, выровненных по байтам, +- быстро искать нужный префикс для маленьких дельт, +- ускорить `mincount/minselect`, +- ускорить локальные сканирования в `fwdsearch/bwdsearch`. + +### 4.2. SIMD (AVX2) для поиска по префиксу в листе + +В реализации предусмотрена опциональная поддержка AVX2 (`PIXIE_AVX2_SUPPORT`): +- берутся 16 бит (`get_u16`), +- разворачиваются в 16 "шагов" (+1/−1) по lane’ам, +- вычисляется префиксная сумма внутри SIMD-регистра, +- сравнивается с целевым значением, и по mask находится первая/последняя позиция. + +Дополнительно использована эвристика: +- для "маленьких хвостов" и "маленьких дельт" (пример: delta ∈ [-8..8], tail ≤ 256) LUT8 может быть быстрее, чем AVX2 из-за стоимости подготовки SIMD. + +--- + +## 5. Выбор размера блока и оценка накладных расходов + +В `RmMTree` предусмотрен механизм выбора `block_bits`: +- можно задать явно через конструктор/параметр `leaf_block_bits`, +- можно включить ограничение на относительные накладные расходы по памяти *(объём вспомогательных массивов по отношению к размеру битвектора, далее — **overhead**)* через `max_overhead`$^*$, +- иначе выбирается значение, близкое к `ceil_pow2(log2(num_bits))`, но не меньше базового порога. + +Это даёт возможность балансировать: +- **скорость** (меньше блок → больше дерево → больше узлов, но меньше сканирования листа), +- **память** (больше блок → меньше узлов → меньше overhead). + +$*$ — `max_overhead`, вообще говоря, задаёт **целевую** верхнюю границу на overhead. При построении структура увеличивает размер листового блока `block_bits`, чтобы уложиться в этот лимит. Однако гарантировать выполнение ограничения не всегда возможно: +- `block_bits` ограничен сверху значением `min(n, 2^14)` *(введена как защита от неадекватно большого и на практике почти никогда не нужного размера блока)*, поэтому при слишком строгом лимите может не существовать допустимого размера блока; +- при очень малом `n` накладные расходы неизбежно велики, потому что дерево и массивы имеют "постоянную" составляющую, а `block_bits` нельзя уменьшать ниже 64; +- дополнительно overhead растёт из-за выравнивания числа листьев до степени двойки (`first_leaf_index = bit_ceil(leaf_count)`), из-за чего появляются **пустые слоты узлов** в heap-представлении (они занимают место в массивах, хотя фактически не покрывают биты). + +--- + +## 6. Тестирование корректности + +Тестирование построено по принципу "оптимизированная реализация должна совпадать с наивной" на большом количестве случайных сценариев. + +--- + +## 7. Бенчмаркинг производительности + +В проекте реализованы два бенчмарка: +- `src/bench_rmm.cpp` — бенчмарк реализации `RmMTree`, +- `src/bench_rmm_sdsl.cpp` — бенчмарк реализации Range Min–Max Tree из библиотеки [sdsl-lite](https://github.com/simongog/sdsl-lite). + +### 7.1. Генерация датасетов +- Используется генератор случайных ПСП; +- После чего для каждого `N` заранее подготавливаются пулы запросов: + - случайные индексы, + - случайные отрезки `[i, j]`, + - случайные `k` для `select`-операций, + - случайные `delta` для `fwdsearch/bwdsearch`, + - корректные позиции `'('` и `')'` для `close/open/enclose`, + - для `minselect` — заранее генерируется `q` в допустимом диапазоне (через `mincount`). + +### 7.2. Поддерживаемые параметры бенчмарков (как аргументы командной строки) +- `--min_exp`, `--max_exp` — диапазон размеров `N = 2^e`, +- `--per_octave` — сколько дополнительных точек между степенями двойки, +- `--explicit_sizes` — явный список размеров, +- `--Q` — число запросов (сэмплов), +- `--p1` — параметр генератора, +- `--seed` — seed, +- `--block_bits` — размер листа (0 = auto). + +--- + +## 8. Визуализация результатов + +Реализован скрипт, читающий JSON-выгрузку Google Benchmark и строящий графики: +- по оси X — `N` (размер последовательности), +- по оси Y — `cpu_time` (нс на операцию), +- отдельно по каждой операции, +- опционально сравнивает `RmMTree` и `sdsl-lite` на одном графике, +- может включить `--logx` и медианное сглаживание `--smooth`. + +--- + +## 9. Практические итоги и наблюдения + +### 9.1. Корректность +- Наличие `NaiveRmM` позволило сделать строгую проверку "на каждом шаге" для большого количества случайных случаев. +- Полный перебор коротких строк (1..8) полезен для ловли ошибок на границах: + - пустые диапазоны, + - `i=0`, `i=num_bits`, + - различия 0-based / 1-based в `open/enclose/bwdsearch`, + - пересечения `"10"` на границах слов и блоков. + +### 9.2. Производительность +Заложено несколько уровней ускорения: +- агрегаты снижают работу на больших диапазонах до $O(\log n)$, +- LUT8 ускоряет "мелкие" сканы и листовые операции без тяжёлых подготовок, +- AVX2 ускоряет поиски по префиксным суммам на длинных хвостах. + +Самая красивая особенность этой реализации заключается в том, что в таких структурах часто именно "листья" становятся bottleneck-ом, но LUT8 и AVX2 позволяют получить немалый практический выигрыш в скорости работы на этих "листьях", а параметр `block_bits` позволяет подстроить баланс "дерево vs локальное сканирование". + +--- + +## 10. Роадмап дальнейшей работы + +1) **BP-обёртка**: + - `parent`, `isancestor`, `depth`, `lca` и т.п. поверх `close/open/enclose`. +2) **AoS**: + - подумать над слиянием некоторых массивов в Array-of-Structs для улучшения кэш-локальности. +3) **Cуперблоки**: + - объединить несколько соседних листьев в "суперблок" и строить дерево только по суперблокам, а внутри каждого суперблока хранить компактный локальный индекс (например, массив агрегатов по листьям или микро-структуру). Это позволило бы "срезать" несколько нижних уровней дерева, уменьшив число узлов, объём метаданных и количество обращений к памяти при запросах (особенно в `fwdsearch/bwdsearch` и `range min/max`). + +--- + +## 11. Заключение + +В итоге получилась завершённая реализация RmMTree, пригодная для дальнейшего развития в сторону полнофункциональной поддержки succinct-структур над BP. diff --git a/include/bits.h b/include/bits.h index c7ee26d..7dbce3c 100644 --- a/include/bits.h +++ b/include/bits.h @@ -4,7 +4,9 @@ #include #include +#include #include +#include #include #if defined(__AVX512VPOPCNTDQ__) && defined(__AVX512F__) && \ @@ -17,7 +19,7 @@ // Lookup table for 4-bit popcount // This table maps each 4-bit value (0-15) to its population count // clang-format off -const __m256i lookup_popcount_4 = _mm256_setr_epi8( +static inline const __m256i lookup_popcount_4 = _mm256_setr_epi8( 0, 1, 1, 2, // 0000, 0001, 0010, 0011 1, 2, 2, 3, // 0100, 0101, 0110, 0111 1, 2, 2, 3, // 1000, 1001, 1010, 1011 @@ -30,7 +32,7 @@ const __m256i lookup_popcount_4 = _mm256_setr_epi8( 2, 3, 3, 4 // 1100, 1101, 1110, 1111 ); -const __m256i mask_first_half = _mm256_setr_epi8( +static inline const __m256i mask_first_half = _mm256_setr_epi8( 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, @@ -405,108 +407,3 @@ void rank_32x8(const uint8_t* x, uint8_t* result) { } #endif } - -/** - * @brief Efficiently searches for the first occurrence of a 16-bit value in - * the range [@p begin, @p end_excl) using AVX2 when available. - * @details Loads 16 consecutive int16_t elements (256 bits) per iteration. - * Compares them against the @p target value using vectorized equality. - * If any match is found, extracts the index of the first matching lane from - * the comparison mask. Falls back to a scalar tail loop for leftover - * elements, or to a fully scalar search if AVX2 is not supported. - * @returns The index of the first match, or @p npos if the value is not found. - */ -static inline size_t find_forward_equal_i16_avx2(const int16_t* arr, - const size_t& begin, - const size_t& end_excl, - const int16_t& target, - const size_t& npos) noexcept { -#ifdef PIXIE_AVX2_SUPPORT - static constexpr size_t STEP = 16; - __m256i vtarget = _mm256_set1_epi16(target); - size_t i = begin; - size_t n = end_excl; - for (; i + STEP <= n; i += STEP) { - unsigned mask = _mm256_movemask_epi8(_mm256_cmpeq_epi16( - _mm256_loadu_si256(reinterpret_cast(arr + i)), - vtarget)); - if (mask) { - return i + (std::countr_zero(mask) >> 1); - } - } - for (; i < n; ++i) { - if (arr[i] == target) { - return i; - } - } -#else - for (size_t i = begin; i < end_excl; ++i) { - if (arr[i] == target) { - return i; - } - } -#endif - return npos; -} - -/** - * @brief Performs a backward search for a 16-bit value in a given range. - * @details Scans the array segment [@p begin .. @p end_incl] from right to - * left. - * If AVX2 is available, processes data in 256-bit blocks (16 × int16_t) using - * vectorized equality comparison for higher throughput. Falls back to a - * scalar backward scan when AVX2 is not supported. Returns the index of the - * rightmost occurrence of @p target, or @p npos if no match is found. - */ -static inline size_t find_backward_equal_i16_avx2(const int16_t* arr, - const size_t& begin, - const size_t& end_incl, - const int16_t& target, - const size_t& npos) noexcept { - if (begin > end_incl) { - return npos; - } -#ifdef PIXIE_AVX2_SUPPORT - static constexpr size_t STEP = 16; - size_t len = end_incl + 1 - begin; - size_t nblocks = len / STEP; - __m256i vtarget = _mm256_set1_epi16(target); - if (nblocks > 0) { - size_t first_block = begin + (len % STEP); - for (size_t p = first_block + (nblocks - 1) * STEP;;) { - unsigned mask = _mm256_movemask_epi8(_mm256_cmpeq_epi16( - _mm256_loadu_si256(reinterpret_cast(arr + p)), - vtarget)); - if (mask) { - return p + ((31u - std::countl_zero(mask)) >> 1); - } - if (p == first_block) { - break; - } - p -= STEP; - } - - for (size_t i = first_block; i > begin;) { - --i; - if (arr[i] == target) { - return i; - } - } - } else { - for (size_t i = end_incl + 1; i > begin;) { - --i; - if (arr[i] == target) { - return i; - } - } - } -#else - for (size_t i = end_incl + 1; i > begin;) { - --i; - if (arr[i] == target) { - return i; - } - } -#endif - return npos; -} diff --git a/include/rmm_tree.h b/include/rmm_tree.h index 30b2ad9..c453ef6 100644 --- a/include/rmm_tree.h +++ b/include/rmm_tree.h @@ -72,16 +72,10 @@ class RmMTree { std::vector node_pattern10_count; // node_first_bit = first bit (0/1), node_last_bit = last bit (0/1) of the - // segment (to handle "10" crossing) both needed for: rank10, select10. + // segment (to handle "10" crossing) + // both needed for: rank10, select10. std::vector node_first_bit, node_last_bit; - // leaf_prefix[k * leaf_prefix_stride + j] = - // excess(block_begin + j) - excess(block_begin), j in [0..leaf_size], - // block_begin = k * block_bits. needed for: - // fwdsearch/bwdsearch/close/open/enclose - std::vector leaf_prefix; - size_t leaf_prefix_stride = 0; - public: /** * @brief Sentinel for "not found". @@ -336,23 +330,43 @@ class RmMTree { } int remaining_delta = delta - leaf_delta; - if (leaf_block_index + 1 < leaf_count) { - size_t nodes_buffer[64]; - const size_t node_count = cover_blocks_collect( - leaf_block_index + 1, leaf_count - 1, nodes_buffer); - size_t segment_base = (leaf_block_index + 1) * block_bits; - for (size_t j = 0; j < node_count; ++j) { - const size_t node_index = nodes_buffer[j]; - if (remaining_delta == 0) { - return segment_base; - } - if (node_min_prefix_excess[node_index] <= remaining_delta && - remaining_delta <= node_max_prefix_excess[node_index]) { - return descend_fwd(node_index, remaining_delta, segment_base); + size_t segment_base = block_end; + if (remaining_delta == 0) { + return segment_base; + } + + // Tree-walk to the right: + // go up; whenever we come from a left child, try the right sibling subtree. + // If target is inside sibling -> descend; else skip it and continue up. + size_t node_index = leaf_index_of(block_begin); + const size_t tree_size = segment_size_bits.size() - 1; + + // If we are already at/after the last leaf boundary, there's nothing to + // scan. + if (segment_base >= num_bits || leaf_block_index + 1 >= leaf_count) { + return npos; + } + + while (node_index > 1) { + const bool is_left_child = ((node_index & 1u) == 0u); + if (is_left_child) { + const size_t sibling = node_index | 1u; // right sibling + if (sibling <= tree_size && segment_size_bits[sibling]) { + // Boundary at sibling start already handled above via + // remaining_delta==0. + if (node_min_prefix_excess[sibling] <= remaining_delta && + remaining_delta <= node_max_prefix_excess[sibling]) { + return descend_fwd(sibling, remaining_delta, segment_base); + } + // Skip whole sibling subtree. + remaining_delta -= node_total_excess[sibling]; + segment_base += segment_size_bits[sibling]; + if (remaining_delta == 0) { + return segment_base; + } } - remaining_delta -= node_total_excess[node_index]; - segment_base += segment_size_bits[node_index]; } + node_index >>= 1; } return npos; } @@ -1267,6 +1281,203 @@ class RmMTree { return LUT8_ALL().bwd_pos; } + /** + * @brief Extract 16 bits starting at @p position (LSB-first across words). + */ + inline uint16_t get_u16(const size_t& position) const noexcept { + const size_t word_index = position >> 6; + const unsigned offset = unsigned(position & 63); + const std::uint64_t w0 = + (word_index < bits.size()) ? bits[word_index] : 0ULL; + if (offset == 0) { + return uint16_t(w0 & 0xFFFFu); + } + const std::uint64_t w1 = + (word_index + 1 < bits.size()) ? bits[word_index + 1] : 0ULL; + const std::uint64_t v = (w0 >> offset) | (w1 << (64u - offset)); + return uint16_t(v & 0xFFFFu); + } + +#if defined(PIXIE_AVX2_SUPPORT) + static inline __m256i bit_masks_16x() noexcept { + // 16 lanes: (1<<0), (1<<1), ... (1<<15) + return _mm256_setr_epi16(0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, + 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, + 0x1000, 0x2000, 0x4000, (int16_t)0x8000); + } + + static inline __m256i prefix_sum_16x_i16(__m256i v) noexcept { + // Inclusive prefix sum within 128-bit lanes, then fix carry into the high + // lane. + __m256i x = v; + __m256i t = _mm256_slli_si256(x, 2); + x = _mm256_add_epi16(x, t); + t = _mm256_slli_si256(x, 4); + x = _mm256_add_epi16(x, t); + t = _mm256_slli_si256(x, 8); + x = _mm256_add_epi16(x, t); + + __m128i lo = _mm256_extracti128_si256(x, 0); + __m128i hi = _mm256_extracti128_si256(x, 1); + const int16_t carry = + (int16_t)_mm_extract_epi16(lo, 7); // sum of first 8 elems + hi = _mm_add_epi16(hi, _mm_set1_epi16(carry)); + + __m256i out = _mm256_castsi128_si256(lo); + out = _mm256_inserti128_si256(out, hi, 1); + return out; + } + + static inline int16_t last_prefix_16x_i16(__m256i pref) noexcept { + __m128i hi = _mm256_extracti128_si256(pref, 1); + return (int16_t)_mm_extract_epi16(hi, 7); // lane 15 + } + + /** + * @brief SIMD forward scan: find first bit position p in [start,end) where + * sum_{k=start..p} step(k) == required_delta. + * @param out_total If not null and not found, receives total sum on + * [start,end). + */ + inline size_t scan_leaf_fwd_simd(const size_t& start, + const size_t& end, + const int& required_delta, + int* out_total) const noexcept { + if (start >= end) { + if (out_total) { + *out_total = 0; + } + return npos; + } + if (required_delta < -32768 || required_delta > 32767) { + if (out_total) { + const int len = int(end - start); + const int ones = int(rank1_in_block(start, end)); + *out_total = ones * 2 - len; + } + return npos; + } + + static const __m256i masks = bit_masks_16x(); + static const __m256i vzero = _mm256_setzero_si256(); + static const __m256i vallones = _mm256_cmpeq_epi16(vzero, vzero); + static const __m256i vminus1 = _mm256_set1_epi16(-1); + static const __m256i vtwo = _mm256_set1_epi16(2); + const __m256i vtarget = _mm256_set1_epi16((int16_t)required_delta); + + int cur = 0; + size_t pos = start; + while (pos + 16 <= end) { + const uint16_t bits16 = get_u16(pos); + const __m256i vb = _mm256_set1_epi16((int16_t)bits16); + const __m256i m = _mm256_and_si256(vb, masks); + const __m256i is_zero = _mm256_cmpeq_epi16(m, vzero); + const __m256i is_set = _mm256_andnot_si256(is_zero, vallones); + const __m256i steps = + _mm256_add_epi16(vminus1, _mm256_and_si256(is_set, vtwo)); + + const __m256i pref_rel = prefix_sum_16x_i16(steps); + const __m256i base = _mm256_set1_epi16((int16_t)cur); + const __m256i pref = _mm256_add_epi16(pref_rel, base); + const __m256i cmp = _mm256_cmpeq_epi16(pref, vtarget); + const uint32_t mask = (uint32_t)_mm256_movemask_epi8(cmp); + if (mask) { + const int lane = int(std::countr_zero(mask)) >> 1; + return pos + (size_t)lane; + } + cur += (int)last_prefix_16x_i16(pref_rel); + pos += 16; + } + while (pos < end) { + cur += bit(pos) ? +1 : -1; + if (cur == required_delta) { + return pos; + } + ++pos; + } + if (out_total) { + *out_total = cur; + } + return npos; + } +#endif // PIXIE_AVX2_SUPPORT + + /** + * @brief Faster small-delta forward scan for tiny tails (LUT8 + direct byte + * loads). + * @details This avoids the heavy AVX2 prefix-sum setup cost when the tail is + * small. Requires little-endian layout (x86_64). + */ + inline size_t scan_leaf_fwd_lut8_fast(const size_t& start, + const size_t& end, + const int& required_delta, + int* out_total) const noexcept { + if (start >= end) { + if (out_total) { + *out_total = 0; + } + return npos; + } + int cur = 0; + + size_t pos = start; + while (pos < end && (pos & 7)) { + cur += bit(pos) ? +1 : -1; + if (cur == required_delta) { + if (out_total) { + *out_total = cur; + } + return pos; + } + ++pos; + } + + // Byte-aligned fast path: read bytes directly. + // bits are LSB-first; on little-endian x86 the in-memory byte order matches + // bit groups [pos..pos+7]. + const uint8_t* bytep = + reinterpret_cast(bits.data()) + (pos >> 3); + const auto& agg = LUT8(); + const auto& fwd = LUT8_FWD_POS(); + + while (pos + 8 <= end) { + const uint8_t bv = *bytep++; + const auto& a = agg[bv]; + const int need = required_delta - cur; + // need must be in [-8..8] for a match inside one byte. + if ((unsigned)(need + 8) <= 16u) { + // min/max pruning first, then position lookup. + if (need >= a.min_prefix && need <= a.max_prefix) { + const int8_t off = fwd[bv][need + 8]; + if (off >= 0) { + if (out_total) { + *out_total = cur + a.excess_total; // not exact end, but caller + // only uses when not found + } + return pos + (size_t)off; + } + } + } + cur += a.excess_total; + pos += 8; + } + + while (pos < end) { + cur += bit(pos) ? +1 : -1; + if (cur == required_delta) { + if (out_total) { + *out_total = cur; + } + return pos; + } + ++pos; + } + if (out_total) { + *out_total = cur; + } + return npos; + } + /** * @brief Forward search within a single leaf over the range [@p search_start, * @p search_end). @@ -1324,8 +1535,33 @@ class RmMTree { const size_t& block_end, const int& required_delta, const bool& allow_right_boundary, - const size_t& global_right_border) const noexcept { - if (block_begin >= block_end) { + const size_t& global_right_border, + const int& prefix_at_boundary_max /*=kNoPrefixOverride*/) const noexcept { + // We scan bits in [block_begin, block_end) and look for the LAST boundary + // where prefix == required_delta, with optional exclusion of the right + // boundary. + if (block_begin > block_end) { + return npos; + } + + // Maximum allowed boundary (inclusive) inside this scan. + // If right boundary is forbidden, we forbid boundary == block_end. + size_t boundary_max = block_end; + if (!allow_right_boundary && boundary_max > block_begin) { + --boundary_max; + } + + // No bits to scan -> only possible answer is the left boundary. + if (block_begin >= boundary_max) { + if ((block_begin < global_right_border || allow_right_boundary) && + required_delta == 0) { + return block_begin; + } + return npos; + } + + if (required_delta < -32768 || required_delta > 32767) { + // Just in case. Should be impossible. if ((block_begin < global_right_border || allow_right_boundary) && required_delta == 0) { return block_begin; @@ -1333,22 +1569,100 @@ class RmMTree { return npos; } - const int16_t* leaf_prefix_ptr = - &leaf_prefix[block_of(block_begin) * leaf_prefix_stride]; - const size_t end_inclusive = block_end - block_begin; - if (end_inclusive > 0) { - const size_t position = ::find_backward_equal_i16_avx2( - leaf_prefix_ptr, 1, end_inclusive, required_delta, npos); - if (position != npos) { - return block_begin + position; +#if defined(PIXIE_AVX2_SUPPORT) + // Fast reverse scan with early exit: + // find the RIGHTMOST boundary j in (block_begin..boundary_max] such that + // prefix(j) == required_delta. + + // prefix_end = prefix(boundary_max) relative to block_begin + int prefix_end = prefix_at_boundary_max; + if (prefix_end == kNoPrefixOverride) { + const int len = int(boundary_max - block_begin); + const int ones = int(rank1_in_block(block_begin, boundary_max)); + prefix_end = ones * 2 - len; + } + + const __m256i masks = bit_masks_16x(); + const __m256i vzero = _mm256_setzero_si256(); + const __m256i vallones = _mm256_cmpeq_epi16(vzero, vzero); + const __m256i vminus1 = _mm256_set1_epi16(-1); + const __m256i vtwo = _mm256_set1_epi16(2); + const __m256i vtarget = _mm256_set1_epi16((int16_t)required_delta); + + size_t pos_end = boundary_max; // boundary (not bit index) + int cur_end = prefix_end; // prefix(pos_end) + + // Vector chunks: process 16 bits ending at pos_end. + while (pos_end >= block_begin + 16) { + const size_t pos = pos_end - 16; // bit index of the chunk start + const uint16_t bits16 = get_u16(pos); + const __m256i vb = _mm256_set1_epi16((int16_t)bits16); + const __m256i m = _mm256_and_si256(vb, masks); + const __m256i is_zero = _mm256_cmpeq_epi16(m, vzero); + const __m256i is_set = _mm256_andnot_si256(is_zero, vallones); + const __m256i steps = + _mm256_add_epi16(vminus1, _mm256_and_si256(is_set, vtwo)); + + const __m256i pref_rel = + prefix_sum_16x_i16(steps); // prefix after each bit (relative) + const int16_t sum16 = + last_prefix_16x_i16(pref_rel); // total sum on this 16-bit chunk + const int cur_start = + cur_end - (int)sum16; // prefix at boundary pos (chunk start) + + const __m256i base = _mm256_set1_epi16((int16_t)cur_start); + const __m256i pref = _mm256_add_epi16( + pref_rel, base); // prefix at boundaries (pos+1..pos+16) + const __m256i cmp = _mm256_cmpeq_epi16(pref, vtarget); + const uint32_t mask = (uint32_t)_mm256_movemask_epi8(cmp); + if (mask) { + const int bit_i = 31 - int(std::countl_zero(mask)); + const int lane = bit_i >> 1; + const size_t boundary = pos + (size_t)lane + 1; + if (boundary < global_right_border || allow_right_boundary) { + return boundary; + } + return npos; + } + + cur_end = cur_start; + pos_end = pos; + } + + while (pos_end > block_begin) { + // boundary pos_end corresponds to prefix cur_end + if (cur_end == required_delta) { + if (pos_end < global_right_border || allow_right_boundary) { + return pos_end; + } + return npos; + } + const size_t bit_pos = pos_end - 1; + cur_end -= bit(bit_pos) ? +1 : -1; // move one bit to the left + pos_end = bit_pos; + } +#else + size_t last_boundary = npos; + int cur = 0; + for (size_t pos = block_begin; pos < boundary_max; ++pos) { + cur += bit(pos) ? +1 : -1; + if (cur == required_delta) { + last_boundary = pos + 1; } } + if (last_boundary != npos) { + if (last_boundary < global_right_border || allow_right_boundary) { + return last_boundary; + } + return npos; + } +#endif + // Left boundary (prefix == 0) is always the final candidate. if ((block_begin < global_right_border || allow_right_boundary) && required_delta == 0) { return block_begin; } - return npos; } @@ -1413,6 +1727,12 @@ class RmMTree { */ size_t first_leaf_index = 1; + /** + * @brief Sentinel to tell scan_leaf_bwd(): "prefix at right boundary not + * provided". + */ + static constexpr int kNoPrefixOverride = std::numeric_limits::min(); + /** * @brief Block index containing position @p position. */ @@ -1489,10 +1809,13 @@ class RmMTree { node_index = right_child; } } - return scan_leaf_fwd( - segment_base, - std::min(segment_base + segment_size_bits[node_index], num_bits), - required_delta); + const size_t seg_end = + std::min(segment_base + segment_size_bits[node_index], num_bits); +#if defined(PIXIE_AVX2_SUPPORT) + return scan_leaf_fwd_simd(segment_base, seg_end, required_delta, nullptr); +#else + return scan_leaf_fwd(segment_base, seg_end, required_delta); +#endif } /** @@ -1550,12 +1873,26 @@ class RmMTree { return npos; } - return scan_leaf_bwd( - segment_base, - std::min( - global_right_border, - std::min(segment_base + segment_size_bits[node_index], num_bits)), - required_delta, allow_right_boundary, global_right_border); + const size_t seg_end = + std::min(segment_base + segment_size_bits[node_index], num_bits); + const size_t block_end = std::min(global_right_border, seg_end); + + int prefix_override = kNoPrefixOverride; + // If we scan the full leaf up to its end boundary, we know + // prefix(block_end) from node_total_excess[leaf]. If the right boundary is + // forbidden, we can still derive prefix(block_end-1) cheaply. + if (block_end == seg_end) { + const int total = node_total_excess[node_index]; + if (!allow_right_boundary && block_end > segment_base) { + prefix_override = total - (bit(block_end - 1) ? +1 : -1); + } else { + prefix_override = total; + } + } + + return scan_leaf_bwd(segment_base, block_end, required_delta, + allow_right_boundary, global_right_border, + prefix_override); } /** @@ -1792,8 +2129,7 @@ class RmMTree { const size_t& block_size_pow2) noexcept { static constexpr size_t AUX_SLOT_BYTES = sizeof(uint32_t) + sizeof(int32_t) + sizeof(int32_t) + sizeof(int32_t) + - sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint8_t) + - sizeof(uint8_t) + sizeof(int16_t); + sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint8_t); size_t bitvector_bytes = ceil_div(bit_count, 64) * 8; if (bitvector_bytes == 0) { @@ -2090,22 +2426,38 @@ class RmMTree { const int& delta, int& leaf_delta) const noexcept { const size_t leaf_length = segment_size_bits[first_leaf_index + leaf_index]; - const int16_t* leaf_prefix_ptr = - &leaf_prefix[leaf_index * leaf_prefix_stride]; - const size_t offset_in_leaf = start_position - leaf_block_begin; - if (offset_in_leaf > leaf_length) { + const size_t leaf_end = std::min(num_bits, leaf_block_begin + leaf_length); + if (start_position >= leaf_end) { leaf_delta = 0; return npos; } - const int16_t start_prefix = leaf_prefix_ptr[offset_in_leaf]; - size_t match_index = ::find_forward_equal_i16_avx2( - leaf_prefix_ptr, offset_in_leaf + 1, leaf_length + 1, - start_prefix + delta, npos); - if (match_index != npos) { - return leaf_block_begin + match_index - 1; +#if defined(PIXIE_AVX2_SUPPORT) + int total = 0; + // Heuristic: for tiny tails and tiny deltas, LUT8 wins because AVX2 setup + // is heavy... + // At least I think so... + const size_t tail_len = leaf_end - start_position; + size_t res = npos; + if (delta >= -8 && delta <= 8 && tail_len <= 256) { + res = scan_leaf_fwd_lut8_fast(start_position, leaf_end, delta, &total); + } else { + res = scan_leaf_fwd_simd(start_position, leaf_end, delta, &total); + } + if (res != npos) { + return res; } - leaf_delta = leaf_prefix_ptr[leaf_length] - start_prefix; + leaf_delta = total; return npos; +#else + const size_t res = scan_leaf_fwd(start_position, leaf_end, delta); + if (res != npos) { + return res; + } + const int len = int(leaf_end - start_position); + const int ones = int(rank1_in_block(start_position, leaf_end)); + leaf_delta = ones * 2 - len; + return npos; +#endif } /** @@ -2121,23 +2473,36 @@ class RmMTree { const size_t& start_position, const int& delta, int& leaf_delta) const noexcept { - const int16_t* leaf_prefix_ptr = - &leaf_prefix[leaf_index * leaf_prefix_stride]; - const size_t offset_in_leaf = start_position - leaf_block_begin; - if (offset_in_leaf > segment_size_bits[first_leaf_index + leaf_index]) { + const size_t leaf_length = segment_size_bits[first_leaf_index + leaf_index]; + const size_t leaf_end = std::min(num_bits, leaf_block_begin + leaf_length); + if (start_position < leaf_block_begin || start_position > leaf_end) { leaf_delta = 0; return npos; } - const int16_t start_prefix = leaf_prefix_ptr[offset_in_leaf]; - if (offset_in_leaf > 0) { - const size_t match_index = ::find_backward_equal_i16_avx2( - leaf_prefix_ptr, 0, offset_in_leaf - 1, start_prefix + delta, npos); - if (match_index != npos) { - return leaf_block_begin + match_index; - } + + // leaf_delta = excess(start_position) - excess(leaf_block_begin) + // = 2*rank1([leaf_begin, start)) - (start - leaf_begin) + const int len = int(start_position - leaf_block_begin); + const int ones = int(rank1_in_block(leaf_block_begin, start_position)); + leaf_delta = ones * 2 - len; + + // Must find a boundary strictly < start_position (so + // boundary==start_position forbidden). + if (start_position == leaf_block_begin) { + return npos; } - leaf_delta = start_prefix; - return npos; + const int target_prefix = leaf_delta + delta; + return scan_leaf_bwd( + leaf_block_begin, + start_position, // do not look to the right of start_position + target_prefix, + false, // right boundary (=start_position) forbidden + start_position, + // prefix at boundary_max = start_position-1: + // leaf_delta is prefix at start_position, subtract last step + (start_position > leaf_block_begin + ? (leaf_delta - (bit(start_position - 1) ? +1 : -1)) + : 0)); } /** @@ -2280,8 +2645,6 @@ class RmMTree { node_pattern10_count.assign(tree_size + 1, 0); node_first_bit.assign(tree_size + 1, 0); node_last_bit.assign(tree_size + 1, 0); - leaf_prefix_stride = block_bits + 1; - leaf_prefix.assign(leaf_count * leaf_prefix_stride, 0); // leaves for (size_t leaf_block_index = 0; leaf_block_index < leaf_count; @@ -2367,15 +2730,6 @@ class RmMTree { (segment_size_bits[leaf_node_index] == 0 ? 0 : max_value); node_min_count[leaf_node_index] = min_count; node_pattern10_count[leaf_node_index] = (uint32_t)pattern10_count; - int16_t* leaf_prefix_ptr = - &leaf_prefix[leaf_block_index * leaf_prefix_stride]; - int16_t prefix_accumulator = 0; - leaf_prefix_ptr[0] = 0; - for (size_t position_in_leaf = segment_begin, prefix_index = 1; - position_in_leaf < segment_end; ++position_in_leaf, ++prefix_index) { - prefix_accumulator += bit(position_in_leaf) ? 1 : -1; - leaf_prefix_ptr[prefix_index] = prefix_accumulator; - } } // internal nodes for (size_t node_index = first_leaf_index - 1; node_index >= 1; diff --git a/misc/plot_rmm.py b/misc/plot_rmm.py index 78e7beb..ce88c99 100644 --- a/misc/plot_rmm.py +++ b/misc/plot_rmm.py @@ -1,14 +1,17 @@ """ -Plot RmMTree benchmark results. +Plot RmMTree benchmark results (optionally compared with sdsl-lite). + +This script reads a JSON produced by `bench_rmm` and, optionally, a JSON +with benchmarks for sdsl-lite. For each operation, it draws a scatter plot +of individual points and a trend line (optionally median-smoothed) for each +implementation on the same figure. -This script reads a JSON produced by `bench_rmm` and, -for each operation, draws a scatter plot of individual points -and a trend line (optionally median-smoothed). Plots are saved as PNG files and can also be shown interactively. Examples: python3 plot_rmm.py rmm_bench.json --save-dir plots --logx --smooth 3 python3 plot_rmm.py rmm_bench.json --show + python3 plot_rmm.py rmm_bench.json --sdsl-json rmm_bench_sdsl.json --save-dir plots --logx --smooth 3 """ import argparse @@ -39,23 +42,49 @@ ] +def load_bench_json(path: str) -> pd.DataFrame: + with open(path, "r", encoding="utf-8") as f: + data = json.load(f) + if isinstance(data, dict) and "benchmarks" in data: + data = data["benchmarks"] + df = pd.DataFrame(data) + required = {"name", "cpu_time", "N"} + if not required.issubset(df.columns): + missing = ", ".join(sorted(required - set(df.columns))) + raise ValueError( + f"{path!r}: missing required columns in benchmark JSON: {missing}" + ) + df = df.dropna(subset=["cpu_time", "N"]) + return df + + def main(): ap = argparse.ArgumentParser( description=( - "Read a JSON with RmMTree benchmark results and plot time per operation " - "versus sequence size N for each operation." + "Read JSON with RmMTree benchmark results (and optionally " + "sdsl-lite results) and plot time per operation versus " + "sequence size N for each operation." ), epilog=( "Examples:\n" " python3 plot_rmm.py rmm_bench.json --save-dir plots --logx --smooth 3\n" - " python3 plot_rmm.py rmm_bench.json --show" + " python3 plot_rmm.py rmm_bench.json --show\n" + " python3 plot_rmm.py rmm_bench.json --sdsl-json rmm_bench_sdsl.json --save-dir plots --logx --smooth 3" ), formatter_class=argparse.RawDescriptionHelpFormatter, ) ap.add_argument( "json", metavar="JSON", - help="Path to the JSON file with results (output of bench_rmm).", + help="Path to the JSON file with RmMTree results (output of bench_rmm).", + ) + ap.add_argument( + "--sdsl-json", + metavar="SDSL_JSON", + help=( + "Path to JSON file with sdsl-lite benchmark results. " + "If provided, both implementations will be plotted on the same graphs." + ), ) ap.add_argument( "--save-dir", @@ -94,38 +123,97 @@ def main(): args = ap.parse_args() os.makedirs(args.save_dir, exist_ok=True) - with open(args.json, "r", encoding="utf-8") as f: - data = json.load(f) - if isinstance(data, dict) and "benchmarks" in data: - data = data["benchmarks"] - df = pd.DataFrame(data) - df = df.dropna(subset=["cpu_time", "N"]) - for op in OPS_ORDER: - d = df[df["name"] == op].copy() - if d.empty: + df_rmm = load_bench_json(args.json) + df_sdsl = None + if args.sdsl_json is not None: + df_sdsl = load_bench_json(args.sdsl_json) + + rmm_ops = set(df_rmm["name"].dropna().astype(str).unique()) + if df_sdsl is not None: + sdsl_ops = set(df_sdsl["name"].dropna().astype(str).unique()) + common_ops = rmm_ops & sdsl_ops + ops_to_plot = [op for op in OPS_ORDER if op in common_ops] + extra = sorted(common_ops - set(OPS_ORDER)) + ops_to_plot.extend(extra) + else: + ops_to_plot = OPS_ORDER + + for op in ops_to_plot: + d_rmm = df_rmm[df_rmm["name"] == op].copy() + d_sdsl = None + if df_sdsl is not None: + d_sdsl = df_sdsl[df_sdsl["name"] == op].copy() + if d_rmm.empty or (df_sdsl is not None and (d_sdsl is None or d_sdsl.empty)): continue - d = d.groupby("N", as_index=False)["cpu_time"].median().sort_values("N") + plt.figure() - yplot = d["cpu_time"] - if args.smooth and args.smooth > 1: - d["ns_smooth"] = ( - d["cpu_time"] - .rolling(window=args.smooth, center=True, min_periods=1) - .median() + if not d_rmm.empty: + d_rmm = ( + d_rmm.groupby("N", as_index=False)["cpu_time"].median().sort_values("N") + ) + y_rmm = d_rmm["cpu_time"] + if args.smooth and args.smooth > 1: + d_rmm["ns_smooth"] = ( + d_rmm["cpu_time"] + .rolling(window=args.smooth, center=True, min_periods=1) + .median() + ) + y_rmm = d_rmm["ns_smooth"] + plt.scatter( + d_rmm["N"], + d_rmm["cpu_time"], + s=8, + alpha=0.3, + linewidths=0, + ) + plt.plot( + d_rmm["N"], + y_rmm, + linewidth=1.5, + label="RmMTree", ) - yplot = d["ns_smooth"] - plt.figure() - plt.scatter(d["N"], d["cpu_time"], s=8, alpha=0.3, linewidths=0) - plt.plot(d["N"], yplot, linewidth=1.5) + if d_sdsl is not None and not d_sdsl.empty: + d_sdsl = ( + d_sdsl.groupby("N", as_index=False)["cpu_time"] + .median() + .sort_values("N") + ) + y_sdsl = d_sdsl["cpu_time"] + if args.smooth and args.smooth > 1: + d_sdsl["ns_smooth"] = ( + d_sdsl["cpu_time"] + .rolling(window=args.smooth, center=True, min_periods=1) + .median() + ) + y_sdsl = d_sdsl["ns_smooth"] + plt.scatter( + d_sdsl["N"], + d_sdsl["cpu_time"], + s=8, + alpha=0.3, + linewidths=0.9, + marker="x", + ) + plt.plot( + d_sdsl["N"], + y_sdsl, + linewidth=1.5, + linestyle="--", + label="sdsl-lite", + ) if args.logx: plt.xscale("log", base=2) plt.xlabel("Sequence size N (bits)") plt.ylabel("Time per operation, ns") - plt.title(f"RmMTree: {op}") + if d_sdsl is not None and not d_sdsl.empty: + plt.title(f"RmMTree vs sdsl-lite: {op}") + else: + plt.title(f"RmMTree: {op}") plt.grid(True, which="both", linestyle="--", alpha=0.4) + plt.legend(loc="best") out = os.path.join(args.save_dir, f"{op}.png") plt.savefig(out, bbox_inches="tight", dpi=160) if not args.show: diff --git a/src/bench_rmm.cpp b/src/bench_rmm.cpp index 523516e..a15866e 100644 --- a/src/bench_rmm.cpp +++ b/src/bench_rmm.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -32,66 +33,96 @@ struct Args { static Args args; static void parse_args_and_strip(int& argc, char**& argv) { - auto getv = [&](const std::string& key) -> std::optional { - std::string pref = "--" + key + "="; - for (int i = 1; i < argc; ++i) { - std::string s = argv[i]; - if (s.rfind(pref, 0) == 0) { - return s.substr(pref.size()); + auto is_value_token = [&](int candidate_index) -> bool { + if (candidate_index >= argc) { + return false; + } + std::string candidate = argv[candidate_index]; + return !(candidate.size() >= 2 && candidate[0] == '-' && + candidate[1] == '-'); + }; + + auto get_value = [&](const std::string& key) -> std::optional { + std::string prefix = "--" + key; + std::string prefix_with_equals = prefix + "="; + for (int argument_index = 1; argument_index < argc; ++argument_index) { + std::string argument = argv[argument_index]; + if (argument.rfind(prefix_with_equals, 0) == 0) { + return argument.substr(prefix_with_equals.size()); + } + if (argument == prefix && is_value_token(argument_index + 1)) { + return std::string(argv[argument_index + 1]); } } return std::nullopt; }; + auto strip = [&](const std::string& key) { - std::string pref = "--" + key + "="; - int w = 0; - for (int i = 0; i < argc; ++i) { - if (i > 0 && std::string(argv[i]).rfind(pref, 0) == 0) { + std::string prefix = "--" + key; + std::string prefix_with_equals = prefix + "="; + int write_index = 0; + for (int argument_index = 0; argument_index < argc; ++argument_index) { + std::string argument = argv[argument_index]; + if (argument_index > 0 && argument.rfind(prefix_with_equals, 0) == 0) { + continue; + } + if (argument_index > 0 && argument == prefix && + is_value_token(argument_index + 1)) { + ++argument_index; continue; } - argv[w++] = argv[i]; + argv[write_index++] = argv[argument_index]; } - argc = w; + argc = write_index; }; - if (auto v = getv("min-exp")) { - args.min_exp = std::stoi(*v), strip("min-exp"); + if (auto argument_value = get_value("min_exp")) { + args.min_exp = std::stoi(*argument_value); + strip("min_exp"); } - if (auto v = getv("max-exp")) { - args.max_exp = std::stoi(*v), strip("max-exp"); + if (auto argument_value = get_value("max_exp")) { + args.max_exp = std::stoi(*argument_value); + strip("max_exp"); } - if (auto v = getv("q")) { - args.Q = std::stoull(*v), strip("q"); + if (auto argument_value = get_value("Q")) { + args.Q = std::stoull(*argument_value); + strip("Q"); } - if (auto v = getv("p")) { - args.p1 = std::stod(*v), strip("p"); + if (auto argument_value = get_value("p1")) { + args.p1 = std::stod(*argument_value); + strip("p1"); } - if (auto v = getv("seed")) { - args.seed = std::stoull(*v), strip("seed"); + if (auto argument_value = get_value("seed")) { + args.seed = std::stoull(*argument_value); + strip("seed"); } - if (auto v = getv("block")) { - args.block_bits = std::stoull(*v), strip("block"); + if (auto argument_value = get_value("block_bits")) { + args.block_bits = std::stoull(*argument_value); + strip("block_bits"); } - if (auto v = getv("per-octave")) { - args.per_octave = std::stoi(*v), strip("per-octave"); + if (auto argument_value = get_value("per_octave")) { + args.per_octave = std::stoi(*argument_value); + strip("per_octave"); } - if (auto v = getv("sizes")) { - std::string s = *v; - strip("sizes"); - std::size_t pos = 0; - while (pos < s.size()) { - while (pos < s.size() && - (s[pos] == ',' || std::isspace((unsigned char)s[pos]))) { - ++pos; + if (auto argument_value = get_value("explicit_sizes")) { + std::string sizes_text = *argument_value; + strip("explicit_sizes"); + std::size_t position = 0; + while (position < sizes_text.size()) { + while (position < sizes_text.size() && + (sizes_text[position] == ',' || + std::isspace(static_cast(sizes_text[position])))) { + ++position; } - std::size_t start = pos; - while (pos < s.size() && std::isdigit((unsigned char)s[pos])) { - ++pos; + std::size_t start_index = position; + while (position < sizes_text.size() && + std::isdigit(static_cast(sizes_text[position]))) { + ++position; } - if (start < pos) { - args.explicit_sizes.push_back( - std::stoull(s.substr(start, pos - start))); + if (start_index < position) { + args.explicit_sizes.push_back(std::stoull( + sizes_text.substr(start_index, position - start_index))); } } std::sort(args.explicit_sizes.begin(), args.explicit_sizes.end()); @@ -101,14 +132,51 @@ static void parse_args_and_strip(int& argc, char**& argv) { } } -static std::string make_random_bits(std::size_t N, +static std::string random_dyck_bits(std::size_t N, double p1, std::mt19937_64& rng) { - std::uniform_real_distribution U(0.0, 1.0); + std::size_t pairs = N >> 1; + std::size_t L = pairs << 1; std::string s; - s.resize(N); - for (std::size_t i = 0; i < N; ++i) { - s[i] = (U(rng) < p1 ? '1' : '0'); + s.resize(L); + if (L == 0) { + return s; + } + p1 = std::clamp(p1, 0., 1.); + std::size_t opens_left = pairs; + std::size_t closes_left = pairs; + int balance = 0; + std::uniform_real_distribution U(0., 1.); + for (std::size_t i = 0; i < L; ++i) { + if (opens_left == 0) { + s[i] = '0'; + --closes_left; + --balance; + continue; + } + + if (closes_left == 0) { + s[i] = '1'; + --opens_left; + ++balance; + continue; + } + + if (balance == 0) { + s[i] = '1'; + --opens_left; + ++balance; + continue; + } + if (U(rng) < p1) { + s[i] = '1'; + --opens_left; + ++balance; + } else { + s[i] = '0'; + --closes_left; + --balance; + } } return s; } @@ -161,6 +229,10 @@ static std::vector build_size_grid() { struct Pools { std::vector inds_any; + std::vector rank10_end_positions; + std::vector open_positions_zero_based; + std::vector open_positions_one_based; + std::vector close_positions_one_based; std::vector inds; std::vector inds_1N; std::vector deltas; @@ -199,8 +271,9 @@ static Dataset build_dataset(std::size_t N) { static_cast(N) * 0x9E3779B185EBCA87ull); Dataset d; - d.N = N; - d.bits = make_random_bits(N, args.p1, rng); + d.bits = random_dyck_bits(N, args.p1, rng); + d.N = d.bits.size(); + N = d.N; if (args.block_bits == 0) { d.t = RmMTree(d.bits, 0, -1.0f); @@ -208,22 +281,42 @@ static Dataset build_dataset(std::size_t N) { d.t = RmMTree(d.bits, args.block_bits); } - d.cnt1 = d.t.rank1(N); + d.cnt1 = std::count(d.bits.begin(), d.bits.end(), '1'); d.cnt0 = N - d.cnt1; d.cnt10 = count10(d.bits); - const std::size_t L = std::min(args.Q, 32768); + std::vector open_positions_zero_based; + std::vector open_positions_one_based; + std::vector close_positions_one_based; + open_positions_zero_based.reserve(d.N >> 1); + open_positions_one_based.reserve(d.N >> 1); + close_positions_one_based.reserve(d.N >> 1); + for (std::size_t i = 0; i < N; ++i) { + if (d.bits[i] == '1') { + open_positions_zero_based.push_back(i); + open_positions_one_based.push_back(i + 1); + } else { + close_positions_one_based.push_back(i + 1); + } + } - std::uniform_int_distribution ind_dist_incl(0, N ? N : 0); + std::size_t sample_count = std::max( + 1, std::min(args.Q, std::size_t(32768))); + + std::uniform_int_distribution nonedge_index_distribution( + N > 1 ? 1 : 0, N > 1 ? (N - 1) : 0); std::uniform_int_distribution ind_dist(0, N ? (N - 1) : 0); std::uniform_int_distribution ind_dist_1N(1, N ? N : 1); - std::uniform_int_distribution d_dist(-8, +8); + std::uniform_int_distribution rank10_end_position_distribution( + N >= 2 ? 2 : 0, N >= 2 ? N : 0); + std::uniform_int_distribution delta_distribution(-8, +8); - d.pool.inds_any.resize(L); - d.pool.inds.resize(L); - d.pool.inds_1N.resize(L); - d.pool.deltas.resize(L); - d.pool.segs.resize(L); + d.pool.inds_any.resize(sample_count); + d.pool.rank10_end_positions.resize(sample_count); + d.pool.inds.resize(sample_count); + d.pool.inds_1N.resize(sample_count); + d.pool.deltas.resize(sample_count); + d.pool.segs.resize(sample_count); auto rand_ij = [&]() -> std::pair { if (N == 0) { @@ -236,22 +329,46 @@ static Dataset build_dataset(std::size_t N) { return {a, b}; }; - for (std::size_t i = 0; i < L; ++i) { - d.pool.inds_any[i] = ind_dist_incl(rng); + for (std::size_t i = 0; i < sample_count; ++i) { + d.pool.inds_any[i] = nonedge_index_distribution(rng); + d.pool.rank10_end_positions[i] = rank10_end_position_distribution(rng); d.pool.inds[i] = (N ? ind_dist(rng) : 0); d.pool.inds_1N[i] = (N ? ind_dist_1N(rng) : 0); - d.pool.deltas[i] = d_dist(rng); + d.pool.deltas[i] = delta_distribution(rng); d.pool.segs[i] = rand_ij(); } + auto fill_from_candidates = [&](const std::vector& candidates, + std::vector& destination, + std::size_t fallback_value) { + destination.resize(sample_count); + if (candidates.empty()) { + std::fill(destination.begin(), destination.end(), fallback_value); + return; + } + std::uniform_int_distribution index_distribution( + 0, candidates.size() - 1); + for (std::size_t sample_index = 0; sample_index < sample_count; + ++sample_index) { + destination[sample_index] = candidates[index_distribution(rng)]; + } + }; + const std::size_t one_based_fallback = (N > 0 ? 1 : 0); + fill_from_candidates(open_positions_zero_based, + d.pool.open_positions_zero_based, 0); + fill_from_candidates(open_positions_one_based, + d.pool.open_positions_one_based, one_based_fallback); + fill_from_candidates(close_positions_one_based, + d.pool.close_positions_one_based, one_based_fallback); + auto fill_ks = [&](std::size_t total, std::vector& out) { - out.resize(L); + out.resize(sample_count); if (total == 0) { std::fill(out.begin(), out.end(), 1); return; } std::uniform_int_distribution dist(1, total); - for (std::size_t i = 0; i < L; ++i) { + for (std::size_t i = 0; i < sample_count; ++i) { out[i] = dist(rng); } }; @@ -259,8 +376,8 @@ static Dataset build_dataset(std::size_t N) { fill_ks(d.cnt0, d.pool.ks0); fill_ks(d.cnt10, d.pool.ks10); - d.pool.minselect_q.resize(L); - for (std::size_t i = 0; i < L; ++i) { + d.pool.minselect_q.resize(sample_count); + for (std::size_t i = 0; i < sample_count; ++i) { auto [l, r] = d.pool.segs[i]; std::size_t c = d.t.mincount(l, r); if (c == 0) { @@ -326,11 +443,14 @@ static void register_all() { benchmark::DoNotOptimize(r); }); - register_op("rank10", data, - [&](benchmark::State&, const Dataset& D, std::size_t k) { - auto r = D.t.rank10(P.inds_any[k % P.inds_any.size()]); - benchmark::DoNotOptimize(r); - }); + register_op( + "rank10", data, + [&](benchmark::State&, const Dataset& dataset, + std::size_t sample_index) { + benchmark::DoNotOptimize(dataset.t.rank10( + dataset.pool.rank10_end_positions + [sample_index % dataset.pool.rank10_end_positions.size()])); + }); register_op("select10", data, [&](benchmark::State&, const Dataset& D, std::size_t k) { @@ -360,16 +480,20 @@ static void register_all() { register_op("range_min_query_pos", data, [&](benchmark::State&, const Dataset& D, std::size_t k) { - auto [i, j] = P.segs[k % P.segs.size()]; - auto r = D.t.range_min_query_pos(i, j); - benchmark::DoNotOptimize(r); + const std::size_t segment_index = k % P.segs.size(); + auto [range_begin, range_end] = P.segs[segment_index]; + auto min_position = + D.t.range_min_query_pos(range_begin, range_end); + benchmark::DoNotOptimize(min_position); }); register_op("range_min_query_val", data, [&](benchmark::State&, const Dataset& D, std::size_t k) { - auto [i, j] = P.segs[k % P.segs.size()]; - auto r = D.t.range_min_query_val(i, j); - benchmark::DoNotOptimize(r); + const std::size_t segment_index = k % P.segs.size(); + auto [range_begin, range_end] = P.segs[segment_index]; + auto min_value = + D.t.range_min_query_val(range_begin, range_end); + benchmark::DoNotOptimize(min_value); }); register_op("range_max_query_pos", data, @@ -403,27 +527,39 @@ static void register_all() { }); register_op("close", data, - [&](benchmark::State&, const Dataset& D, std::size_t k) { - if (D.N == 0) { + [&](benchmark::State&, const Dataset& dataset, + std::size_t sample_index) { + if (dataset.N == 0) { benchmark::DoNotOptimize(0); return; } - auto i = P.inds[k % P.inds.size()]; - auto r = D.t.close(i); - benchmark::DoNotOptimize(r); + const std::size_t open_position = + dataset.pool.open_positions_zero_based + [sample_index % + dataset.pool.open_positions_zero_based.size()]; + benchmark::DoNotOptimize(dataset.t.close(open_position)); }); - register_op("open", data, - [&](benchmark::State&, const Dataset& D, std::size_t k) { - auto r = D.t.open(P.inds_1N[k % P.inds_1N.size()]); - benchmark::DoNotOptimize(r); - }); - - register_op("enclose", data, - [&](benchmark::State&, const Dataset& D, std::size_t k) { - auto r = D.t.enclose(P.inds_1N[k % P.inds_1N.size()]); - benchmark::DoNotOptimize(r); - }); + register_op( + "open", data, + [&](benchmark::State&, const Dataset& dataset, + std::size_t sample_index) { + const std::size_t close_position_one_based = + dataset.pool.close_positions_one_based + [sample_index % + dataset.pool.close_positions_one_based.size()]; + benchmark::DoNotOptimize(dataset.t.open(close_position_one_based)); + }); + + register_op( + "enclose", data, + [&](benchmark::State&, const Dataset& dataset, + std::size_t sample_index) { + const std::size_t open_position_one_based = + dataset.pool.open_positions_one_based + [sample_index % dataset.pool.open_positions_one_based.size()]; + benchmark::DoNotOptimize(dataset.t.enclose(open_position_one_based)); + }); } } diff --git a/src/bench_rmm_sdsl.cpp b/src/bench_rmm_sdsl.cpp new file mode 100644 index 0000000..d7e450e --- /dev/null +++ b/src/bench_rmm_sdsl.cpp @@ -0,0 +1,594 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using sdsl::bit_vector; +using sdsl::bp_support_sada; +using BpSupport = bp_support_sada<>; + +struct Args { + int min_exp = 14; + int max_exp = 22; + std::size_t Q = 200000; + double p1 = 0.5; + std::uint64_t seed = 42; + std::size_t block_bits = 64; + int per_octave = 10; + std::vector explicit_sizes; +}; + +static Args args; + +static void parse_args_and_strip(int& argc, char**& argv) { + auto is_value_token = [&](int candidate_index) -> bool { + if (candidate_index >= argc) { + return false; + } + std::string candidate = argv[candidate_index]; + return !(candidate.size() >= 2 && candidate[0] == '-' && + candidate[1] == '-'); + }; + + auto get_value = [&](const std::string& key) -> std::optional { + std::string prefix = "--" + key; + std::string prefix_with_equals = prefix + "="; + for (int argument_index = 1; argument_index < argc; ++argument_index) { + std::string argument = argv[argument_index]; + if (argument.rfind(prefix_with_equals, 0) == 0) { + return argument.substr(prefix_with_equals.size()); + } + if (argument == prefix && is_value_token(argument_index + 1)) { + return std::string(argv[argument_index + 1]); + } + } + return std::nullopt; + }; + + auto strip = [&](const std::string& key) { + std::string prefix = "--" + key; + std::string prefix_with_equals = prefix + "="; + int write_index = 0; + for (int argument_index = 0; argument_index < argc; ++argument_index) { + std::string argument = argv[argument_index]; + if (argument_index > 0 && argument.rfind(prefix_with_equals, 0) == 0) { + continue; + } + if (argument_index > 0 && argument == prefix && + is_value_token(argument_index + 1)) { + ++argument_index; + continue; + } + argv[write_index++] = argv[argument_index]; + } + argc = write_index; + }; + + if (auto argument_value = get_value("min_exp")) { + args.min_exp = std::stoi(*argument_value); + strip("min_exp"); + } + if (auto argument_value = get_value("max_exp")) { + args.max_exp = std::stoi(*argument_value); + strip("max_exp"); + } + if (auto argument_value = get_value("Q")) { + args.Q = std::stoull(*argument_value); + strip("Q"); + } + if (auto argument_value = get_value("p1")) { + args.p1 = std::stod(*argument_value); + strip("p1"); + } + if (auto argument_value = get_value("seed")) { + args.seed = std::stoull(*argument_value); + strip("seed"); + } + if (auto argument_value = get_value("block_bits")) { + args.block_bits = std::stoull(*argument_value); + strip("block_bits"); + } + if (auto argument_value = get_value("per_octave")) { + args.per_octave = std::stoi(*argument_value); + strip("per_octave"); + } + + if (auto argument_value = get_value("explicit_sizes")) { + std::string sizes_text = *argument_value; + strip("explicit_sizes"); + std::size_t position = 0; + while (position < sizes_text.size()) { + while (position < sizes_text.size() && + (sizes_text[position] == ',' || + std::isspace(static_cast(sizes_text[position])))) { + ++position; + } + std::size_t start_index = position; + while (position < sizes_text.size() && + std::isdigit(static_cast(sizes_text[position]))) { + ++position; + } + if (start_index < position) { + args.explicit_sizes.push_back(std::stoull( + sizes_text.substr(start_index, position - start_index))); + } + } + std::sort(args.explicit_sizes.begin(), args.explicit_sizes.end()); + args.explicit_sizes.erase( + std::unique(args.explicit_sizes.begin(), args.explicit_sizes.end()), + args.explicit_sizes.end()); + } +} + +static std::string random_dyck_bits(std::size_t N, + double p1, + std::mt19937_64& rng) { + std::size_t pairs = N >> 1; + std::size_t L = pairs << 1; + std::string s; + s.resize(L); + if (L == 0) { + return s; + } + p1 = std::clamp(p1, 0., 1.); + std::size_t opens_left = pairs; + std::size_t closes_left = pairs; + int balance = 0; + std::uniform_real_distribution U(0., 1.); + for (std::size_t i = 0; i < L; ++i) { + if (opens_left == 0) { + s[i] = '0'; + --closes_left; + --balance; + continue; + } + + if (closes_left == 0) { + s[i] = '1'; + --opens_left; + ++balance; + continue; + } + + if (balance == 0) { + s[i] = '1'; + --opens_left; + ++balance; + continue; + } + if (U(rng) < p1) { + s[i] = '1'; + --opens_left; + ++balance; + } else { + s[i] = '0'; + --closes_left; + --balance; + } + } + return s; +} + +static std::vector build_size_grid() { + if (!args.explicit_sizes.empty()) { + std::vector v = args.explicit_sizes; + v.erase(std::remove(v.begin(), v.end(), 0), v.end()); + return v; + } + + const int lo = args.min_exp, hi = args.max_exp; + if (args.per_octave <= 0) { + std::vector v; + for (int e = lo; e <= hi; ++e) { + v.push_back(std::size_t(1) << e); + } + return v; + } + + int per_octave_steps = args.per_octave; + std::set N_cands; + std::size_t min_N = std::size_t(1) << lo, max_N = std::size_t(1) << hi; + + for (int e = lo; e <= hi; ++e) { + for (int t = 0; t <= per_octave_steps; ++t) { + long double x = e + (long double)t / (long double)per_octave_steps; + std::size_t N = (std::size_t)std::llround(std::pow(2.0L, x)); + if (N < min_N) { + N = min_N; + } + if (N > max_N) { + N = max_N; + } + if (N) { + N_cands.insert(N); + } + } + } + + std::vector v(N_cands.begin(), N_cands.end()); + if (v.front() != min_N) { + v.insert(v.begin(), min_N); + } + if (v.back() != max_N) { + v.push_back(max_N); + } + return v; +} + +struct Pools { + std::vector inds_any; + std::vector rank10_end_positions; + std::vector open_positions_zero_based; + std::vector open_positions_one_based; + std::vector close_positions_one_based; + std::vector inds; + std::vector inds_1N; + std::vector deltas; + std::vector> segs; + + std::vector ks1, ks0, ks10; + std::vector minselect_q; +}; + +struct Dataset { + std::size_t N{}; + std::string bits; + bit_vector bv; + BpSupport t; + + std::size_t cnt1{}, cnt0{}, cnt10{}; + Pools pool; +}; + +static std::vector> keepalive; + +static std::size_t count10(const std::string& s) { + std::size_t c = 0; + if (s.size() < 2) { + return 0; + } + for (std::size_t i = 0; i + 1 < s.size(); ++i) { + if (s[i] == '1' && s[i + 1] == '0') { + ++c; + } + } + return c; +} + +static std::shared_ptr build_dataset(std::size_t N) { + std::mt19937_64 rng(args.seed ^ + static_cast(N) * 0x9E3779B185EBCA87ull); + + auto d = std::make_shared(); + d->bits = random_dyck_bits(N, args.p1, rng); + d->N = d->bits.size(); + N = d->N; + + d->bv = bit_vector(N); + for (std::size_t i = 0; i < N; ++i) { + d->bv[i] = (d->bits[i] == '1'); + } + d->t = BpSupport(&d->bv); + + d->cnt1 = std::count(d->bits.begin(), d->bits.end(), '1'); + d->cnt0 = N - d->cnt1; + d->cnt10 = count10(d->bits); + + std::vector open_positions_zero_based; + std::vector open_positions_one_based; + std::vector close_positions_one_based; + open_positions_zero_based.reserve(d->N >> 1); + open_positions_one_based.reserve(d->N >> 1); + close_positions_one_based.reserve(d->N >> 1); + for (std::size_t i = 0; i < N; ++i) { + if (d->bits[i] == '1') { + open_positions_zero_based.push_back(i); + open_positions_one_based.push_back(i + 1); + } else { + close_positions_one_based.push_back(i + 1); + } + } + + std::size_t sample_count = std::max( + 1, std::min(args.Q, std::size_t(32768))); + + std::uniform_int_distribution nonedge_index_distribution( + N > 1 ? 1 : 0, N > 1 ? (N - 1) : 0); + std::uniform_int_distribution ind_dist(0, N ? (N - 1) : 0); + std::uniform_int_distribution ind_dist_1N(1, N ? N : 1); + std::uniform_int_distribution rank10_end_position_distribution( + N >= 2 ? 2 : 0, N >= 2 ? N : 0); + std::uniform_int_distribution delta_distribution(-8, +8); + + d->pool.inds_any.resize(sample_count); + d->pool.rank10_end_positions.resize(sample_count); + d->pool.inds.resize(sample_count); + d->pool.inds_1N.resize(sample_count); + d->pool.deltas.resize(sample_count); + d->pool.segs.resize(sample_count); + + auto rand_ij = [&]() -> std::pair { + if (N == 0) { + return {0, 0}; + } + std::size_t a = ind_dist(rng), b = ind_dist(rng); + if (a > b) { + std::swap(a, b); + } + return {a, b}; + }; + + for (std::size_t i = 0; i < sample_count; ++i) { + d->pool.inds_any[i] = nonedge_index_distribution(rng); + d->pool.rank10_end_positions[i] = rank10_end_position_distribution(rng); + d->pool.inds[i] = (N ? ind_dist(rng) : 0); + d->pool.inds_1N[i] = (N ? ind_dist_1N(rng) : 0); + d->pool.deltas[i] = delta_distribution(rng); + d->pool.segs[i] = rand_ij(); + } + + auto fill_from_candidates = [&](const std::vector& candidates, + std::vector& destination, + std::size_t fallback_value) { + destination.resize(sample_count); + if (candidates.empty()) { + std::fill(destination.begin(), destination.end(), fallback_value); + return; + } + std::uniform_int_distribution index_distribution( + 0, candidates.size() - 1); + for (std::size_t sample_index = 0; sample_index < sample_count; + ++sample_index) { + destination[sample_index] = candidates[index_distribution(rng)]; + } + }; + const std::size_t one_based_fallback = (N > 0 ? 1 : 0); + fill_from_candidates(open_positions_zero_based, + d->pool.open_positions_zero_based, 0); + fill_from_candidates(open_positions_one_based, + d->pool.open_positions_one_based, one_based_fallback); + fill_from_candidates(close_positions_one_based, + d->pool.close_positions_one_based, one_based_fallback); + + auto fill_ks = [&](std::size_t total, std::vector& out) { + out.resize(sample_count); + if (total == 0) { + std::fill(out.begin(), out.end(), 1); + return; + } + std::uniform_int_distribution dist(1, total); + for (std::size_t i = 0; i < sample_count; ++i) { + out[i] = dist(rng); + } + }; + fill_ks(d->cnt1, d->pool.ks1); + fill_ks(d->cnt0, d->pool.ks0); + fill_ks(d->cnt10, d->pool.ks10); + + d->pool.minselect_q.clear(); + + return d; +} + +template +static void register_op(const std::string& op, + std::shared_ptr data, + Fn&& body) { + auto idx_ptr = std::make_shared(0); + + auto* b = benchmark::RegisterBenchmark( + op.c_str(), [data, idx_ptr, body](benchmark::State& state) { + const Dataset& D = *data; + for (auto _ : state) { + std::size_t i = (*idx_ptr)++; + body(state, D, i); + } + state.counters["N"] = static_cast(D.N); + state.counters["seed"] = static_cast(args.seed); + state.counters["block_bits"] = static_cast(args.block_bits); + }); + + b->Unit(benchmark::kNanosecond); +} + +static void register_all() { + auto Ns = build_size_grid(); + for (std::size_t N : Ns) { + auto data = build_dataset(N); + keepalive.push_back(data); + + const auto& P = data->pool; + + register_op("rank1", data, + [&](benchmark::State&, const Dataset& D, std::size_t k) { + if (D.N == 0) { + benchmark::DoNotOptimize(0); + return; + } + std::size_t pos = P.inds_any[k % P.inds_any.size()]; + std::size_t r = 0; + if (pos > 0) { + r = D.t.rank(pos - 1); + } + benchmark::DoNotOptimize(r); + }); + + register_op("rank0", data, + [&](benchmark::State&, const Dataset& D, std::size_t k) { + if (D.N == 0) { + benchmark::DoNotOptimize(0); + return; + } + std::size_t pos = P.inds_any[k % P.inds_any.size()]; + std::size_t r = 0; + if (pos == 0) { + r = 0; + } else if (pos >= D.N) { + r = D.cnt0; + } else { + r = D.t.preceding_closing_parentheses(pos); + } + benchmark::DoNotOptimize(r); + }); + + register_op("select1", data, + [&](benchmark::State&, const Dataset& D, std::size_t k) { + if (D.cnt1 == 0) { + benchmark::DoNotOptimize(0); + return; + } + std::size_t q = P.ks1[k % P.ks1.size()]; + auto r = D.t.select(q); + benchmark::DoNotOptimize(r); + }); + + register_op("excess", data, + [&](benchmark::State&, const Dataset& D, std::size_t k) { + if (D.N == 0) { + benchmark::DoNotOptimize(0); + return; + } + std::size_t pos = P.inds_any[k % P.inds_any.size()]; + BpSupport::difference_type r = 0; + if (pos > 0) { + r = D.t.excess(pos - 1); + } + benchmark::DoNotOptimize(r); + }); + + register_op("range_min_query_pos", data, + [&](benchmark::State&, const Dataset& D, std::size_t k) { + if (D.N == 0) { + benchmark::DoNotOptimize(0); + return; + } + const std::size_t segment_index = k % P.segs.size(); + auto [range_begin, range_end] = P.segs[segment_index]; + auto min_position = D.t.rmq(range_begin, range_end); + benchmark::DoNotOptimize(min_position); + }); + + register_op("range_min_query_val", data, + [&](benchmark::State&, const Dataset& D, std::size_t k) { + if (D.N == 0) { + benchmark::DoNotOptimize(0); + return; + } + const std::size_t segment_index = k % P.segs.size(); + auto [range_begin, range_end] = P.segs[segment_index]; + auto min_position = D.t.rmq(range_begin, range_end); + BpSupport::difference_type min_value = 0; + if (min_position < D.N) { + const auto base_excess = + (range_begin == 0 ? 0 : D.t.excess(range_begin - 1)); + min_value = D.t.excess(min_position) - base_excess; + } + benchmark::DoNotOptimize(min_value); + }); + + register_op("close", data, + [&](benchmark::State&, const Dataset& dataset, + std::size_t sample_index) { + if (dataset.N == 0) { + benchmark::DoNotOptimize(0); + return; + } + const std::size_t open_position = + dataset.pool.open_positions_zero_based + [sample_index % + dataset.pool.open_positions_zero_based.size()]; + benchmark::DoNotOptimize(dataset.t.find_close(open_position)); + }); + + register_op( + "open", data, + [&](benchmark::State&, const Dataset& dataset, + std::size_t sample_index) { + if (dataset.N == 0) { + benchmark::DoNotOptimize(0); + return; + } + const std::size_t close_position_one_based = + dataset.pool.close_positions_one_based + [sample_index % + dataset.pool.close_positions_one_based.size()]; + const std::size_t close_position_zero_based = + (close_position_one_based > 0 ? close_position_one_based - 1 : 0); + benchmark::DoNotOptimize( + dataset.t.find_open(close_position_zero_based)); + }); + + register_op( + "enclose", data, + [&](benchmark::State&, const Dataset& dataset, + std::size_t sample_index) { + if (dataset.N == 0) { + benchmark::DoNotOptimize(0); + return; + } + const std::size_t open_position_one_based = + dataset.pool.open_positions_one_based + [sample_index % dataset.pool.open_positions_one_based.size()]; + const std::size_t open_position_zero_based = + (open_position_one_based > 0 ? open_position_one_based - 1 : 0); + benchmark::DoNotOptimize(dataset.t.enclose(open_position_zero_based)); + }); + } +} + +int main(int argc, char** argv) { + benchmark::MaybeReenterWithoutASLR(argc, argv); + parse_args_and_strip(argc, argv); + + auto has = [&](const char* key) { + std::string p1 = std::string(key) + "="; + for (int i = 1; i < argc; ++i) { + std::string s = argv[i]; + if (s == key || s.rfind(p1, 0) == 0) { + return true; + } + } + return false; + }; + + static std::vector extra; + if (!has("--benchmark_out_format")) { + extra.emplace_back("--benchmark_out_format=json"); + } + if (!has("--benchmark_counters_tabular")) { + extra.emplace_back("--benchmark_counters_tabular=true"); + } + if (!has("--benchmark_time_unit")) { + extra.emplace_back("--benchmark_time_unit=ns"); + } + + static std::vector argv_vec; + argv_vec.assign(argv, argv + argc); + for (auto& s : extra) { + argv_vec.push_back(s.data()); + } + argv_vec.push_back(nullptr); + argc = (int)argv_vec.size() - 1; + argv = argv_vec.data(); + + benchmark::Initialize(&argc, argv); + register_all(); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); + return 0; +} diff --git a/src/test_rmm.cpp b/src/test_rmm.cpp index 9571a91..c0bf86b 100644 --- a/src/test_rmm.cpp +++ b/src/test_rmm.cpp @@ -58,47 +58,45 @@ static std::string random_bits(std::mt19937_64& rng, size_t n) { } static std::string random_dyck_bits(std::mt19937_64& rng, size_t m) { - std::string s(2 * m, '0'); - if (m == 0) { + std::size_t L = m << 1; + std::string s; + s.resize(L); + if (L == 0) { return s; } - size_t opens_left = m; - size_t closes_left = m; - int h = 0; - std::bernoulli_distribution coin(0.5); - for (size_t pos = 0; pos < 2 * m; ++pos) { + std::size_t opens_left = m; + std::size_t closes_left = m; + int balance = 0; + std::uniform_real_distribution U(0., 1.); + for (std::size_t i = 0; i < L; ++i) { if (opens_left == 0) { - s[pos] = '0'; + s[i] = '0'; --closes_left; - --h; + --balance; continue; } + if (closes_left == 0) { - s[pos] = '1'; + s[i] = '1'; --opens_left; - ++h; + ++balance; continue; } - if (h == 0) { - s[pos] = '1'; + + if (balance == 0) { + s[i] = '1'; --opens_left; - ++h; - continue; - } - if ((size_t)h == closes_left) { - s[pos] = '0'; - --closes_left; - --h; + ++balance; continue; } - if (coin(rng)) { - s[pos] = '1'; + if (U(rng) < 0.5) { + s[i] = '1'; --opens_left; - ++h; + ++balance; } else { - s[pos] = '0'; + s[i] = '0'; --closes_left; - --h; + --balance; } } return s;