diff --git a/CMakeLists.txt b/CMakeLists.txt
index 63b1877..0f29f3a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,83 @@ if(ENABLE_ADDRESS_SANITIZER)
     message(STATUS "AddressSanitizer is ON")
 endif()
 
+set(SDSL_ROOT "" CACHE PATH "Installation prefix of sdsl-lite (contains include/ and lib/)")
+
+find_path(
+    SDSL_INCLUDE_DIR
+    NAMES sdsl/config.hpp
+    HINTS
+        "${SDSL_ROOT}"
+    PATH_SUFFIXES
+        include
+    PATHS
+        "$ENV{HOME}"
+        /usr
+        /usr/local
+)
+
+if(NOT SDSL_INCLUDE_DIR)
+    message(FATAL_ERROR "Could not find sdsl/config.hpp. "
+                        "Set SDSL_ROOT or SDSL_INCLUDE_DIR manually.")
+endif()
+
+find_library(
+    SDSL_LIBRARY
+    NAMES sdsl
+    HINTS
+        "${SDSL_ROOT}"
+    PATH_SUFFIXES
+        lib
+        lib64
+    PATHS
+        "$ENV{HOME}"
+        /usr
+        /usr/local
+)
+
+find_library(
+    DIVSUFSORT_LIBRARY
+    NAMES divsufsort
+    HINTS
+        "${SDSL_ROOT}"
+    PATH_SUFFIXES
+        lib
+        lib64
+    PATHS
+        "$ENV{HOME}"
+        /usr
+        /usr/local
+)
+
+find_library(
+    DIVSUFSORT64_LIBRARY
+    NAMES divsufsort64
+    HINTS
+        "${SDSL_ROOT}"
+    PATH_SUFFIXES
+        lib
+        lib64
+    PATHS
+        "$ENV{HOME}"
+        /usr
+        /usr/local
+)
+
+if(NOT SDSL_LIBRARY OR NOT DIVSUFSORT_LIBRARY OR NOT DIVSUFSORT64_LIBRARY)
+    message(FATAL_ERROR "Could not find sdsl-lite libraries (sdsl, divsufsort, divsufsort64). "
+                        "Set SDSL_ROOT or SDSL_LIBRARY*/DIVSUFSORT* manually.")
+endif()
+
+add_library(sdsl_lite INTERFACE)
+target_include_directories(sdsl_lite INTERFACE "${SDSL_INCLUDE_DIR}")
+target_link_libraries(sdsl_lite INTERFACE
+    "${SDSL_LIBRARY}"
+    "${DIVSUFSORT_LIBRARY}"
+    "${DIVSUFSORT64_LIBRARY}"
+)
+message(STATUS "Found sdsl-lite includes in: ${SDSL_INCLUDE_DIR}")
+message(STATUS "Found sdsl-lite libs: ${SDSL_LIBRARY}; ${DIVSUFSORT_LIBRARY}; ${DIVSUFSORT64_LIBRARY}")
+
 include(FetchContent)
 FetchContent_Declare(
     googletest
@@ -103,6 +180,17 @@ target_link_libraries(test_rmm
     gtest
     gtest_main)
 
+add_executable(bench_rmm_sdsl
+    src/bench_rmm_sdsl.cpp
+)
+target_include_directories(bench_rmm_sdsl
+    PUBLIC include
+)
+target_link_libraries(bench_rmm_sdsl
+    PRIVATE
+        benchmark
+        sdsl_lite
+)
 
 FetchContent_Declare(
     doxygen-awesome-css
diff --git a/README.md b/README.md
index f1a5096..db4ef89 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,14 @@ Benchmarks are random 50/50 0-1 bitvectors up to $2^{34}$ bits.
 ./bench_rmm
 ```
 
-For visualization, write the JSON output to a file using `--benchmark_out=<file>` (e.g. `./bench_rmm --benchmark_out=rmm_bench.json`) and plot it with `misc/plot_rmm.py`.
+For comparison with range min-max tree implementation from [sdsl-lite](https://github.com/simongog/sdsl-lite) (Release build required: `cmake .. -DCMAKE_BUILD_TYPE=Release`):
+
+```bash
+sudo cpupower frequency-set --governor performance
+./bench_rmm_sdsl --benchmark_out=rmm_bench_sdsl.json
+```
+
+For visualization, write the JSON output to a file using `--benchmark_out=<file>` (e.g. `./bench_rmm --benchmark_out=rmm_bench.json`) and plot it with `misc/plot_rmm.py` (add `--sdsl-json rmm_bench_sdsl.json` for comparison).
 
 ---
 
diff --git a/REPORT.md b/REPORT.md
new file mode 100644
index 0000000..120fbd7
--- /dev/null
+++ b/REPORT.md
@@ -0,0 +1,258 @@
+# Отчёт по семестровому проекту: реализация Range Min–Max Tree (RmMTree)
+
+## 1. Введение
+
+В рамках семестрового проекта была реализована структура данных **Range Min–Max Tree (RmMTree)** над битовым вектором для операций над **правильными скобочными последовательностями (balanced parentheses, BP)**. Реализация основана на идеях из статьи (ScienceDirect): https://www.sciencedirect.com/science/article/pii/S0304397516300706
+
+Цель проекта — получить
+- **сжатую** ($o(n)$ доп. памяти)
+- и **быструю** (все операции выполняются за $O(\log n)$ времени и используют различные оптимизации, существенно уменьшающие константы)
+- структуру, поддерживающую типовой набор запросов:
+    - `rank` / `select` по битам,
+    - операции с **excess** (префиксная сумма с шагами +1/−1),
+    - `fwdsearch` / `bwdsearch`,
+    - диапазонные запросы минимума/максимума по префиксным суммам,
+    - навигацию по скобочной последовательности: `close`, `open`, `enclose`,
+    - дополнительные операции `rank10` / `select10` для шаблона `"10"`.
+
+В проекте присутствуют:
+1) наивная эталонная реализация (для проверки на корректность);
+2) основная, оптимизированная, реализация;
+3) набор тестов на корректность (GoogleTest) с сравнением "наивная vs оптимизированная",  
+4) бенчмарки (Google Benchmark) и скрипт построения графиков для визуализации результатов.
+
+---
+
+## 2. Постановка задачи и требования
+
+### 2.1. Входные данные
+- Битовая строка `'0'/'1'` или массив 64-битных слов `uint64_t` с укладкой **LSB-first** (младшие биты — более ранние позиции).
+- Число валидных бит `num_bits`.
+
+### 2.2. Основные поддерживаемые операции
+
+#### Ранг/выборка
+- `rank1(i)`, `rank0(i)` — количество `1` или `0` на префиксе `[0, i)`.
+- `select1(k)`, `select0(k)` — позиция k-й единицы/нуля (k **1-based**).
+
+#### Шаблон `"10"`
+- `rank10(i)` — число позиций `p` в `[0, i)` таких что `bit[p]==1` и `bit[p+1]==0` и `p+1<i`.
+- `select10(k)` — позиция начала k-го шаблона `"10"` (k **1-based**).
+
+#### Excess и поиски по префиксной сумме
+- `excess(i) = rank1(i) - rank0(i) = 2*rank1(i) - i` — префиксная сумма шагов: `+1` для `1` и `-1` для `0`.
+- `fwdsearch(i, d)` — минимальный `p ≥ i`, где `excess(p) = excess(i) + d`.
+- `bwdsearch(i, d)` — максимальный `p ≤ i`, где `excess(p) = excess(i) + d`.
+
+#### Диапазонные min/max по префиксу
+- `range_min_query_pos(i, j)` — позиция первого минимума на `[i..j]`.
+- `range_min_query_val(i, j)` — значение соответствующего минимума.
+- Аналогично: `range_max_query_pos/val`.
+
+#### Количество минимумов и выбор q-го минимума
+- `mincount(i, j)` — сколько раз минимум достигается на `[i..j]`.
+- `minselect(i, j, q)` — позиция q-го минимума (q **1-based**).
+
+#### BP-навигация
+Интерпретация: `1` — это `'('`, `0` — это `')'`.
+- `close(open_pos)` — найти соответствующую закрывающую скобку.
+- `open(close_pos)` — найти соответствующую открывающую скобку.
+- `enclose(pos)` — это **открывающая скобка ближайшей внешней пары**, которая содержит позицию `pos` **внутри себя**, а не совпадает с ней *(если рассматривать BP как дерево, то `enclose(pos)` — это **открывающая скобка родительского узла** для позиции `pos`, т.е. ближайшая открывающая скобка "на уровень выше")*.
+
+---
+
+## 3. Архитектура решения
+
+Дерево реализовано в двух вариантах:
+
+### 3.1. NaiveRmM — наивная эталонная реализация
+
+Назначение:
+- служит **проверочным эталоном** для тестов;
+- реализует те же операции простыми циклами по массиву бит.
+
+Все операции — асимптотически линейные по длине рассматриваемого диапазона:
+- `rank`: `O(n)`,
+- `select`: `O(n)`,
+- `range_min/max` и `mincount/minselect`: `O(j-i)` и т.п.
+
+Плюсы:
+- простота,
+- легко проверить корректность логики.
+
+Минусы:
+- очевидны.
+
+---
+
+### 3.2. pixie::RmMTree — оптимизированная реализация Range Min–Max Tree
+
+#### 3.2.1. Хранение битов
+- `std::vector<std::uint64_t> bits;`  
+- укладка: **LSB-first** (бит `i` находится в слове `bits[i>>6]` на позиции `(i & 63)`).
+
+Это решение удобно, потому что:
+- `rank` в блоке можно ускорять через `popcount`,
+- `select` в блоке делается через перебор установленных битов (внутри 64 бит),
+- удобно читать байты/16-битные фрагменты для LUT и SIMD.
+
+#### 3.2.2. Идея блокирования и дерева
+Битовый вектор делится на **листья** фиксированного размера `block_bits` (по умолчанию минимум 64, далее может расти по степеням двойки).
+Количество листьев:
+- `leaf_count = ceil(num_bits / block_bits)`
+
+Далее строится **сбалансированное двоичное дерево** в heap-представлении, где:
+- листья соответствуют блокам,
+- внутренние вершины агрегируют статистики по своим поддеревьям.
+
+Индексация:
+- `1` — корень,
+- `first_leaf_index` — индекс первого листа (степень двойки ≥ `leaf_count`),
+- лист `k` хранится в `first_leaf_index + k`.
+
+#### 3.2.3. Какие агрегаты хранятся в вершинах
+
+Для каждого узла дерева хранится набор агрегатов (но не в виде структуры, а в отдельных массивах, где индекс массива совпадает с номером узла):
+- `segment_size_bits[node]` — длина сегмента (в битах), покрываемого узлом.
+- `node_total_excess[node]` — суммарный excess на сегменте (итоговая сумма шагов +1/−1).
+- `node_min_prefix_excess[node]` — минимум префиксной суммы внутри сегмента (относительно начала сегмента).
+- `node_max_prefix_excess[node]` — максимум префиксной суммы внутри сегмента.
+- `node_min_count[node]` — число позиций, где минимум достигается.
+- `node_pattern10_count[node]` — число шаблонов `"10"` внутри сегмента.
+- `node_first_bit[node]`, `node_last_bit[node]` — крайние биты сегмента (нужны для учёта `"10"` на границах между детьми/узлами).
+
+---
+
+## 4. Ускорение сканирования листьев: LUT по байтам и SIMD
+
+### 4.1. LUT на 8 бит
+
+Для обработки хвостов и небольших диапазонов сделана lookup table на все 256 возможных значений байта.
+
+Структура `ByteAgg` хранит для байта:
+- суммарный excess,
+- min/max префикса,
+- количество минимумов,
+- число `"10"` внутри байта,
+- первый/последний бит,
+- позиции первого min/max.
+
+Также построены таблицы:
+- `fwd_pos[byte][delta]` — первая позиция в байте, где префикс равен `delta` (delta ∈ [-8..8]),
+- `bwd_pos[byte][delta]` — последняя позиция.
+
+Это позволяет:
+- быстро находить минимум/максимум и их позиции на диапазонах, выровненных по байтам,
+- быстро искать нужный префикс для маленьких дельт,
+- ускорить `mincount/minselect`,
+- ускорить локальные сканирования в `fwdsearch/bwdsearch`.
+
+### 4.2. SIMD (AVX2) для поиска по префиксу в листе
+
+В реализации предусмотрена опциональная поддержка AVX2 (`PIXIE_AVX2_SUPPORT`):
+- берутся 16 бит (`get_u16`),
+- разворачиваются в 16 "шагов" (+1/−1) по lane’ам,
+- вычисляется префиксная сумма внутри SIMD-регистра,
+- сравнивается с целевым значением, и по mask находится первая/последняя позиция.
+
+Дополнительно использована эвристика:
+- для "маленьких хвостов" и "маленьких дельт" (пример: delta ∈ [-8..8], tail ≤ 256) LUT8 может быть быстрее, чем AVX2 из-за стоимости подготовки SIMD.
+
+---
+
+## 5. Выбор размера блока и оценка накладных расходов
+
+В `RmMTree` предусмотрен механизм выбора `block_bits`:
+- можно задать явно через конструктор/параметр `leaf_block_bits`,
+- можно включить ограничение на относительные накладные расходы по памяти *(объём вспомогательных массивов по отношению к размеру битвектора, далее — **overhead**)* через `max_overhead`$^*$,
+- иначе выбирается значение, близкое к `ceil_pow2(log2(num_bits))`, но не меньше базового порога.
+
+Это даёт возможность балансировать:
+- **скорость** (меньше блок → больше дерево → больше узлов, но меньше сканирования листа),
+- **память** (больше блок → меньше узлов → меньше overhead).
+
+$*$ — `max_overhead`, вообще говоря, задаёт **целевую** верхнюю границу на overhead. При построении структура увеличивает размер листового блока `block_bits`, чтобы уложиться в этот лимит. Однако гарантировать выполнение ограничения не всегда возможно:
+- `block_bits` ограничен сверху значением `min(n, 2^14)` *(введена как защита от неадекватно большого и на практике почти никогда не нужного размера блока)*, поэтому при слишком строгом лимите может не существовать допустимого размера блока;
+- при очень малом `n` накладные расходы неизбежно велики, потому что дерево и массивы имеют "постоянную" составляющую, а `block_bits` нельзя уменьшать ниже 64;
+- дополнительно overhead растёт из-за выравнивания числа листьев до степени двойки (`first_leaf_index = bit_ceil(leaf_count)`), из-за чего появляются **пустые слоты узлов** в heap-представлении (они занимают место в массивах, хотя фактически не покрывают биты).
+
+---
+
+## 6. Тестирование корректности
+
+Тестирование построено по принципу "оптимизированная реализация должна совпадать с наивной" на большом количестве случайных сценариев.
+
+---
+
+## 7. Бенчмаркинг производительности
+
+В проекте реализованы два бенчмарка:
+- `src/bench_rmm.cpp` — бенчмарк реализации `RmMTree`,
+- `src/bench_rmm_sdsl.cpp` — бенчмарк реализации Range Min–Max Tree из библиотеки [sdsl-lite](https://github.com/simongog/sdsl-lite).
+
+### 7.1. Генерация датасетов
+- Используется генератор случайных ПСП;
+- После чего для каждого `N` заранее подготавливаются пулы запросов:
+    - случайные индексы,
+    - случайные отрезки `[i, j]`,
+    - случайные `k` для `select`-операций,
+    - случайные `delta` для `fwdsearch/bwdsearch`,
+    - корректные позиции `'('` и `')'` для `close/open/enclose`,
+    - для `minselect` — заранее генерируется `q` в допустимом диапазоне (через `mincount`).
+
+### 7.2. Поддерживаемые параметры бенчмарков (как аргументы командной строки)
+- `--min_exp`, `--max_exp` — диапазон размеров `N = 2^e`,
+- `--per_octave` — сколько дополнительных точек между степенями двойки,
+- `--explicit_sizes` — явный список размеров,
+- `--Q` — число запросов (сэмплов),
+- `--p1` — параметр генератора,
+- `--seed` — seed,
+- `--block_bits` — размер листа (0 = auto).
+
+---
+
+## 8. Визуализация результатов
+
+Реализован скрипт, читающий JSON-выгрузку Google Benchmark и строящий графики:
+- по оси X — `N` (размер последовательности),
+- по оси Y — `cpu_time` (нс на операцию),
+- отдельно по каждой операции,
+- опционально сравнивает `RmMTree` и `sdsl-lite` на одном графике,
+- может включить `--logx` и медианное сглаживание `--smooth`.
+
+---
+
+## 9. Практические итоги и наблюдения
+
+### 9.1. Корректность
+- Наличие `NaiveRmM` позволило сделать строгую проверку "на каждом шаге" для большого количества случайных случаев.
+- Полный перебор коротких строк (1..8) полезен для ловли ошибок на границах:
+  - пустые диапазоны,
+  - `i=0`, `i=num_bits`,
+  - различия 0-based / 1-based в `open/enclose/bwdsearch`,
+  - пересечения `"10"` на границах слов и блоков.
+
+### 9.2. Производительность
+Заложено несколько уровней ускорения:
+- агрегаты снижают работу на больших диапазонах до $O(\log n)$,
+- LUT8 ускоряет "мелкие" сканы и листовые операции без тяжёлых подготовок,
+- AVX2 ускоряет поиски по префиксным суммам на длинных хвостах.
+
+Самая красивая особенность этой реализации заключается в том, что в таких структурах часто именно "листья" становятся bottleneck-ом, но LUT8 и AVX2 позволяют получить немалый практический выигрыш в скорости работы на этих "листьях", а параметр `block_bits` позволяет подстроить баланс "дерево vs локальное сканирование".
+
+---
+
+## 10. Роадмап дальнейшей работы
+
+1) **BP-обёртка**:
+   - `parent`, `isancestor`, `depth`, `lca` и т.п. поверх `close/open/enclose`.
+2) **AoS**:
+   - подумать над слиянием некоторых массивов в Array-of-Structs для улучшения кэш-локальности.
+3) **Cуперблоки**:
+   - объединить несколько соседних листьев в "суперблок" и строить дерево только по суперблокам, а внутри каждого суперблока хранить компактный локальный индекс (например, массив агрегатов по листьям или микро-структуру). Это позволило бы "срезать" несколько нижних уровней дерева, уменьшив число узлов, объём метаданных и количество обращений к памяти при запросах (особенно в `fwdsearch/bwdsearch` и `range min/max`).
+
+---
+
+## 11. Заключение
+
+В итоге получилась завершённая реализация RmMTree, пригодная для дальнейшего развития в сторону полнофункциональной поддержки succinct-структур над BP.
diff --git a/include/bits.h b/include/bits.h
index c7ee26d..7dbce3c 100644
--- a/include/bits.h
+++ b/include/bits.h
@@ -4,7 +4,9 @@
 
 #include <array>
 #include <bit>
+#include <cstddef>
 #include <cstdint>
+#include <limits>
 #include <numeric>
 
 #if defined(__AVX512VPOPCNTDQ__) && defined(__AVX512F__) && \
@@ -17,7 +19,7 @@
 // Lookup table for 4-bit popcount
 // This table maps each 4-bit value (0-15) to its population count
 // clang-format off
-const __m256i lookup_popcount_4 = _mm256_setr_epi8(
+static inline const __m256i lookup_popcount_4 = _mm256_setr_epi8(
     0, 1, 1, 2,  // 0000, 0001, 0010, 0011
     1, 2, 2, 3,  // 0100, 0101, 0110, 0111
     1, 2, 2, 3,  // 1000, 1001, 1010, 1011
@@ -30,7 +32,7 @@ const __m256i lookup_popcount_4 = _mm256_setr_epi8(
     2, 3, 3, 4   // 1100, 1101, 1110, 1111
 );
 
-const __m256i mask_first_half = _mm256_setr_epi8(
+static inline const __m256i mask_first_half = _mm256_setr_epi8(
     0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF,
     0xFF, 0xFF, 0xFF, 0xFF,
@@ -405,108 +407,3 @@ void rank_32x8(const uint8_t* x, uint8_t* result) {
   }
 #endif
 }
-
-/**
- * @brief Efficiently searches for the first occurrence of a 16-bit value in
- * the range [@p begin, @p end_excl) using AVX2 when available.
- * @details Loads 16 consecutive int16_t elements (256 bits) per iteration.
- * Compares them against the @p target value using vectorized equality.
- * If any match is found, extracts the index of the first matching lane from
- * the comparison mask. Falls back to a scalar tail loop for leftover
- * elements, or to a fully scalar search if AVX2 is not supported.
- * @returns The index of the first match, or @p npos if the value is not found.
- */
-static inline size_t find_forward_equal_i16_avx2(const int16_t* arr,
-                                                 const size_t& begin,
-                                                 const size_t& end_excl,
-                                                 const int16_t& target,
-                                                 const size_t& npos) noexcept {
-#ifdef PIXIE_AVX2_SUPPORT
-  static constexpr size_t STEP = 16;
-  __m256i vtarget = _mm256_set1_epi16(target);
-  size_t i = begin;
-  size_t n = end_excl;
-  for (; i + STEP <= n; i += STEP) {
-    unsigned mask = _mm256_movemask_epi8(_mm256_cmpeq_epi16(
-        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(arr + i)),
-        vtarget));
-    if (mask) {
-      return i + (std::countr_zero(mask) >> 1);
-    }
-  }
-  for (; i < n; ++i) {
-    if (arr[i] == target) {
-      return i;
-    }
-  }
-#else
-  for (size_t i = begin; i < end_excl; ++i) {
-    if (arr[i] == target) {
-      return i;
-    }
-  }
-#endif
-  return npos;
-}
-
-/**
- * @brief Performs a backward search for a 16-bit value in a given range.
- * @details Scans the array segment [@p begin .. @p end_incl] from right to
- * left.
- * If AVX2 is available, processes data in 256-bit blocks (16 × int16_t) using
- * vectorized equality comparison for higher throughput. Falls back to a
- * scalar backward scan when AVX2 is not supported. Returns the index of the
- * rightmost occurrence of @p target, or @p npos if no match is found.
- */
-static inline size_t find_backward_equal_i16_avx2(const int16_t* arr,
-                                                  const size_t& begin,
-                                                  const size_t& end_incl,
-                                                  const int16_t& target,
-                                                  const size_t& npos) noexcept {
-  if (begin > end_incl) {
-    return npos;
-  }
-#ifdef PIXIE_AVX2_SUPPORT
-  static constexpr size_t STEP = 16;
-  size_t len = end_incl + 1 - begin;
-  size_t nblocks = len / STEP;
-  __m256i vtarget = _mm256_set1_epi16(target);
-  if (nblocks > 0) {
-    size_t first_block = begin + (len % STEP);
-    for (size_t p = first_block + (nblocks - 1) * STEP;;) {
-      unsigned mask = _mm256_movemask_epi8(_mm256_cmpeq_epi16(
-          _mm256_loadu_si256(reinterpret_cast<const __m256i*>(arr + p)),
-          vtarget));
-      if (mask) {
-        return p + ((31u - std::countl_zero(mask)) >> 1);
-      }
-      if (p == first_block) {
-        break;
-      }
-      p -= STEP;
-    }
-
-    for (size_t i = first_block; i > begin;) {
-      --i;
-      if (arr[i] == target) {
-        return i;
-      }
-    }
-  } else {
-    for (size_t i = end_incl + 1; i > begin;) {
-      --i;
-      if (arr[i] == target) {
-        return i;
-      }
-    }
-  }
-#else
-  for (size_t i = end_incl + 1; i > begin;) {
-    --i;
-    if (arr[i] == target) {
-      return i;
-    }
-  }
-#endif
-  return npos;
-}
diff --git a/include/rmm_tree.h b/include/rmm_tree.h
index 30b2ad9..c453ef6 100644
--- a/include/rmm_tree.h
+++ b/include/rmm_tree.h
@@ -72,16 +72,10 @@ class RmMTree {
   std::vector<uint32_t> node_pattern10_count;
 
   // node_first_bit = first bit (0/1), node_last_bit = last bit (0/1) of the
-  // segment (to handle "10" crossing) both needed for: rank10, select10.
+  // segment (to handle "10" crossing)
+  // both needed for: rank10, select10.
   std::vector<uint8_t> node_first_bit, node_last_bit;
 
-  // leaf_prefix[k * leaf_prefix_stride + j] =
-  // excess(block_begin + j) - excess(block_begin), j in [0..leaf_size],
-  // block_begin = k * block_bits. needed for:
-  // fwdsearch/bwdsearch/close/open/enclose
-  std::vector<int16_t> leaf_prefix;
-  size_t leaf_prefix_stride = 0;
-
  public:
   /**
    * @brief Sentinel for "not found".
@@ -336,23 +330,43 @@ class RmMTree {
     }
 
     int remaining_delta = delta - leaf_delta;
-    if (leaf_block_index + 1 < leaf_count) {
-      size_t nodes_buffer[64];
-      const size_t node_count = cover_blocks_collect(
-          leaf_block_index + 1, leaf_count - 1, nodes_buffer);
-      size_t segment_base = (leaf_block_index + 1) * block_bits;
-      for (size_t j = 0; j < node_count; ++j) {
-        const size_t node_index = nodes_buffer[j];
-        if (remaining_delta == 0) {
-          return segment_base;
-        }
-        if (node_min_prefix_excess[node_index] <= remaining_delta &&
-            remaining_delta <= node_max_prefix_excess[node_index]) {
-          return descend_fwd(node_index, remaining_delta, segment_base);
+    size_t segment_base = block_end;
+    if (remaining_delta == 0) {
+      return segment_base;
+    }
+
+    // Tree-walk to the right:
+    // go up; whenever we come from a left child, try the right sibling subtree.
+    // If target is inside sibling -> descend; else skip it and continue up.
+    size_t node_index = leaf_index_of(block_begin);
+    const size_t tree_size = segment_size_bits.size() - 1;
+
+    // If we are already at/after the last leaf boundary, there's nothing to
+    // scan.
+    if (segment_base >= num_bits || leaf_block_index + 1 >= leaf_count) {
+      return npos;
+    }
+
+    while (node_index > 1) {
+      const bool is_left_child = ((node_index & 1u) == 0u);
+      if (is_left_child) {
+        const size_t sibling = node_index | 1u;  // right sibling
+        if (sibling <= tree_size && segment_size_bits[sibling]) {
+          // Boundary at sibling start already handled above via
+          // remaining_delta==0.
+          if (node_min_prefix_excess[sibling] <= remaining_delta &&
+              remaining_delta <= node_max_prefix_excess[sibling]) {
+            return descend_fwd(sibling, remaining_delta, segment_base);
+          }
+          // Skip whole sibling subtree.
+          remaining_delta -= node_total_excess[sibling];
+          segment_base += segment_size_bits[sibling];
+          if (remaining_delta == 0) {
+            return segment_base;
+          }
         }
-        remaining_delta -= node_total_excess[node_index];
-        segment_base += segment_size_bits[node_index];
       }
+      node_index >>= 1;
     }
     return npos;
   }
@@ -1267,6 +1281,203 @@ class RmMTree {
     return LUT8_ALL().bwd_pos;
   }
 
+  /**
+   * @brief Extract 16 bits starting at @p position (LSB-first across words).
+   */
+  inline uint16_t get_u16(const size_t& position) const noexcept {
+    const size_t word_index = position >> 6;
+    const unsigned offset = unsigned(position & 63);
+    const std::uint64_t w0 =
+        (word_index < bits.size()) ? bits[word_index] : 0ULL;
+    if (offset == 0) {
+      return uint16_t(w0 & 0xFFFFu);
+    }
+    const std::uint64_t w1 =
+        (word_index + 1 < bits.size()) ? bits[word_index + 1] : 0ULL;
+    const std::uint64_t v = (w0 >> offset) | (w1 << (64u - offset));
+    return uint16_t(v & 0xFFFFu);
+  }
+
+#if defined(PIXIE_AVX2_SUPPORT)
+  static inline __m256i bit_masks_16x() noexcept {
+    // 16 lanes: (1<<0), (1<<1), ... (1<<15)
+    return _mm256_setr_epi16(0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020,
+                             0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800,
+                             0x1000, 0x2000, 0x4000, (int16_t)0x8000);
+  }
+
+  static inline __m256i prefix_sum_16x_i16(__m256i v) noexcept {
+    // Inclusive prefix sum within 128-bit lanes, then fix carry into the high
+    // lane.
+    __m256i x = v;
+    __m256i t = _mm256_slli_si256(x, 2);
+    x = _mm256_add_epi16(x, t);
+    t = _mm256_slli_si256(x, 4);
+    x = _mm256_add_epi16(x, t);
+    t = _mm256_slli_si256(x, 8);
+    x = _mm256_add_epi16(x, t);
+
+    __m128i lo = _mm256_extracti128_si256(x, 0);
+    __m128i hi = _mm256_extracti128_si256(x, 1);
+    const int16_t carry =
+        (int16_t)_mm_extract_epi16(lo, 7);  // sum of first 8 elems
+    hi = _mm_add_epi16(hi, _mm_set1_epi16(carry));
+
+    __m256i out = _mm256_castsi128_si256(lo);
+    out = _mm256_inserti128_si256(out, hi, 1);
+    return out;
+  }
+
+  static inline int16_t last_prefix_16x_i16(__m256i pref) noexcept {
+    __m128i hi = _mm256_extracti128_si256(pref, 1);
+    return (int16_t)_mm_extract_epi16(hi, 7);  // lane 15
+  }
+
+  /**
+   * @brief SIMD forward scan: find first bit position p in [start,end) where
+   *        sum_{k=start..p} step(k) == required_delta.
+   * @param out_total If not null and not found, receives total sum on
+   * [start,end).
+   */
+  inline size_t scan_leaf_fwd_simd(const size_t& start,
+                                   const size_t& end,
+                                   const int& required_delta,
+                                   int* out_total) const noexcept {
+    if (start >= end) {
+      if (out_total) {
+        *out_total = 0;
+      }
+      return npos;
+    }
+    if (required_delta < -32768 || required_delta > 32767) {
+      if (out_total) {
+        const int len = int(end - start);
+        const int ones = int(rank1_in_block(start, end));
+        *out_total = ones * 2 - len;
+      }
+      return npos;
+    }
+
+    static const __m256i masks = bit_masks_16x();
+    static const __m256i vzero = _mm256_setzero_si256();
+    static const __m256i vallones = _mm256_cmpeq_epi16(vzero, vzero);
+    static const __m256i vminus1 = _mm256_set1_epi16(-1);
+    static const __m256i vtwo = _mm256_set1_epi16(2);
+    const __m256i vtarget = _mm256_set1_epi16((int16_t)required_delta);
+
+    int cur = 0;
+    size_t pos = start;
+    while (pos + 16 <= end) {
+      const uint16_t bits16 = get_u16(pos);
+      const __m256i vb = _mm256_set1_epi16((int16_t)bits16);
+      const __m256i m = _mm256_and_si256(vb, masks);
+      const __m256i is_zero = _mm256_cmpeq_epi16(m, vzero);
+      const __m256i is_set = _mm256_andnot_si256(is_zero, vallones);
+      const __m256i steps =
+          _mm256_add_epi16(vminus1, _mm256_and_si256(is_set, vtwo));
+
+      const __m256i pref_rel = prefix_sum_16x_i16(steps);
+      const __m256i base = _mm256_set1_epi16((int16_t)cur);
+      const __m256i pref = _mm256_add_epi16(pref_rel, base);
+      const __m256i cmp = _mm256_cmpeq_epi16(pref, vtarget);
+      const uint32_t mask = (uint32_t)_mm256_movemask_epi8(cmp);
+      if (mask) {
+        const int lane = int(std::countr_zero(mask)) >> 1;
+        return pos + (size_t)lane;
+      }
+      cur += (int)last_prefix_16x_i16(pref_rel);
+      pos += 16;
+    }
+    while (pos < end) {
+      cur += bit(pos) ? +1 : -1;
+      if (cur == required_delta) {
+        return pos;
+      }
+      ++pos;
+    }
+    if (out_total) {
+      *out_total = cur;
+    }
+    return npos;
+  }
+#endif  // PIXIE_AVX2_SUPPORT
+
+  /**
+   * @brief Faster small-delta forward scan for tiny tails (LUT8 + direct byte
+   * loads).
+   * @details This avoids the heavy AVX2 prefix-sum setup cost when the tail is
+   * small. Requires little-endian layout (x86_64).
+   */
+  inline size_t scan_leaf_fwd_lut8_fast(const size_t& start,
+                                        const size_t& end,
+                                        const int& required_delta,
+                                        int* out_total) const noexcept {
+    if (start >= end) {
+      if (out_total) {
+        *out_total = 0;
+      }
+      return npos;
+    }
+    int cur = 0;
+
+    size_t pos = start;
+    while (pos < end && (pos & 7)) {
+      cur += bit(pos) ? +1 : -1;
+      if (cur == required_delta) {
+        if (out_total) {
+          *out_total = cur;
+        }
+        return pos;
+      }
+      ++pos;
+    }
+
+    // Byte-aligned fast path: read bytes directly.
+    // bits are LSB-first; on little-endian x86 the in-memory byte order matches
+    // bit groups [pos..pos+7].
+    const uint8_t* bytep =
+        reinterpret_cast<const uint8_t*>(bits.data()) + (pos >> 3);
+    const auto& agg = LUT8();
+    const auto& fwd = LUT8_FWD_POS();
+
+    while (pos + 8 <= end) {
+      const uint8_t bv = *bytep++;
+      const auto& a = agg[bv];
+      const int need = required_delta - cur;
+      // need must be in [-8..8] for a match inside one byte.
+      if ((unsigned)(need + 8) <= 16u) {
+        // min/max pruning first, then position lookup.
+        if (need >= a.min_prefix && need <= a.max_prefix) {
+          const int8_t off = fwd[bv][need + 8];
+          if (off >= 0) {
+            if (out_total) {
+              *out_total = cur + a.excess_total;  // not exact end, but caller
+                                                  // only uses when not found
+            }
+            return pos + (size_t)off;
+          }
+        }
+      }
+      cur += a.excess_total;
+      pos += 8;
+    }
+
+    while (pos < end) {
+      cur += bit(pos) ? +1 : -1;
+      if (cur == required_delta) {
+        if (out_total) {
+          *out_total = cur;
+        }
+        return pos;
+      }
+      ++pos;
+    }
+    if (out_total) {
+      *out_total = cur;
+    }
+    return npos;
+  }
+
   /**
    * @brief Forward search within a single leaf over the range [@p search_start,
    * @p search_end).
@@ -1324,8 +1535,33 @@ class RmMTree {
       const size_t& block_end,
       const int& required_delta,
       const bool& allow_right_boundary,
-      const size_t& global_right_border) const noexcept {
-    if (block_begin >= block_end) {
+      const size_t& global_right_border,
+      const int& prefix_at_boundary_max /*=kNoPrefixOverride*/) const noexcept {
+    // We scan bits in [block_begin, block_end) and look for the LAST boundary
+    // where prefix == required_delta, with optional exclusion of the right
+    // boundary.
+    if (block_begin > block_end) {
+      return npos;
+    }
+
+    // Maximum allowed boundary (inclusive) inside this scan.
+    // If right boundary is forbidden, we forbid boundary == block_end.
+    size_t boundary_max = block_end;
+    if (!allow_right_boundary && boundary_max > block_begin) {
+      --boundary_max;
+    }
+
+    // No bits to scan -> only possible answer is the left boundary.
+    if (block_begin >= boundary_max) {
+      if ((block_begin < global_right_border || allow_right_boundary) &&
+          required_delta == 0) {
+        return block_begin;
+      }
+      return npos;
+    }
+
+    if (required_delta < -32768 || required_delta > 32767) {
+      // Just in case. Should be impossible.
       if ((block_begin < global_right_border || allow_right_boundary) &&
           required_delta == 0) {
         return block_begin;
@@ -1333,22 +1569,100 @@ class RmMTree {
       return npos;
     }
 
-    const int16_t* leaf_prefix_ptr =
-        &leaf_prefix[block_of(block_begin) * leaf_prefix_stride];
-    const size_t end_inclusive = block_end - block_begin;
-    if (end_inclusive > 0) {
-      const size_t position = ::find_backward_equal_i16_avx2(
-          leaf_prefix_ptr, 1, end_inclusive, required_delta, npos);
-      if (position != npos) {
-        return block_begin + position;
+#if defined(PIXIE_AVX2_SUPPORT)
+    // Fast reverse scan with early exit:
+    // find the RIGHTMOST boundary j in (block_begin..boundary_max] such that
+    // prefix(j) == required_delta.
+
+    // prefix_end = prefix(boundary_max) relative to block_begin
+    int prefix_end = prefix_at_boundary_max;
+    if (prefix_end == kNoPrefixOverride) {
+      const int len = int(boundary_max - block_begin);
+      const int ones = int(rank1_in_block(block_begin, boundary_max));
+      prefix_end = ones * 2 - len;
+    }
+
+    const __m256i masks = bit_masks_16x();
+    const __m256i vzero = _mm256_setzero_si256();
+    const __m256i vallones = _mm256_cmpeq_epi16(vzero, vzero);
+    const __m256i vminus1 = _mm256_set1_epi16(-1);
+    const __m256i vtwo = _mm256_set1_epi16(2);
+    const __m256i vtarget = _mm256_set1_epi16((int16_t)required_delta);
+
+    size_t pos_end = boundary_max;  // boundary (not bit index)
+    int cur_end = prefix_end;       // prefix(pos_end)
+
+    // Vector chunks: process 16 bits ending at pos_end.
+    while (pos_end >= block_begin + 16) {
+      const size_t pos = pos_end - 16;  // bit index of the chunk start
+      const uint16_t bits16 = get_u16(pos);
+      const __m256i vb = _mm256_set1_epi16((int16_t)bits16);
+      const __m256i m = _mm256_and_si256(vb, masks);
+      const __m256i is_zero = _mm256_cmpeq_epi16(m, vzero);
+      const __m256i is_set = _mm256_andnot_si256(is_zero, vallones);
+      const __m256i steps =
+          _mm256_add_epi16(vminus1, _mm256_and_si256(is_set, vtwo));
+
+      const __m256i pref_rel =
+          prefix_sum_16x_i16(steps);  // prefix after each bit (relative)
+      const int16_t sum16 =
+          last_prefix_16x_i16(pref_rel);  // total sum on this 16-bit chunk
+      const int cur_start =
+          cur_end - (int)sum16;  // prefix at boundary pos (chunk start)
+
+      const __m256i base = _mm256_set1_epi16((int16_t)cur_start);
+      const __m256i pref = _mm256_add_epi16(
+          pref_rel, base);  // prefix at boundaries (pos+1..pos+16)
+      const __m256i cmp = _mm256_cmpeq_epi16(pref, vtarget);
+      const uint32_t mask = (uint32_t)_mm256_movemask_epi8(cmp);
+      if (mask) {
+        const int bit_i = 31 - int(std::countl_zero(mask));
+        const int lane = bit_i >> 1;
+        const size_t boundary = pos + (size_t)lane + 1;
+        if (boundary < global_right_border || allow_right_boundary) {
+          return boundary;
+        }
+        return npos;
+      }
+
+      cur_end = cur_start;
+      pos_end = pos;
+    }
+
+    while (pos_end > block_begin) {
+      // boundary pos_end corresponds to prefix cur_end
+      if (cur_end == required_delta) {
+        if (pos_end < global_right_border || allow_right_boundary) {
+          return pos_end;
+        }
+        return npos;
+      }
+      const size_t bit_pos = pos_end - 1;
+      cur_end -= bit(bit_pos) ? +1 : -1;  // move one bit to the left
+      pos_end = bit_pos;
+    }
+#else
+    size_t last_boundary = npos;
+    int cur = 0;
+    for (size_t pos = block_begin; pos < boundary_max; ++pos) {
+      cur += bit(pos) ? +1 : -1;
+      if (cur == required_delta) {
+        last_boundary = pos + 1;
       }
     }
+    if (last_boundary != npos) {
+      if (last_boundary < global_right_border || allow_right_boundary) {
+        return last_boundary;
+      }
+      return npos;
+    }
+#endif
 
+    // Left boundary (prefix == 0) is always the final candidate.
     if ((block_begin < global_right_border || allow_right_boundary) &&
         required_delta == 0) {
       return block_begin;
     }
-
     return npos;
   }
 
@@ -1413,6 +1727,12 @@ class RmMTree {
    */
   size_t first_leaf_index = 1;
 
+  /**
+   * @brief Sentinel to tell scan_leaf_bwd(): "prefix at right boundary not
+   * provided".
+   */
+  static constexpr int kNoPrefixOverride = std::numeric_limits<int>::min();
+
   /**
    * @brief Block index containing position @p position.
    */
@@ -1489,10 +1809,13 @@ class RmMTree {
         node_index = right_child;
       }
     }
-    return scan_leaf_fwd(
-        segment_base,
-        std::min(segment_base + segment_size_bits[node_index], num_bits),
-        required_delta);
+    const size_t seg_end =
+        std::min(segment_base + segment_size_bits[node_index], num_bits);
+#if defined(PIXIE_AVX2_SUPPORT)
+    return scan_leaf_fwd_simd(segment_base, seg_end, required_delta, nullptr);
+#else
+    return scan_leaf_fwd(segment_base, seg_end, required_delta);
+#endif
   }
 
   /**
@@ -1550,12 +1873,26 @@ class RmMTree {
       return npos;
     }
 
-    return scan_leaf_bwd(
-        segment_base,
-        std::min(
-            global_right_border,
-            std::min(segment_base + segment_size_bits[node_index], num_bits)),
-        required_delta, allow_right_boundary, global_right_border);
+    const size_t seg_end =
+        std::min(segment_base + segment_size_bits[node_index], num_bits);
+    const size_t block_end = std::min(global_right_border, seg_end);
+
+    int prefix_override = kNoPrefixOverride;
+    // If we scan the full leaf up to its end boundary, we know
+    // prefix(block_end) from node_total_excess[leaf]. If the right boundary is
+    // forbidden, we can still derive prefix(block_end-1) cheaply.
+    if (block_end == seg_end) {
+      const int total = node_total_excess[node_index];
+      if (!allow_right_boundary && block_end > segment_base) {
+        prefix_override = total - (bit(block_end - 1) ? +1 : -1);
+      } else {
+        prefix_override = total;
+      }
+    }
+
+    return scan_leaf_bwd(segment_base, block_end, required_delta,
+                         allow_right_boundary, global_right_border,
+                         prefix_override);
   }
 
   /**
@@ -1792,8 +2129,7 @@ class RmMTree {
                                    const size_t& block_size_pow2) noexcept {
     static constexpr size_t AUX_SLOT_BYTES =
         sizeof(uint32_t) + sizeof(int32_t) + sizeof(int32_t) + sizeof(int32_t) +
-        sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint8_t) +
-        sizeof(uint8_t) + sizeof(int16_t);
+        sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint8_t);
 
     size_t bitvector_bytes = ceil_div(bit_count, 64) * 8;
     if (bitvector_bytes == 0) {
@@ -2090,22 +2426,38 @@ class RmMTree {
                                  const int& delta,
                                  int& leaf_delta) const noexcept {
     const size_t leaf_length = segment_size_bits[first_leaf_index + leaf_index];
-    const int16_t* leaf_prefix_ptr =
-        &leaf_prefix[leaf_index * leaf_prefix_stride];
-    const size_t offset_in_leaf = start_position - leaf_block_begin;
-    if (offset_in_leaf > leaf_length) {
+    const size_t leaf_end = std::min(num_bits, leaf_block_begin + leaf_length);
+    if (start_position >= leaf_end) {
       leaf_delta = 0;
       return npos;
     }
-    const int16_t start_prefix = leaf_prefix_ptr[offset_in_leaf];
-    size_t match_index = ::find_forward_equal_i16_avx2(
-        leaf_prefix_ptr, offset_in_leaf + 1, leaf_length + 1,
-        start_prefix + delta, npos);
-    if (match_index != npos) {
-      return leaf_block_begin + match_index - 1;
+#if defined(PIXIE_AVX2_SUPPORT)
+    int total = 0;
+    // Heuristic: for tiny tails and tiny deltas, LUT8 wins because AVX2 setup
+    // is heavy...
+    // At least I think so...
+    const size_t tail_len = leaf_end - start_position;
+    size_t res = npos;
+    if (delta >= -8 && delta <= 8 && tail_len <= 256) {
+      res = scan_leaf_fwd_lut8_fast(start_position, leaf_end, delta, &total);
+    } else {
+      res = scan_leaf_fwd_simd(start_position, leaf_end, delta, &total);
+    }
+    if (res != npos) {
+      return res;
     }
-    leaf_delta = leaf_prefix_ptr[leaf_length] - start_prefix;
+    leaf_delta = total;
     return npos;
+#else
+    const size_t res = scan_leaf_fwd(start_position, leaf_end, delta);
+    if (res != npos) {
+      return res;
+    }
+    const int len = int(leaf_end - start_position);
+    const int ones = int(rank1_in_block(start_position, leaf_end));
+    leaf_delta = ones * 2 - len;
+    return npos;
+#endif
   }
 
   /**
@@ -2121,23 +2473,36 @@ class RmMTree {
                                  const size_t& start_position,
                                  const int& delta,
                                  int& leaf_delta) const noexcept {
-    const int16_t* leaf_prefix_ptr =
-        &leaf_prefix[leaf_index * leaf_prefix_stride];
-    const size_t offset_in_leaf = start_position - leaf_block_begin;
-    if (offset_in_leaf > segment_size_bits[first_leaf_index + leaf_index]) {
+    const size_t leaf_length = segment_size_bits[first_leaf_index + leaf_index];
+    const size_t leaf_end = std::min(num_bits, leaf_block_begin + leaf_length);
+    if (start_position < leaf_block_begin || start_position > leaf_end) {
       leaf_delta = 0;
       return npos;
     }
-    const int16_t start_prefix = leaf_prefix_ptr[offset_in_leaf];
-    if (offset_in_leaf > 0) {
-      const size_t match_index = ::find_backward_equal_i16_avx2(
-          leaf_prefix_ptr, 0, offset_in_leaf - 1, start_prefix + delta, npos);
-      if (match_index != npos) {
-        return leaf_block_begin + match_index;
-      }
+
+    // leaf_delta = excess(start_position) - excess(leaf_block_begin)
+    //            = 2*rank1([leaf_begin, start)) - (start - leaf_begin)
+    const int len = int(start_position - leaf_block_begin);
+    const int ones = int(rank1_in_block(leaf_block_begin, start_position));
+    leaf_delta = ones * 2 - len;
+
+    // Must find a boundary strictly < start_position (so
+    // boundary==start_position forbidden).
+    if (start_position == leaf_block_begin) {
+      return npos;
     }
-    leaf_delta = start_prefix;
-    return npos;
+    const int target_prefix = leaf_delta + delta;
+    return scan_leaf_bwd(
+        leaf_block_begin,
+        start_position,  // do not look to the right of start_position
+        target_prefix,
+        false,  // right boundary (=start_position) forbidden
+        start_position,
+        // prefix at boundary_max = start_position-1:
+        // leaf_delta is prefix at start_position, subtract last step
+        (start_position > leaf_block_begin
+             ? (leaf_delta - (bit(start_position - 1) ? +1 : -1))
+             : 0));
   }
 
   /**
@@ -2280,8 +2645,6 @@ class RmMTree {
     node_pattern10_count.assign(tree_size + 1, 0);
     node_first_bit.assign(tree_size + 1, 0);
     node_last_bit.assign(tree_size + 1, 0);
-    leaf_prefix_stride = block_bits + 1;
-    leaf_prefix.assign(leaf_count * leaf_prefix_stride, 0);
 
     // leaves
     for (size_t leaf_block_index = 0; leaf_block_index < leaf_count;
@@ -2367,15 +2730,6 @@ class RmMTree {
           (segment_size_bits[leaf_node_index] == 0 ? 0 : max_value);
       node_min_count[leaf_node_index] = min_count;
       node_pattern10_count[leaf_node_index] = (uint32_t)pattern10_count;
-      int16_t* leaf_prefix_ptr =
-          &leaf_prefix[leaf_block_index * leaf_prefix_stride];
-      int16_t prefix_accumulator = 0;
-      leaf_prefix_ptr[0] = 0;
-      for (size_t position_in_leaf = segment_begin, prefix_index = 1;
-           position_in_leaf < segment_end; ++position_in_leaf, ++prefix_index) {
-        prefix_accumulator += bit(position_in_leaf) ? 1 : -1;
-        leaf_prefix_ptr[prefix_index] = prefix_accumulator;
-      }
     }
     // internal nodes
     for (size_t node_index = first_leaf_index - 1; node_index >= 1;
diff --git a/misc/plot_rmm.py b/misc/plot_rmm.py
index 78e7beb..ce88c99 100644
--- a/misc/plot_rmm.py
+++ b/misc/plot_rmm.py
@@ -1,14 +1,17 @@
 """
-Plot RmMTree benchmark results.
+Plot RmMTree benchmark results (optionally compared with sdsl-lite).
+
+This script reads a JSON produced by `bench_rmm` and, optionally, a JSON
+with benchmarks for sdsl-lite. For each operation, it draws a scatter plot
+of individual points and a trend line (optionally median-smoothed) for each
+implementation on the same figure.
 
-This script reads a JSON produced by `bench_rmm` and,
-for each operation, draws a scatter plot of individual points
-and a trend line (optionally median-smoothed).
 Plots are saved as PNG files and can also be shown interactively.
 
 Examples:
   python3 plot_rmm.py rmm_bench.json --save-dir plots --logx --smooth 3
   python3 plot_rmm.py rmm_bench.json --show
+  python3 plot_rmm.py rmm_bench.json --sdsl-json rmm_bench_sdsl.json --save-dir plots --logx --smooth 3
 """
 
 import argparse
@@ -39,23 +42,49 @@
 ]
 
 
+def load_bench_json(path: str) -> pd.DataFrame:
+    with open(path, "r", encoding="utf-8") as f:
+        data = json.load(f)
+    if isinstance(data, dict) and "benchmarks" in data:
+        data = data["benchmarks"]
+    df = pd.DataFrame(data)
+    required = {"name", "cpu_time", "N"}
+    if not required.issubset(df.columns):
+        missing = ", ".join(sorted(required - set(df.columns)))
+        raise ValueError(
+            f"{path!r}: missing required columns in benchmark JSON: {missing}"
+        )
+    df = df.dropna(subset=["cpu_time", "N"])
+    return df
+
+
 def main():
     ap = argparse.ArgumentParser(
         description=(
-            "Read a JSON with RmMTree benchmark results and plot time per operation "
-            "versus sequence size N for each operation."
+            "Read JSON with RmMTree benchmark results (and optionally "
+            "sdsl-lite results) and plot time per operation versus "
+            "sequence size N for each operation."
         ),
         epilog=(
             "Examples:\n"
             "  python3 plot_rmm.py rmm_bench.json --save-dir plots --logx --smooth 3\n"
-            "  python3 plot_rmm.py rmm_bench.json --show"
+            "  python3 plot_rmm.py rmm_bench.json --show\n"
+            "  python3 plot_rmm.py rmm_bench.json --sdsl-json rmm_bench_sdsl.json --save-dir plots --logx --smooth 3"
         ),
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
     ap.add_argument(
         "json",
         metavar="JSON",
-        help="Path to the JSON file with results (output of bench_rmm).",
+        help="Path to the JSON file with RmMTree results (output of bench_rmm).",
+    )
+    ap.add_argument(
+        "--sdsl-json",
+        metavar="SDSL_JSON",
+        help=(
+            "Path to JSON file with sdsl-lite benchmark results. "
+            "If provided, both implementations will be plotted on the same graphs."
+        ),
     )
     ap.add_argument(
         "--save-dir",
@@ -94,38 +123,97 @@ def main():
     args = ap.parse_args()
 
     os.makedirs(args.save_dir, exist_ok=True)
-    with open(args.json, "r", encoding="utf-8") as f:
-        data = json.load(f)
-    if isinstance(data, dict) and "benchmarks" in data:
-        data = data["benchmarks"]
-    df = pd.DataFrame(data)
-    df = df.dropna(subset=["cpu_time", "N"])
 
-    for op in OPS_ORDER:
-        d = df[df["name"] == op].copy()
-        if d.empty:
+    df_rmm = load_bench_json(args.json)
+    df_sdsl = None
+    if args.sdsl_json is not None:
+        df_sdsl = load_bench_json(args.sdsl_json)
+
+    rmm_ops = set(df_rmm["name"].dropna().astype(str).unique())
+    if df_sdsl is not None:
+        sdsl_ops = set(df_sdsl["name"].dropna().astype(str).unique())
+        common_ops = rmm_ops & sdsl_ops
+        ops_to_plot = [op for op in OPS_ORDER if op in common_ops]
+        extra = sorted(common_ops - set(OPS_ORDER))
+        ops_to_plot.extend(extra)
+    else:
+        ops_to_plot = OPS_ORDER
+
+    for op in ops_to_plot:
+        d_rmm = df_rmm[df_rmm["name"] == op].copy()
+        d_sdsl = None
+        if df_sdsl is not None:
+            d_sdsl = df_sdsl[df_sdsl["name"] == op].copy()
+        if d_rmm.empty or (df_sdsl is not None and (d_sdsl is None or d_sdsl.empty)):
             continue
-        d = d.groupby("N", as_index=False)["cpu_time"].median().sort_values("N")
+        plt.figure()
 
-        yplot = d["cpu_time"]
-        if args.smooth and args.smooth > 1:
-            d["ns_smooth"] = (
-                d["cpu_time"]
-                .rolling(window=args.smooth, center=True, min_periods=1)
-                .median()
+        if not d_rmm.empty:
+            d_rmm = (
+                d_rmm.groupby("N", as_index=False)["cpu_time"].median().sort_values("N")
+            )
+            y_rmm = d_rmm["cpu_time"]
+            if args.smooth and args.smooth > 1:
+                d_rmm["ns_smooth"] = (
+                    d_rmm["cpu_time"]
+                    .rolling(window=args.smooth, center=True, min_periods=1)
+                    .median()
+                )
+                y_rmm = d_rmm["ns_smooth"]
+            plt.scatter(
+                d_rmm["N"],
+                d_rmm["cpu_time"],
+                s=8,
+                alpha=0.3,
+                linewidths=0,
+            )
+            plt.plot(
+                d_rmm["N"],
+                y_rmm,
+                linewidth=1.5,
+                label="RmMTree",
             )
-            yplot = d["ns_smooth"]
 
-        plt.figure()
-        plt.scatter(d["N"], d["cpu_time"], s=8, alpha=0.3, linewidths=0)
-        plt.plot(d["N"], yplot, linewidth=1.5)
+        if d_sdsl is not None and not d_sdsl.empty:
+            d_sdsl = (
+                d_sdsl.groupby("N", as_index=False)["cpu_time"]
+                .median()
+                .sort_values("N")
+            )
+            y_sdsl = d_sdsl["cpu_time"]
+            if args.smooth and args.smooth > 1:
+                d_sdsl["ns_smooth"] = (
+                    d_sdsl["cpu_time"]
+                    .rolling(window=args.smooth, center=True, min_periods=1)
+                    .median()
+                )
+                y_sdsl = d_sdsl["ns_smooth"]
+            plt.scatter(
+                d_sdsl["N"],
+                d_sdsl["cpu_time"],
+                s=8,
+                alpha=0.3,
+                linewidths=0.9,
+                marker="x",
+            )
+            plt.plot(
+                d_sdsl["N"],
+                y_sdsl,
+                linewidth=1.5,
+                linestyle="--",
+                label="sdsl-lite",
+            )
 
         if args.logx:
             plt.xscale("log", base=2)
         plt.xlabel("Sequence size N (bits)")
         plt.ylabel("Time per operation, ns")
-        plt.title(f"RmMTree: {op}")
+        if d_sdsl is not None and not d_sdsl.empty:
+            plt.title(f"RmMTree vs sdsl-lite: {op}")
+        else:
+            plt.title(f"RmMTree: {op}")
         plt.grid(True, which="both", linestyle="--", alpha=0.4)
+        plt.legend(loc="best")
         out = os.path.join(args.save_dir, f"{op}.png")
         plt.savefig(out, bbox_inches="tight", dpi=160)
         if not args.show:
diff --git a/src/bench_rmm.cpp b/src/bench_rmm.cpp
index 523516e..a15866e 100644
--- a/src/bench_rmm.cpp
+++ b/src/bench_rmm.cpp
@@ -6,6 +6,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <iostream>
+#include <limits>
 #include <memory>
 #include <optional>
 #include <random>
@@ -32,66 +33,96 @@ struct Args {
 static Args args;
 
 static void parse_args_and_strip(int& argc, char**& argv) {
-  auto getv = [&](const std::string& key) -> std::optional<std::string> {
-    std::string pref = "--" + key + "=";
-    for (int i = 1; i < argc; ++i) {
-      std::string s = argv[i];
-      if (s.rfind(pref, 0) == 0) {
-        return s.substr(pref.size());
+  auto is_value_token = [&](int candidate_index) -> bool {
+    if (candidate_index >= argc) {
+      return false;
+    }
+    std::string candidate = argv[candidate_index];
+    return !(candidate.size() >= 2 && candidate[0] == '-' &&
+             candidate[1] == '-');
+  };
+
+  auto get_value = [&](const std::string& key) -> std::optional<std::string> {
+    std::string prefix = "--" + key;
+    std::string prefix_with_equals = prefix + "=";
+    for (int argument_index = 1; argument_index < argc; ++argument_index) {
+      std::string argument = argv[argument_index];
+      if (argument.rfind(prefix_with_equals, 0) == 0) {
+        return argument.substr(prefix_with_equals.size());
+      }
+      if (argument == prefix && is_value_token(argument_index + 1)) {
+        return std::string(argv[argument_index + 1]);
       }
     }
     return std::nullopt;
   };
+
   auto strip = [&](const std::string& key) {
-    std::string pref = "--" + key + "=";
-    int w = 0;
-    for (int i = 0; i < argc; ++i) {
-      if (i > 0 && std::string(argv[i]).rfind(pref, 0) == 0) {
+    std::string prefix = "--" + key;
+    std::string prefix_with_equals = prefix + "=";
+    int write_index = 0;
+    for (int argument_index = 0; argument_index < argc; ++argument_index) {
+      std::string argument = argv[argument_index];
+      if (argument_index > 0 && argument.rfind(prefix_with_equals, 0) == 0) {
+        continue;
+      }
+      if (argument_index > 0 && argument == prefix &&
+          is_value_token(argument_index + 1)) {
+        ++argument_index;
         continue;
       }
-      argv[w++] = argv[i];
+      argv[write_index++] = argv[argument_index];
     }
-    argc = w;
+    argc = write_index;
   };
 
-  if (auto v = getv("min-exp")) {
-    args.min_exp = std::stoi(*v), strip("min-exp");
+  if (auto argument_value = get_value("min_exp")) {
+    args.min_exp = std::stoi(*argument_value);
+    strip("min_exp");
   }
-  if (auto v = getv("max-exp")) {
-    args.max_exp = std::stoi(*v), strip("max-exp");
+  if (auto argument_value = get_value("max_exp")) {
+    args.max_exp = std::stoi(*argument_value);
+    strip("max_exp");
   }
-  if (auto v = getv("q")) {
-    args.Q = std::stoull(*v), strip("q");
+  if (auto argument_value = get_value("Q")) {
+    args.Q = std::stoull(*argument_value);
+    strip("Q");
   }
-  if (auto v = getv("p")) {
-    args.p1 = std::stod(*v), strip("p");
+  if (auto argument_value = get_value("p1")) {
+    args.p1 = std::stod(*argument_value);
+    strip("p1");
   }
-  if (auto v = getv("seed")) {
-    args.seed = std::stoull(*v), strip("seed");
+  if (auto argument_value = get_value("seed")) {
+    args.seed = std::stoull(*argument_value);
+    strip("seed");
   }
-  if (auto v = getv("block")) {
-    args.block_bits = std::stoull(*v), strip("block");
+  if (auto argument_value = get_value("block_bits")) {
+    args.block_bits = std::stoull(*argument_value);
+    strip("block_bits");
   }
-  if (auto v = getv("per-octave")) {
-    args.per_octave = std::stoi(*v), strip("per-octave");
+  if (auto argument_value = get_value("per_octave")) {
+    args.per_octave = std::stoi(*argument_value);
+    strip("per_octave");
   }
 
-  if (auto v = getv("sizes")) {
-    std::string s = *v;
-    strip("sizes");
-    std::size_t pos = 0;
-    while (pos < s.size()) {
-      while (pos < s.size() &&
-             (s[pos] == ',' || std::isspace((unsigned char)s[pos]))) {
-        ++pos;
+  if (auto argument_value = get_value("explicit_sizes")) {
+    std::string sizes_text = *argument_value;
+    strip("explicit_sizes");
+    std::size_t position = 0;
+    while (position < sizes_text.size()) {
+      while (position < sizes_text.size() &&
+             (sizes_text[position] == ',' ||
+              std::isspace(static_cast<unsigned char>(sizes_text[position])))) {
+        ++position;
       }
-      std::size_t start = pos;
-      while (pos < s.size() && std::isdigit((unsigned char)s[pos])) {
-        ++pos;
+      std::size_t start_index = position;
+      while (position < sizes_text.size() &&
+             std::isdigit(static_cast<unsigned char>(sizes_text[position]))) {
+        ++position;
       }
-      if (start < pos) {
-        args.explicit_sizes.push_back(
-            std::stoull(s.substr(start, pos - start)));
+      if (start_index < position) {
+        args.explicit_sizes.push_back(std::stoull(
+            sizes_text.substr(start_index, position - start_index)));
       }
     }
     std::sort(args.explicit_sizes.begin(), args.explicit_sizes.end());
@@ -101,14 +132,51 @@ static void parse_args_and_strip(int& argc, char**& argv) {
   }
 }
 
-static std::string make_random_bits(std::size_t N,
+static std::string random_dyck_bits(std::size_t N,
                                     double p1,
                                     std::mt19937_64& rng) {
-  std::uniform_real_distribution<double> U(0.0, 1.0);
+  std::size_t pairs = N >> 1;
+  std::size_t L = pairs << 1;
   std::string s;
-  s.resize(N);
-  for (std::size_t i = 0; i < N; ++i) {
-    s[i] = (U(rng) < p1 ? '1' : '0');
+  s.resize(L);
+  if (L == 0) {
+    return s;
+  }
+  p1 = std::clamp(p1, 0., 1.);
+  std::size_t opens_left = pairs;
+  std::size_t closes_left = pairs;
+  int balance = 0;
+  std::uniform_real_distribution<double> U(0., 1.);
+  for (std::size_t i = 0; i < L; ++i) {
+    if (opens_left == 0) {
+      s[i] = '0';
+      --closes_left;
+      --balance;
+      continue;
+    }
+
+    if (closes_left == 0) {
+      s[i] = '1';
+      --opens_left;
+      ++balance;
+      continue;
+    }
+
+    if (balance == 0) {
+      s[i] = '1';
+      --opens_left;
+      ++balance;
+      continue;
+    }
+    if (U(rng) < p1) {
+      s[i] = '1';
+      --opens_left;
+      ++balance;
+    } else {
+      s[i] = '0';
+      --closes_left;
+      --balance;
+    }
   }
   return s;
 }
@@ -161,6 +229,10 @@ static std::vector<std::size_t> build_size_grid() {
 
 struct Pools {
   std::vector<std::size_t> inds_any;
+  std::vector<std::size_t> rank10_end_positions;
+  std::vector<std::size_t> open_positions_zero_based;
+  std::vector<std::size_t> open_positions_one_based;
+  std::vector<std::size_t> close_positions_one_based;
   std::vector<std::size_t> inds;
   std::vector<std::size_t> inds_1N;
   std::vector<int> deltas;
@@ -199,8 +271,9 @@ static Dataset build_dataset(std::size_t N) {
                       static_cast<std::uint64_t>(N) * 0x9E3779B185EBCA87ull);
 
   Dataset d;
-  d.N = N;
-  d.bits = make_random_bits(N, args.p1, rng);
+  d.bits = random_dyck_bits(N, args.p1, rng);
+  d.N = d.bits.size();
+  N = d.N;
 
   if (args.block_bits == 0) {
     d.t = RmMTree(d.bits, 0, -1.0f);
@@ -208,22 +281,42 @@ static Dataset build_dataset(std::size_t N) {
     d.t = RmMTree(d.bits, args.block_bits);
   }
 
-  d.cnt1 = d.t.rank1(N);
+  d.cnt1 = std::count(d.bits.begin(), d.bits.end(), '1');
   d.cnt0 = N - d.cnt1;
   d.cnt10 = count10(d.bits);
 
-  const std::size_t L = std::min<std::size_t>(args.Q, 32768);
+  std::vector<std::size_t> open_positions_zero_based;
+  std::vector<std::size_t> open_positions_one_based;
+  std::vector<std::size_t> close_positions_one_based;
+  open_positions_zero_based.reserve(d.N >> 1);
+  open_positions_one_based.reserve(d.N >> 1);
+  close_positions_one_based.reserve(d.N >> 1);
+  for (std::size_t i = 0; i < N; ++i) {
+    if (d.bits[i] == '1') {
+      open_positions_zero_based.push_back(i);
+      open_positions_one_based.push_back(i + 1);
+    } else {
+      close_positions_one_based.push_back(i + 1);
+    }
+  }
 
-  std::uniform_int_distribution<std::size_t> ind_dist_incl(0, N ? N : 0);
+  std::size_t sample_count = std::max<std::size_t>(
+      1, std::min<std::size_t>(args.Q, std::size_t(32768)));
+
+  std::uniform_int_distribution<std::size_t> nonedge_index_distribution(
+      N > 1 ? 1 : 0, N > 1 ? (N - 1) : 0);
   std::uniform_int_distribution<std::size_t> ind_dist(0, N ? (N - 1) : 0);
   std::uniform_int_distribution<std::size_t> ind_dist_1N(1, N ? N : 1);
-  std::uniform_int_distribution<int> d_dist(-8, +8);
+  std::uniform_int_distribution<std::size_t> rank10_end_position_distribution(
+      N >= 2 ? 2 : 0, N >= 2 ? N : 0);
+  std::uniform_int_distribution<int> delta_distribution(-8, +8);
 
-  d.pool.inds_any.resize(L);
-  d.pool.inds.resize(L);
-  d.pool.inds_1N.resize(L);
-  d.pool.deltas.resize(L);
-  d.pool.segs.resize(L);
+  d.pool.inds_any.resize(sample_count);
+  d.pool.rank10_end_positions.resize(sample_count);
+  d.pool.inds.resize(sample_count);
+  d.pool.inds_1N.resize(sample_count);
+  d.pool.deltas.resize(sample_count);
+  d.pool.segs.resize(sample_count);
 
   auto rand_ij = [&]() -> std::pair<std::size_t, std::size_t> {
     if (N == 0) {
@@ -236,22 +329,46 @@ static Dataset build_dataset(std::size_t N) {
     return {a, b};
   };
 
-  for (std::size_t i = 0; i < L; ++i) {
-    d.pool.inds_any[i] = ind_dist_incl(rng);
+  for (std::size_t i = 0; i < sample_count; ++i) {
+    d.pool.inds_any[i] = nonedge_index_distribution(rng);
+    d.pool.rank10_end_positions[i] = rank10_end_position_distribution(rng);
     d.pool.inds[i] = (N ? ind_dist(rng) : 0);
     d.pool.inds_1N[i] = (N ? ind_dist_1N(rng) : 0);
-    d.pool.deltas[i] = d_dist(rng);
+    d.pool.deltas[i] = delta_distribution(rng);
     d.pool.segs[i] = rand_ij();
   }
 
+  auto fill_from_candidates = [&](const std::vector<std::size_t>& candidates,
+                                  std::vector<std::size_t>& destination,
+                                  std::size_t fallback_value) {
+    destination.resize(sample_count);
+    if (candidates.empty()) {
+      std::fill(destination.begin(), destination.end(), fallback_value);
+      return;
+    }
+    std::uniform_int_distribution<std::size_t> index_distribution(
+        0, candidates.size() - 1);
+    for (std::size_t sample_index = 0; sample_index < sample_count;
+         ++sample_index) {
+      destination[sample_index] = candidates[index_distribution(rng)];
+    }
+  };
+  const std::size_t one_based_fallback = (N > 0 ? 1 : 0);
+  fill_from_candidates(open_positions_zero_based,
+                       d.pool.open_positions_zero_based, 0);
+  fill_from_candidates(open_positions_one_based,
+                       d.pool.open_positions_one_based, one_based_fallback);
+  fill_from_candidates(close_positions_one_based,
+                       d.pool.close_positions_one_based, one_based_fallback);
+
   auto fill_ks = [&](std::size_t total, std::vector<std::size_t>& out) {
-    out.resize(L);
+    out.resize(sample_count);
     if (total == 0) {
       std::fill(out.begin(), out.end(), 1);
       return;
     }
     std::uniform_int_distribution<std::size_t> dist(1, total);
-    for (std::size_t i = 0; i < L; ++i) {
+    for (std::size_t i = 0; i < sample_count; ++i) {
       out[i] = dist(rng);
     }
   };
@@ -259,8 +376,8 @@ static Dataset build_dataset(std::size_t N) {
   fill_ks(d.cnt0, d.pool.ks0);
   fill_ks(d.cnt10, d.pool.ks10);
 
-  d.pool.minselect_q.resize(L);
-  for (std::size_t i = 0; i < L; ++i) {
+  d.pool.minselect_q.resize(sample_count);
+  for (std::size_t i = 0; i < sample_count; ++i) {
     auto [l, r] = d.pool.segs[i];
     std::size_t c = d.t.mincount(l, r);
     if (c == 0) {
@@ -326,11 +443,14 @@ static void register_all() {
                   benchmark::DoNotOptimize(r);
                 });
 
-    register_op("rank10", data,
-                [&](benchmark::State&, const Dataset& D, std::size_t k) {
-                  auto r = D.t.rank10(P.inds_any[k % P.inds_any.size()]);
-                  benchmark::DoNotOptimize(r);
-                });
+    register_op(
+        "rank10", data,
+        [&](benchmark::State&, const Dataset& dataset,
+            std::size_t sample_index) {
+          benchmark::DoNotOptimize(dataset.t.rank10(
+              dataset.pool.rank10_end_positions
+                  [sample_index % dataset.pool.rank10_end_positions.size()]));
+        });
 
     register_op("select10", data,
                 [&](benchmark::State&, const Dataset& D, std::size_t k) {
@@ -360,16 +480,20 @@ static void register_all() {
 
     register_op("range_min_query_pos", data,
                 [&](benchmark::State&, const Dataset& D, std::size_t k) {
-                  auto [i, j] = P.segs[k % P.segs.size()];
-                  auto r = D.t.range_min_query_pos(i, j);
-                  benchmark::DoNotOptimize(r);
+                  const std::size_t segment_index = k % P.segs.size();
+                  auto [range_begin, range_end] = P.segs[segment_index];
+                  auto min_position =
+                      D.t.range_min_query_pos(range_begin, range_end);
+                  benchmark::DoNotOptimize(min_position);
                 });
 
     register_op("range_min_query_val", data,
                 [&](benchmark::State&, const Dataset& D, std::size_t k) {
-                  auto [i, j] = P.segs[k % P.segs.size()];
-                  auto r = D.t.range_min_query_val(i, j);
-                  benchmark::DoNotOptimize(r);
+                  const std::size_t segment_index = k % P.segs.size();
+                  auto [range_begin, range_end] = P.segs[segment_index];
+                  auto min_value =
+                      D.t.range_min_query_val(range_begin, range_end);
+                  benchmark::DoNotOptimize(min_value);
                 });
 
     register_op("range_max_query_pos", data,
@@ -403,27 +527,39 @@ static void register_all() {
                 });
 
     register_op("close", data,
-                [&](benchmark::State&, const Dataset& D, std::size_t k) {
-                  if (D.N == 0) {
+                [&](benchmark::State&, const Dataset& dataset,
+                    std::size_t sample_index) {
+                  if (dataset.N == 0) {
                     benchmark::DoNotOptimize(0);
                     return;
                   }
-                  auto i = P.inds[k % P.inds.size()];
-                  auto r = D.t.close(i);
-                  benchmark::DoNotOptimize(r);
+                  const std::size_t open_position =
+                      dataset.pool.open_positions_zero_based
+                          [sample_index %
+                           dataset.pool.open_positions_zero_based.size()];
+                  benchmark::DoNotOptimize(dataset.t.close(open_position));
                 });
 
-    register_op("open", data,
-                [&](benchmark::State&, const Dataset& D, std::size_t k) {
-                  auto r = D.t.open(P.inds_1N[k % P.inds_1N.size()]);
-                  benchmark::DoNotOptimize(r);
-                });
-
-    register_op("enclose", data,
-                [&](benchmark::State&, const Dataset& D, std::size_t k) {
-                  auto r = D.t.enclose(P.inds_1N[k % P.inds_1N.size()]);
-                  benchmark::DoNotOptimize(r);
-                });
+    register_op(
+        "open", data,
+        [&](benchmark::State&, const Dataset& dataset,
+            std::size_t sample_index) {
+          const std::size_t close_position_one_based =
+              dataset.pool.close_positions_one_based
+                  [sample_index %
+                   dataset.pool.close_positions_one_based.size()];
+          benchmark::DoNotOptimize(dataset.t.open(close_position_one_based));
+        });
+
+    register_op(
+        "enclose", data,
+        [&](benchmark::State&, const Dataset& dataset,
+            std::size_t sample_index) {
+          const std::size_t open_position_one_based =
+              dataset.pool.open_positions_one_based
+                  [sample_index % dataset.pool.open_positions_one_based.size()];
+          benchmark::DoNotOptimize(dataset.t.enclose(open_position_one_based));
+        });
   }
 }
 
diff --git a/src/bench_rmm_sdsl.cpp b/src/bench_rmm_sdsl.cpp
new file mode 100644
index 0000000..d7e450e
--- /dev/null
+++ b/src/bench_rmm_sdsl.cpp
@@ -0,0 +1,594 @@
+#include <benchmark/benchmark.h>
+
+#include <algorithm>
+#include <cctype>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <random>
+#include <sdsl/bit_vectors.hpp>
+#include <sdsl/bp_support_sada.hpp>
+#include <sdsl/util.hpp>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+using sdsl::bit_vector;
+using sdsl::bp_support_sada;
+using BpSupport = bp_support_sada<>;
+
+struct Args {
+  int min_exp = 14;
+  int max_exp = 22;
+  std::size_t Q = 200000;
+  double p1 = 0.5;
+  std::uint64_t seed = 42;
+  std::size_t block_bits = 64;
+  int per_octave = 10;
+  std::vector<std::size_t> explicit_sizes;
+};
+
+static Args args;
+
+static void parse_args_and_strip(int& argc, char**& argv) {
+  auto is_value_token = [&](int candidate_index) -> bool {
+    if (candidate_index >= argc) {
+      return false;
+    }
+    std::string candidate = argv[candidate_index];
+    return !(candidate.size() >= 2 && candidate[0] == '-' &&
+             candidate[1] == '-');
+  };
+
+  auto get_value = [&](const std::string& key) -> std::optional<std::string> {
+    std::string prefix = "--" + key;
+    std::string prefix_with_equals = prefix + "=";
+    for (int argument_index = 1; argument_index < argc; ++argument_index) {
+      std::string argument = argv[argument_index];
+      if (argument.rfind(prefix_with_equals, 0) == 0) {
+        return argument.substr(prefix_with_equals.size());
+      }
+      if (argument == prefix && is_value_token(argument_index + 1)) {
+        return std::string(argv[argument_index + 1]);
+      }
+    }
+    return std::nullopt;
+  };
+
+  auto strip = [&](const std::string& key) {
+    std::string prefix = "--" + key;
+    std::string prefix_with_equals = prefix + "=";
+    int write_index = 0;
+    for (int argument_index = 0; argument_index < argc; ++argument_index) {
+      std::string argument = argv[argument_index];
+      if (argument_index > 0 && argument.rfind(prefix_with_equals, 0) == 0) {
+        continue;
+      }
+      if (argument_index > 0 && argument == prefix &&
+          is_value_token(argument_index + 1)) {
+        ++argument_index;
+        continue;
+      }
+      argv[write_index++] = argv[argument_index];
+    }
+    argc = write_index;
+  };
+
+  if (auto argument_value = get_value("min_exp")) {
+    args.min_exp = std::stoi(*argument_value);
+    strip("min_exp");
+  }
+  if (auto argument_value = get_value("max_exp")) {
+    args.max_exp = std::stoi(*argument_value);
+    strip("max_exp");
+  }
+  if (auto argument_value = get_value("Q")) {
+    args.Q = std::stoull(*argument_value);
+    strip("Q");
+  }
+  if (auto argument_value = get_value("p1")) {
+    args.p1 = std::stod(*argument_value);
+    strip("p1");
+  }
+  if (auto argument_value = get_value("seed")) {
+    args.seed = std::stoull(*argument_value);
+    strip("seed");
+  }
+  if (auto argument_value = get_value("block_bits")) {
+    args.block_bits = std::stoull(*argument_value);
+    strip("block_bits");
+  }
+  if (auto argument_value = get_value("per_octave")) {
+    args.per_octave = std::stoi(*argument_value);
+    strip("per_octave");
+  }
+
+  if (auto argument_value = get_value("explicit_sizes")) {
+    std::string sizes_text = *argument_value;
+    strip("explicit_sizes");
+    std::size_t position = 0;
+    while (position < sizes_text.size()) {
+      while (position < sizes_text.size() &&
+             (sizes_text[position] == ',' ||
+              std::isspace(static_cast<unsigned char>(sizes_text[position])))) {
+        ++position;
+      }
+      std::size_t start_index = position;
+      while (position < sizes_text.size() &&
+             std::isdigit(static_cast<unsigned char>(sizes_text[position]))) {
+        ++position;
+      }
+      if (start_index < position) {
+        args.explicit_sizes.push_back(std::stoull(
+            sizes_text.substr(start_index, position - start_index)));
+      }
+    }
+    std::sort(args.explicit_sizes.begin(), args.explicit_sizes.end());
+    args.explicit_sizes.erase(
+        std::unique(args.explicit_sizes.begin(), args.explicit_sizes.end()),
+        args.explicit_sizes.end());
+  }
+}
+
+static std::string random_dyck_bits(std::size_t N,
+                                    double p1,
+                                    std::mt19937_64& rng) {
+  std::size_t pairs = N >> 1;
+  std::size_t L = pairs << 1;
+  std::string s;
+  s.resize(L);
+  if (L == 0) {
+    return s;
+  }
+  p1 = std::clamp(p1, 0., 1.);
+  std::size_t opens_left = pairs;
+  std::size_t closes_left = pairs;
+  int balance = 0;
+  std::uniform_real_distribution<double> U(0., 1.);
+  for (std::size_t i = 0; i < L; ++i) {
+    if (opens_left == 0) {
+      s[i] = '0';
+      --closes_left;
+      --balance;
+      continue;
+    }
+
+    if (closes_left == 0) {
+      s[i] = '1';
+      --opens_left;
+      ++balance;
+      continue;
+    }
+
+    if (balance == 0) {
+      s[i] = '1';
+      --opens_left;
+      ++balance;
+      continue;
+    }
+    if (U(rng) < p1) {
+      s[i] = '1';
+      --opens_left;
+      ++balance;
+    } else {
+      s[i] = '0';
+      --closes_left;
+      --balance;
+    }
+  }
+  return s;
+}
+
+static std::vector<std::size_t> build_size_grid() {
+  if (!args.explicit_sizes.empty()) {
+    std::vector<std::size_t> v = args.explicit_sizes;
+    v.erase(std::remove(v.begin(), v.end(), 0), v.end());
+    return v;
+  }
+
+  const int lo = args.min_exp, hi = args.max_exp;
+  if (args.per_octave <= 0) {
+    std::vector<std::size_t> v;
+    for (int e = lo; e <= hi; ++e) {
+      v.push_back(std::size_t(1) << e);
+    }
+    return v;
+  }
+
+  int per_octave_steps = args.per_octave;
+  std::set<std::size_t> N_cands;
+  std::size_t min_N = std::size_t(1) << lo, max_N = std::size_t(1) << hi;
+
+  for (int e = lo; e <= hi; ++e) {
+    for (int t = 0; t <= per_octave_steps; ++t) {
+      long double x = e + (long double)t / (long double)per_octave_steps;
+      std::size_t N = (std::size_t)std::llround(std::pow(2.0L, x));
+      if (N < min_N) {
+        N = min_N;
+      }
+      if (N > max_N) {
+        N = max_N;
+      }
+      if (N) {
+        N_cands.insert(N);
+      }
+    }
+  }
+
+  std::vector<std::size_t> v(N_cands.begin(), N_cands.end());
+  if (v.front() != min_N) {
+    v.insert(v.begin(), min_N);
+  }
+  if (v.back() != max_N) {
+    v.push_back(max_N);
+  }
+  return v;
+}
+
+struct Pools {
+  std::vector<std::size_t> inds_any;
+  std::vector<std::size_t> rank10_end_positions;
+  std::vector<std::size_t> open_positions_zero_based;
+  std::vector<std::size_t> open_positions_one_based;
+  std::vector<std::size_t> close_positions_one_based;
+  std::vector<std::size_t> inds;
+  std::vector<std::size_t> inds_1N;
+  std::vector<int> deltas;
+  std::vector<std::pair<std::size_t, std::size_t>> segs;
+
+  std::vector<std::size_t> ks1, ks0, ks10;
+  std::vector<std::size_t> minselect_q;
+};
+
+struct Dataset {
+  std::size_t N{};
+  std::string bits;
+  bit_vector bv;
+  BpSupport t;
+
+  std::size_t cnt1{}, cnt0{}, cnt10{};
+  Pools pool;
+};
+
+static std::vector<std::shared_ptr<Dataset>> keepalive;
+
+static std::size_t count10(const std::string& s) {
+  std::size_t c = 0;
+  if (s.size() < 2) {
+    return 0;
+  }
+  for (std::size_t i = 0; i + 1 < s.size(); ++i) {
+    if (s[i] == '1' && s[i + 1] == '0') {
+      ++c;
+    }
+  }
+  return c;
+}
+
+static std::shared_ptr<Dataset> build_dataset(std::size_t N) {
+  std::mt19937_64 rng(args.seed ^
+                      static_cast<std::uint64_t>(N) * 0x9E3779B185EBCA87ull);
+
+  auto d = std::make_shared<Dataset>();
+  d->bits = random_dyck_bits(N, args.p1, rng);
+  d->N = d->bits.size();
+  N = d->N;
+
+  d->bv = bit_vector(N);
+  for (std::size_t i = 0; i < N; ++i) {
+    d->bv[i] = (d->bits[i] == '1');
+  }
+  d->t = BpSupport(&d->bv);
+
+  d->cnt1 = std::count(d->bits.begin(), d->bits.end(), '1');
+  d->cnt0 = N - d->cnt1;
+  d->cnt10 = count10(d->bits);
+
+  std::vector<std::size_t> open_positions_zero_based;
+  std::vector<std::size_t> open_positions_one_based;
+  std::vector<std::size_t> close_positions_one_based;
+  open_positions_zero_based.reserve(d->N >> 1);
+  open_positions_one_based.reserve(d->N >> 1);
+  close_positions_one_based.reserve(d->N >> 1);
+  for (std::size_t i = 0; i < N; ++i) {
+    if (d->bits[i] == '1') {
+      open_positions_zero_based.push_back(i);
+      open_positions_one_based.push_back(i + 1);
+    } else {
+      close_positions_one_based.push_back(i + 1);
+    }
+  }
+
+  std::size_t sample_count = std::max<std::size_t>(
+      1, std::min<std::size_t>(args.Q, std::size_t(32768)));
+
+  std::uniform_int_distribution<std::size_t> nonedge_index_distribution(
+      N > 1 ? 1 : 0, N > 1 ? (N - 1) : 0);
+  std::uniform_int_distribution<std::size_t> ind_dist(0, N ? (N - 1) : 0);
+  std::uniform_int_distribution<std::size_t> ind_dist_1N(1, N ? N : 1);
+  std::uniform_int_distribution<std::size_t> rank10_end_position_distribution(
+      N >= 2 ? 2 : 0, N >= 2 ? N : 0);
+  std::uniform_int_distribution<int> delta_distribution(-8, +8);
+
+  d->pool.inds_any.resize(sample_count);
+  d->pool.rank10_end_positions.resize(sample_count);
+  d->pool.inds.resize(sample_count);
+  d->pool.inds_1N.resize(sample_count);
+  d->pool.deltas.resize(sample_count);
+  d->pool.segs.resize(sample_count);
+
+  auto rand_ij = [&]() -> std::pair<std::size_t, std::size_t> {
+    if (N == 0) {
+      return {0, 0};
+    }
+    std::size_t a = ind_dist(rng), b = ind_dist(rng);
+    if (a > b) {
+      std::swap(a, b);
+    }
+    return {a, b};
+  };
+
+  for (std::size_t i = 0; i < sample_count; ++i) {
+    d->pool.inds_any[i] = nonedge_index_distribution(rng);
+    d->pool.rank10_end_positions[i] = rank10_end_position_distribution(rng);
+    d->pool.inds[i] = (N ? ind_dist(rng) : 0);
+    d->pool.inds_1N[i] = (N ? ind_dist_1N(rng) : 0);
+    d->pool.deltas[i] = delta_distribution(rng);
+    d->pool.segs[i] = rand_ij();
+  }
+
+  auto fill_from_candidates = [&](const std::vector<std::size_t>& candidates,
+                                  std::vector<std::size_t>& destination,
+                                  std::size_t fallback_value) {
+    destination.resize(sample_count);
+    if (candidates.empty()) {
+      std::fill(destination.begin(), destination.end(), fallback_value);
+      return;
+    }
+    std::uniform_int_distribution<std::size_t> index_distribution(
+        0, candidates.size() - 1);
+    for (std::size_t sample_index = 0; sample_index < sample_count;
+         ++sample_index) {
+      destination[sample_index] = candidates[index_distribution(rng)];
+    }
+  };
+  const std::size_t one_based_fallback = (N > 0 ? 1 : 0);
+  fill_from_candidates(open_positions_zero_based,
+                       d->pool.open_positions_zero_based, 0);
+  fill_from_candidates(open_positions_one_based,
+                       d->pool.open_positions_one_based, one_based_fallback);
+  fill_from_candidates(close_positions_one_based,
+                       d->pool.close_positions_one_based, one_based_fallback);
+
+  auto fill_ks = [&](std::size_t total, std::vector<std::size_t>& out) {
+    out.resize(sample_count);
+    if (total == 0) {
+      std::fill(out.begin(), out.end(), 1);
+      return;
+    }
+    std::uniform_int_distribution<std::size_t> dist(1, total);
+    for (std::size_t i = 0; i < sample_count; ++i) {
+      out[i] = dist(rng);
+    }
+  };
+  fill_ks(d->cnt1, d->pool.ks1);
+  fill_ks(d->cnt0, d->pool.ks0);
+  fill_ks(d->cnt10, d->pool.ks10);
+
+  d->pool.minselect_q.clear();
+
+  return d;
+}
+
+template <class Fn>
+static void register_op(const std::string& op,
+                        std::shared_ptr<Dataset> data,
+                        Fn&& body) {
+  auto idx_ptr = std::make_shared<std::size_t>(0);
+
+  auto* b = benchmark::RegisterBenchmark(
+      op.c_str(), [data, idx_ptr, body](benchmark::State& state) {
+        const Dataset& D = *data;
+        for (auto _ : state) {
+          std::size_t i = (*idx_ptr)++;
+          body(state, D, i);
+        }
+        state.counters["N"] = static_cast<double>(D.N);
+        state.counters["seed"] = static_cast<double>(args.seed);
+        state.counters["block_bits"] = static_cast<double>(args.block_bits);
+      });
+
+  b->Unit(benchmark::kNanosecond);
+}
+
+static void register_all() {
+  auto Ns = build_size_grid();
+  for (std::size_t N : Ns) {
+    auto data = build_dataset(N);
+    keepalive.push_back(data);
+
+    const auto& P = data->pool;
+
+    register_op("rank1", data,
+                [&](benchmark::State&, const Dataset& D, std::size_t k) {
+                  if (D.N == 0) {
+                    benchmark::DoNotOptimize(0);
+                    return;
+                  }
+                  std::size_t pos = P.inds_any[k % P.inds_any.size()];
+                  std::size_t r = 0;
+                  if (pos > 0) {
+                    r = D.t.rank(pos - 1);
+                  }
+                  benchmark::DoNotOptimize(r);
+                });
+
+    register_op("rank0", data,
+                [&](benchmark::State&, const Dataset& D, std::size_t k) {
+                  if (D.N == 0) {
+                    benchmark::DoNotOptimize(0);
+                    return;
+                  }
+                  std::size_t pos = P.inds_any[k % P.inds_any.size()];
+                  std::size_t r = 0;
+                  if (pos == 0) {
+                    r = 0;
+                  } else if (pos >= D.N) {
+                    r = D.cnt0;
+                  } else {
+                    r = D.t.preceding_closing_parentheses(pos);
+                  }
+                  benchmark::DoNotOptimize(r);
+                });
+
+    register_op("select1", data,
+                [&](benchmark::State&, const Dataset& D, std::size_t k) {
+                  if (D.cnt1 == 0) {
+                    benchmark::DoNotOptimize(0);
+                    return;
+                  }
+                  std::size_t q = P.ks1[k % P.ks1.size()];
+                  auto r = D.t.select(q);
+                  benchmark::DoNotOptimize(r);
+                });
+
+    register_op("excess", data,
+                [&](benchmark::State&, const Dataset& D, std::size_t k) {
+                  if (D.N == 0) {
+                    benchmark::DoNotOptimize(0);
+                    return;
+                  }
+                  std::size_t pos = P.inds_any[k % P.inds_any.size()];
+                  BpSupport::difference_type r = 0;
+                  if (pos > 0) {
+                    r = D.t.excess(pos - 1);
+                  }
+                  benchmark::DoNotOptimize(r);
+                });
+
+    register_op("range_min_query_pos", data,
+                [&](benchmark::State&, const Dataset& D, std::size_t k) {
+                  if (D.N == 0) {
+                    benchmark::DoNotOptimize(0);
+                    return;
+                  }
+                  const std::size_t segment_index = k % P.segs.size();
+                  auto [range_begin, range_end] = P.segs[segment_index];
+                  auto min_position = D.t.rmq(range_begin, range_end);
+                  benchmark::DoNotOptimize(min_position);
+                });
+
+    register_op("range_min_query_val", data,
+                [&](benchmark::State&, const Dataset& D, std::size_t k) {
+                  if (D.N == 0) {
+                    benchmark::DoNotOptimize(0);
+                    return;
+                  }
+                  const std::size_t segment_index = k % P.segs.size();
+                  auto [range_begin, range_end] = P.segs[segment_index];
+                  auto min_position = D.t.rmq(range_begin, range_end);
+                  BpSupport::difference_type min_value = 0;
+                  if (min_position < D.N) {
+                    const auto base_excess =
+                        (range_begin == 0 ? 0 : D.t.excess(range_begin - 1));
+                    min_value = D.t.excess(min_position) - base_excess;
+                  }
+                  benchmark::DoNotOptimize(min_value);
+                });
+
+    register_op("close", data,
+                [&](benchmark::State&, const Dataset& dataset,
+                    std::size_t sample_index) {
+                  if (dataset.N == 0) {
+                    benchmark::DoNotOptimize(0);
+                    return;
+                  }
+                  const std::size_t open_position =
+                      dataset.pool.open_positions_zero_based
+                          [sample_index %
+                           dataset.pool.open_positions_zero_based.size()];
+                  benchmark::DoNotOptimize(dataset.t.find_close(open_position));
+                });
+
+    register_op(
+        "open", data,
+        [&](benchmark::State&, const Dataset& dataset,
+            std::size_t sample_index) {
+          if (dataset.N == 0) {
+            benchmark::DoNotOptimize(0);
+            return;
+          }
+          const std::size_t close_position_one_based =
+              dataset.pool.close_positions_one_based
+                  [sample_index %
+                   dataset.pool.close_positions_one_based.size()];
+          const std::size_t close_position_zero_based =
+              (close_position_one_based > 0 ? close_position_one_based - 1 : 0);
+          benchmark::DoNotOptimize(
+              dataset.t.find_open(close_position_zero_based));
+        });
+
+    register_op(
+        "enclose", data,
+        [&](benchmark::State&, const Dataset& dataset,
+            std::size_t sample_index) {
+          if (dataset.N == 0) {
+            benchmark::DoNotOptimize(0);
+            return;
+          }
+          const std::size_t open_position_one_based =
+              dataset.pool.open_positions_one_based
+                  [sample_index % dataset.pool.open_positions_one_based.size()];
+          const std::size_t open_position_zero_based =
+              (open_position_one_based > 0 ? open_position_one_based - 1 : 0);
+          benchmark::DoNotOptimize(dataset.t.enclose(open_position_zero_based));
+        });
+  }
+}
+
+int main(int argc, char** argv) {
+  benchmark::MaybeReenterWithoutASLR(argc, argv);
+  parse_args_and_strip(argc, argv);
+
+  auto has = [&](const char* key) {
+    std::string p1 = std::string(key) + "=";
+    for (int i = 1; i < argc; ++i) {
+      std::string s = argv[i];
+      if (s == key || s.rfind(p1, 0) == 0) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  static std::vector<std::string> extra;
+  if (!has("--benchmark_out_format")) {
+    extra.emplace_back("--benchmark_out_format=json");
+  }
+  if (!has("--benchmark_counters_tabular")) {
+    extra.emplace_back("--benchmark_counters_tabular=true");
+  }
+  if (!has("--benchmark_time_unit")) {
+    extra.emplace_back("--benchmark_time_unit=ns");
+  }
+
+  static std::vector<char*> argv_vec;
+  argv_vec.assign(argv, argv + argc);
+  for (auto& s : extra) {
+    argv_vec.push_back(s.data());
+  }
+  argv_vec.push_back(nullptr);
+  argc = (int)argv_vec.size() - 1;
+  argv = argv_vec.data();
+
+  benchmark::Initialize(&argc, argv);
+  register_all();
+  benchmark::RunSpecifiedBenchmarks();
+  benchmark::Shutdown();
+  return 0;
+}
diff --git a/src/test_rmm.cpp b/src/test_rmm.cpp
index 9571a91..c0bf86b 100644
--- a/src/test_rmm.cpp
+++ b/src/test_rmm.cpp
@@ -58,47 +58,45 @@ static std::string random_bits(std::mt19937_64& rng, size_t n) {
 }
 
 static std::string random_dyck_bits(std::mt19937_64& rng, size_t m) {
-  std::string s(2 * m, '0');
-  if (m == 0) {
+  std::size_t L = m << 1;
+  std::string s;
+  s.resize(L);
+  if (L == 0) {
     return s;
   }
-  size_t opens_left = m;
-  size_t closes_left = m;
-  int h = 0;
-  std::bernoulli_distribution coin(0.5);
-  for (size_t pos = 0; pos < 2 * m; ++pos) {
+  std::size_t opens_left = m;
+  std::size_t closes_left = m;
+  int balance = 0;
+  std::uniform_real_distribution<double> U(0., 1.);
+  for (std::size_t i = 0; i < L; ++i) {
     if (opens_left == 0) {
-      s[pos] = '0';
+      s[i] = '0';
       --closes_left;
-      --h;
+      --balance;
       continue;
     }
+
     if (closes_left == 0) {
-      s[pos] = '1';
+      s[i] = '1';
       --opens_left;
-      ++h;
+      ++balance;
       continue;
     }
-    if (h == 0) {
-      s[pos] = '1';
+
+    if (balance == 0) {
+      s[i] = '1';
       --opens_left;
-      ++h;
-      continue;
-    }
-    if ((size_t)h == closes_left) {
-      s[pos] = '0';
-      --closes_left;
-      --h;
+      ++balance;
       continue;
     }
-    if (coin(rng)) {
-      s[pos] = '1';
+    if (U(rng) < 0.5) {
+      s[i] = '1';
       --opens_left;
-      ++h;
+      ++balance;
     } else {
-      s[pos] = '0';
+      s[i] = '0';
       --closes_left;
-      --h;
+      --balance;
     }
   }
   return s;