Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,83 @@ if(ENABLE_ADDRESS_SANITIZER)
message(STATUS "AddressSanitizer is ON")
endif()

# User-settable cache entry holding the sdsl-lite installation prefix.
# It defaults to the empty string, in which case only the PATHS roots listed
# below (plus CMake's own default locations) can produce a match.
set(SDSL_ROOT "" CACHE PATH "Installation prefix of sdsl-lite (contains include/ and lib/)")

# Find the directory that contains sdsl/config.hpp and store it in
# SDSL_INCLUDE_DIR. SDSL_ROOT is supplied as a hint; $HOME, /usr and
# /usr/local are supplied as extra search roots. The "include" suffix is
# tried below each candidate root.
find_path(
SDSL_INCLUDE_DIR
NAMES sdsl/config.hpp
HINTS
"${SDSL_ROOT}"
PATH_SUFFIXES
include
PATHS
"$ENV{HOME}"
/usr
/usr/local
)

if(NOT SDSL_INCLUDE_DIR)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that SDSL is intended for comparison only, so it is better to make it easy to exclude: for example, introduce a variable such as THIRD_PARTY_BENCHMARK that is false by default, and enable the SDSL parts only when it is set.

message(FATAL_ERROR "Could not find sdsl/config.hpp. "
"Set SDSL_ROOT or SDSL_INCLUDE_DIR manually.")
endif()

# Find the three libraries that are linked for sdsl-lite: sdsl itself plus
# divsufsort and divsufsort64. Each search uses SDSL_ROOT as a hint, falls back
# to $HOME, /usr and /usr/local, and tries the lib/ and lib64/ suffixes.

# Main sdsl library; result is stored in SDSL_LIBRARY.
find_library(
SDSL_LIBRARY
NAMES sdsl
HINTS
"${SDSL_ROOT}"
PATH_SUFFIXES
lib
lib64
PATHS
"$ENV{HOME}"
/usr
/usr/local
)

# divsufsort library; result is stored in DIVSUFSORT_LIBRARY.
find_library(
DIVSUFSORT_LIBRARY
NAMES divsufsort
HINTS
"${SDSL_ROOT}"
PATH_SUFFIXES
lib
lib64
PATHS
"$ENV{HOME}"
/usr
/usr/local
)

# divsufsort64 library; result is stored in DIVSUFSORT64_LIBRARY.
find_library(
DIVSUFSORT64_LIBRARY
NAMES divsufsort64
HINTS
"${SDSL_ROOT}"
PATH_SUFFIXES
lib
lib64
PATHS
"$ENV{HOME}"
/usr
/usr/local
)

# All three libraries are mandatory here: configuration stops with a fatal
# error as soon as any one of them was not found.
if(NOT SDSL_LIBRARY OR NOT DIVSUFSORT_LIBRARY OR NOT DIVSUFSORT64_LIBRARY)
message(FATAL_ERROR "Could not find sdsl-lite libraries (sdsl, divsufsort, divsufsort64). "
"Set SDSL_ROOT or SDSL_LIBRARY*/DIVSUFSORT* manually.")
endif()

# Wrap the discovered include directory and libraries in an INTERFACE target,
# so that consumers only need to link against `sdsl_lite` to inherit both the
# include path and the three link libraries.
add_library(sdsl_lite INTERFACE)
target_include_directories(sdsl_lite INTERFACE "${SDSL_INCLUDE_DIR}")
target_link_libraries(sdsl_lite INTERFACE
"${SDSL_LIBRARY}"
"${DIVSUFSORT_LIBRARY}"
"${DIVSUFSORT64_LIBRARY}"
)
# Report the resolved locations in the configure log.
message(STATUS "Found sdsl-lite includes in: ${SDSL_INCLUDE_DIR}")
message(STATUS "Found sdsl-lite libs: ${SDSL_LIBRARY}; ${DIVSUFSORT_LIBRARY}; ${DIVSUFSORT64_LIBRARY}")

include(FetchContent)
FetchContent_Declare(
googletest
Expand Down Expand Up @@ -103,6 +180,17 @@ target_link_libraries(test_rmm
gtest
gtest_main)

# Executable built from src/bench_rmm_sdsl.cpp.
add_executable(bench_rmm_sdsl
src/bench_rmm_sdsl.cpp
)
# Make the project's include/ directory available to the benchmark sources.
target_include_directories(bench_rmm_sdsl
PUBLIC include
)
# Link against `benchmark` (presumably the Google Benchmark target declared
# elsewhere in this file; confirm) and the sdsl_lite interface target.
target_link_libraries(bench_rmm_sdsl
PRIVATE
benchmark
sdsl_lite
)

FetchContent_Declare(
doxygen-awesome-css
Expand Down
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,14 @@ Benchmarks are random 50/50 0-1 bitvectors up to $2^{34}$ bits.
./bench_rmm
```

For visualization, write the JSON output to a file using `--benchmark_out=<file>` (e.g. `./bench_rmm --benchmark_out=rmm_bench.json`) and plot it with `misc/plot_rmm.py`.
For comparison with the range min-max tree implementation from [sdsl-lite](https://github.com/simongog/sdsl-lite) (Release build required: `cmake .. -DCMAKE_BUILD_TYPE=Release`):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Although rookies still make the mistake of measuring performance in a debug build, the remark in the brackets is common knowledge. If you want to give novices a hint, it is better to leave a message such as "make sure to build with `-DCMAKE_BUILD_TYPE=Release` to achieve the intended performance", which actually pinpoints what the flag is intended for.


```bash
sudo cpupower frequency-set --governor performance
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a very limited instruction for performance stabilization; cpupower is not shipped by default in Ubuntu, as far as I know. If you really want to give the reader a hint about performance variance, refer to the original page in the Google Benchmark documentation.

./bench_rmm_sdsl --benchmark_out=rmm_bench_sdsl.json
```

For visualization, write the JSON output to a file using `--benchmark_out=<file>` (e.g. `./bench_rmm --benchmark_out=rmm_bench.json`) and plot it with `misc/plot_rmm.py` (add `--sdsl-json rmm_bench_sdsl.json` for comparison).

---

Expand Down
258 changes: 258 additions & 0 deletions REPORT.md

Large diffs are not rendered by default.

111 changes: 4 additions & 107 deletions include/bits.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

#include <array>
#include <bit>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <numeric>

#if defined(__AVX512VPOPCNTDQ__) && defined(__AVX512F__) && \
Expand All @@ -17,7 +19,7 @@
// Lookup table for 4-bit popcount
// This table maps each 4-bit value (0-15) to its population count
// clang-format off
const __m256i lookup_popcount_4 = _mm256_setr_epi8(
static inline const __m256i lookup_popcount_4 = _mm256_setr_epi8(
0, 1, 1, 2, // 0000, 0001, 0010, 0011
1, 2, 2, 3, // 0100, 0101, 0110, 0111
1, 2, 2, 3, // 1000, 1001, 1010, 1011
Expand All @@ -30,7 +32,7 @@ const __m256i lookup_popcount_4 = _mm256_setr_epi8(
2, 3, 3, 4 // 1100, 1101, 1110, 1111
);

const __m256i mask_first_half = _mm256_setr_epi8(
static inline const __m256i mask_first_half = _mm256_setr_epi8(
0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF,
Expand Down Expand Up @@ -405,108 +407,3 @@ void rank_32x8(const uint8_t* x, uint8_t* result) {
}
#endif
}

/**
 * @brief Locates the leftmost element equal to @p target inside the half-open
 * index range [@p begin, @p end_excl) of @p arr.
 * @details When PIXIE_AVX2_SUPPORT is defined, the range is consumed in
 * chunks of 16 int16_t lanes (one 256-bit unaligned load each). Every chunk
 * is compared lane-wise against a broadcast copy of @p target, and the lowest
 * set bit of the resulting byte mask identifies the first matching lane.
 * Whatever is left over (fewer than 16 elements), or the entire range when
 * AVX2 is not available, is checked element by element.
 * @returns Index of the first match, or @p npos when the value is absent.
 */
static inline size_t find_forward_equal_i16_avx2(const int16_t* arr,
                                                 const size_t& begin,
                                                 const size_t& end_excl,
                                                 const int16_t& target,
                                                 const size_t& npos) noexcept {
  size_t pos = begin;
#ifdef PIXIE_AVX2_SUPPORT
  constexpr size_t kLanes = 16;
  const __m256i needle = _mm256_set1_epi16(target);
  while (pos + kLanes <= end_excl) {
    const __m256i chunk =
        _mm256_loadu_si256(reinterpret_cast<const __m256i*>(arr + pos));
    const unsigned hits =
        _mm256_movemask_epi8(_mm256_cmpeq_epi16(chunk, needle));
    if (hits != 0) {
      // Each int16 lane contributes two mask bits, hence the halving.
      return pos + (std::countr_zero(hits) >> 1);
    }
    pos += kLanes;
  }
#endif
  // Scalar scan: the vector tail with AVX2, the whole range without it.
  while (pos < end_excl) {
    if (arr[pos] == target) {
      return pos;
    }
    ++pos;
  }
  return npos;
}

/**
 * @brief Finds the rightmost element equal to @p target within the closed
 * index range [@p begin, @p end_incl] of @p arr.
 * @details An inverted range (@p begin > @p end_incl) yields @p npos at once.
 * When PIXIE_AVX2_SUPPORT is defined and the range holds at least 16
 * elements, the upper part of the range is processed in 16-lane (256-bit)
 * blocks from the top downwards; each block is compared lane-wise against a
 * broadcast copy of @p target and the highest set bit of the byte mask
 * identifies the last matching lane. The remaining low-index elements
 * (fewer than 16), or the whole range when it is shorter than one block or
 * AVX2 is unavailable, are scanned one at a time from high index to low.
 * @returns Index of the last match, or @p npos when the value is absent.
 */
static inline size_t find_backward_equal_i16_avx2(const int16_t* arr,
                                                  const size_t& begin,
                                                  const size_t& end_incl,
                                                  const int16_t& target,
                                                  const size_t& npos) noexcept {
  if (end_incl < begin) {
    return npos;
  }
  // Exclusive upper bound of the part that the scalar scan still has to cover.
  size_t scalar_hi = end_incl + 1;
#ifdef PIXIE_AVX2_SUPPORT
  constexpr size_t kLanes = 16;
  const size_t span = scalar_hi - begin;
  if (span >= kLanes) {
    // Blocks are anchored at the top of the range, so the span % kLanes
    // leftover elements sit at the bottom, below vec_lo.
    const size_t vec_lo = begin + (span % kLanes);
    const __m256i needle = _mm256_set1_epi16(target);
    size_t block = scalar_hi - kLanes;
    while (true) {
      const __m256i chunk =
          _mm256_loadu_si256(reinterpret_cast<const __m256i*>(arr + block));
      const unsigned hits =
          _mm256_movemask_epi8(_mm256_cmpeq_epi16(chunk, needle));
      if (hits != 0) {
        // Two mask bits per int16 lane; the top set bit marks the last match.
        return block + ((31u - std::countl_zero(hits)) >> 1);
      }
      if (block == vec_lo) {
        break;
      }
      block -= kLanes;
    }
    scalar_hi = vec_lo;
  }
#endif
  // Scalar scan from high index to low over whatever was not vectorised.
  while (scalar_hi > begin) {
    --scalar_hi;
    if (arr[scalar_hi] == target) {
      return scalar_hi;
    }
  }
  return npos;
}
Loading
Loading