Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ Thumbs.db
# Test data files
test_data*.bin
*.bin
!tests/data/dbscan_static_data.bin
!tests/data/dbscan_static_truth.bin

# Executables
dbscan_tests
Expand All @@ -43,4 +45,4 @@ dbscan_tests
*.tmp
*.temp
.cache/
AGENTS.md
AGENTS.md
28 changes: 27 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ endif()
add_library(dbscan STATIC
src/dbscan.cpp
src/dbscan_optimized.cpp
src/dbscan_grid2d_l1.cpp
src/dbscan_grid2d_l1_aos.cpp
)

target_include_directories(dbscan
Expand Down Expand Up @@ -64,7 +66,9 @@ FetchContent_MakeAvailable(nanobench)
add_executable(dbscan_tests
tests/test_dbscan.cpp
tests/test_dbscan_optimized.cpp
tests/test_dbscan_grid2d_l1.cpp
tests/test_parallel_for.cpp
tests/test_parallelize.cpp
tests/test_union_find.cpp
)

Expand All @@ -79,6 +83,20 @@ target_include_directories(dbscan_tests
include
)

add_executable(dbscan_dataset_validator
tools/dbscan_dataset_validator.cpp
)

target_link_libraries(dbscan_dataset_validator
PRIVATE
dbscan
)

target_include_directories(dbscan_dataset_validator
PRIVATE
include
)

# Benchmark executable
add_executable(dbscan_benchmark
benchmark/benchmark_dbscan.cpp
Expand All @@ -101,4 +119,12 @@ target_include_directories(dbscan_benchmark

# Enable testing
enable_testing()
add_test(NAME dbscan_tests COMMAND dbscan_tests)
add_test(NAME dbscan_tests COMMAND dbscan_tests)

add_test(
NAME dbscan_dataset_validation
COMMAND ${CMAKE_COMMAND}
-DPROJECT_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}
-DPROJECT_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}
-P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/run_dataset_validator.cmake
)
218 changes: 96 additions & 122 deletions benchmark/benchmark_dbscan.cpp
Original file line number Diff line number Diff line change
@@ -1,145 +1,119 @@
#include "dbscan.h"
#include "dbscan_optimized.h"
#include <chrono>
#include <cstdlib>
#include <ctime>
#include "dbscan_grid2d_l1.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <memory>
#include <limits>
#include <nanobench.h>
#include <random>
#include <string>
#include <vector>

// Generate clustered 2D data for benchmarking.
//
// Produces `n_clusters` clusters of roughly n_points/n_clusters points each,
// centered along the diagonal at (5c, 5c) with offsets in [-1, 1), plus
// ~n_points/10 uniform noise points around (50, 50) with offsets in [-10, 10).
// The returned size may differ slightly from `n_points` due to the integer
// division per cluster and the added noise points.
//
// Uses <random> instead of the legacy rand()/RAND_MAX: rand() relies on hidden
// global state, is not thread-safe, and makes no distribution-quality
// guarantees. The new `seed` parameter defaults to a nondeterministic value so
// existing callers keep their previous "fresh data every run" behavior, while
// benchmarks that want reproducibility can pass a fixed seed.
std::vector<dbscan::Point<double>> generate_benchmark_data(size_t n_points, int n_clusters = 8,
                                                           unsigned int seed = std::random_device{}()) {
    std::mt19937 rng(seed);
    std::uniform_real_distribution<double> cluster_offset(-1.0, 1.0); // matches old (rand/RAND_MAX - 0.5) * 2.0
    std::uniform_real_distribution<double> noise_offset(-10.0, 10.0); // matches old (rand/RAND_MAX - 0.5) * 20.0

    std::vector<dbscan::Point<double>> points;
    points.reserve(n_points + n_points / 10); // clusters + noise, single allocation

    // Clusters along the diagonal.
    for (int c = 0; c < n_clusters; ++c) {
        const double center_x = c * 5.0;
        const double center_y = c * 5.0;
        const size_t points_per_cluster = n_points / n_clusters;

        for (size_t i = 0; i < points_per_cluster; ++i) {
            points.push_back({center_x + cluster_offset(rng), center_y + cluster_offset(rng)});
        }
    }

    // Uniform noise placed far from the cluster centers.
    const size_t noise_points = n_points / 10;
    for (size_t i = 0; i < noise_points; ++i) {
        points.push_back({50.0 + noise_offset(rng), 50.0 + noise_offset(rng)});
    }

    return points;
}

int main() {
// Seed random number generator
srand(static_cast<unsigned int>(time(nullptr)));

ankerl::nanobench::Bench bench;
namespace {

// Benchmark different data sizes
std::vector<size_t> data_sizes = {1000, 10000, 50000, 100000};
struct Uint32Dataset {
std::vector<uint32_t> x;
std::vector<uint32_t> y;
};

for (size_t n_points : data_sizes) {
std::cout << "\n=== Benchmarking with " << n_points << " points ===" << std::endl;
Uint32Dataset generate_uint32_dataset(std::size_t cluster_count, std::size_t points_per_cluster,
std::size_t noise_points, uint32_t area_width, uint32_t cluster_sigma,
std::mt19937 &rng) {
std::uniform_real_distribution<double> uniform_dist(0.0, static_cast<double>(area_width));
std::normal_distribution<double> normal_dist(0.0, static_cast<double>(cluster_sigma));

// Generate test data
auto points = generate_benchmark_data(n_points);
Uint32Dataset dataset;
dataset.x.reserve(cluster_count * points_per_cluster + noise_points);
dataset.y.reserve(cluster_count * points_per_cluster + noise_points);

// Benchmark original DBSCAN
bench.title("Original DBSCAN").run("Original DBSCAN " + std::to_string(n_points) + " points", [&]() {
dbscan::DBSCAN<double> dbscan(0.8, 5);
auto result = dbscan.cluster(points);
ankerl::nanobench::doNotOptimizeAway(result);
});
for (std::size_t c = 0; c < cluster_count; ++c) {
const double center_x = uniform_dist(rng);
const double center_y = uniform_dist(rng);

// Benchmark optimized DBSCAN
bench.title("Optimized DBSCAN").run("Optimized DBSCAN " + std::to_string(n_points) + " points", [&]() {
dbscan::DBSCANOptimized<double> dbscan(0.8, 5);
auto result = dbscan.cluster(points);
ankerl::nanobench::doNotOptimizeAway(result);
});
for (std::size_t i = 0; i < points_per_cluster; ++i) {
const double sample_x = center_x + normal_dist(rng);
const double sample_y = center_y + normal_dist(rng);

// Memory usage comparison
{
dbscan::DBSCAN<double> original_dbscan(0.8, 5);
auto original_result = original_dbscan.cluster(points);
const uint32_t clamped_x =
static_cast<uint32_t>(std::min(static_cast<double>(area_width - 1), std::max(0.0, std::round(sample_x))));
const uint32_t clamped_y =
static_cast<uint32_t>(std::min(static_cast<double>(area_width - 1), std::max(0.0, std::round(sample_y))));

dbscan::DBSCANOptimized<double> optimized_dbscan(0.8, 5);
auto optimized_result = optimized_dbscan.cluster(points);

std::cout << "Original DBSCAN found " << original_result.num_clusters << " clusters" << std::endl;
std::cout << "Optimized DBSCAN found " << optimized_result.num_clusters << " clusters" << std::endl;
dataset.x.push_back(clamped_x);
dataset.y.push_back(clamped_y);
}
}

// Performance comparison with different parameters
std::cout << "\n=== Parameter Sensitivity Benchmark ===" << std::endl;

auto test_points = generate_benchmark_data(10000);

// Different eps values
std::vector<double> eps_values = {0.3, 0.5, 0.8, 1.2};

for (double eps : eps_values) {
bench.title("EPS Parameter").run("Optimized DBSCAN eps=" + std::to_string(eps), [&]() {
dbscan::DBSCANOptimized<double> dbscan(eps, 5);
auto result = dbscan.cluster(test_points);
ankerl::nanobench::doNotOptimizeAway(result);
});
std::uniform_int_distribution<uint32_t> uniform_int(0, area_width - 1);
for (std::size_t i = 0; i < noise_points; ++i) {
dataset.x.push_back(uniform_int(rng));
dataset.y.push_back(uniform_int(rng));
}

// Different min_pts values
std::vector<int> min_pts_values = {3, 5, 10, 15};
return dataset;
}

} // namespace

for (int min_pts : min_pts_values) {
bench.title("MinPts Parameter").run("Optimized DBSCAN min_pts=" + std::to_string(min_pts), [&]() {
dbscan::DBSCANOptimized<double> dbscan(0.8, min_pts);
auto result = dbscan.cluster(test_points);
ankerl::nanobench::doNotOptimizeAway(result);
});
}
int main() {
constexpr uint32_t area_width = 1'000'000;
constexpr uint32_t cluster_sigma = 50; // Approximately 3 sigma ~ 150 px footprint
constexpr uint32_t eps = 60;
constexpr uint32_t min_samples = 16;

// Detailed performance analysis
std::cout << "\n=== Detailed Performance Analysis ===" << std::endl;

auto large_dataset = generate_benchmark_data(50000);

// Time both implementations on larger dataset
{
std::cout << "Running performance comparison on 50k points..." << std::endl;

// Original DBSCAN timing
auto start_time = std::chrono::high_resolution_clock::now();
dbscan::DBSCAN<double> original_dbscan(0.8, 5);
auto original_result = original_dbscan.cluster(large_dataset);
auto end_time = std::chrono::high_resolution_clock::now();
auto original_duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);

// Optimized DBSCAN timing
start_time = std::chrono::high_resolution_clock::now();
dbscan::DBSCANOptimized<double> optimized_dbscan(0.8, 5);
auto optimized_result = optimized_dbscan.cluster(large_dataset);
end_time = std::chrono::high_resolution_clock::now();
auto optimized_duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);

std::cout << "Original DBSCAN: " << original_duration.count() << "ms, " << original_result.num_clusters
<< " clusters" << std::endl;
std::cout << "Optimized DBSCAN: " << optimized_duration.count() << "ms, " << optimized_result.num_clusters
<< " clusters" << std::endl;

if (original_duration.count() > 0) {
double speedup = static_cast<double>(original_duration.count()) / optimized_duration.count();
std::cout << "Speedup: " << speedup << "x" << std::endl;
std::mt19937 rng(1337u);
ankerl::nanobench::Bench bench;
bench.title("DBSCANGrid2D_L1");
bench.relative(true);
bench.warmup(2);
bench.minEpochIterations(10);
bench.unit("pt");

struct Scenario {
std::size_t clusters;
std::size_t points_per_cluster;
};

const std::vector<Scenario> scenarios = {
{64, 256}, // ~16K cluster points + 32K noise => ~48K total
{128, 256}, // ~32K cluster points + 64K noise => ~96K total
{256, 256}, // ~65K cluster points + 131K noise => ~196K total
{512, 256}, // ~131K cluster points + 262K noise => ~393K total
{640, 256}, // ~163K cluster points + 327K noise => ~490K total
};

std::cout << "Benchmarking DBSCANGrid2D_L1 with Manhattan distance" << std::endl;
std::cout << "eps=" << eps << ", min_samples=" << min_samples << std::endl;
std::cout << "Thread sweep: 0 (auto), 1, 2, 4, 8" << std::endl;

for (const auto &scenario : scenarios) {
const std::size_t cluster_points = scenario.clusters * scenario.points_per_cluster;
const std::size_t noise_points = cluster_points * 2; // 2x noise compared to clustered points

auto dataset = generate_uint32_dataset(scenario.clusters, scenario.points_per_cluster, noise_points, area_width,
cluster_sigma, rng);

const std::size_t total_points = dataset.x.size();
std::cout << "\nScenario: " << scenario.clusters << " clusters, " << scenario.points_per_cluster
<< " points/cluster, total points=" << total_points << std::endl;

bench.batch(static_cast<double>(total_points));
bench.context("points", std::to_string(total_points));

const std::vector<std::size_t> thread_counts = {0, 1, 2, 4, 8};
for (std::size_t thread_count : thread_counts) {
const std::string label = "grid-l1 " + std::to_string(total_points) + " pts threads=" +
(thread_count == 0 ? std::string("auto") : std::to_string(thread_count));
bench.run(label, [&]() {
dbscan::DBSCANGrid2DL1Params params{eps, min_samples};
params.num_threads = thread_count;
auto result =
dbscan::dbscan_grid2d_l1(dataset.x.data(), 1, dataset.y.data(), 1, total_points, params);
ankerl::nanobench::doNotOptimizeAway(result.labels);
});
}
}

return 0;
}
}
31 changes: 31 additions & 0 deletions cmake/run_dataset_validator.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Validation harness executed via `cmake -P` by the dbscan_dataset_validation
# CTest entry. Expects PROJECT_SOURCE_DIR and PROJECT_BINARY_DIR to be passed
# on the command line (-D...). Runs dbscan_dataset_validator against the
# committed static dataset / truth labels and fails (FATAL_ERROR -> non-zero
# exit) on any missing file or label mismatch.

set(DATA_FILE "${PROJECT_SOURCE_DIR}/tests/data/dbscan_static_data.bin")
set(TRUTH_FILE "${PROJECT_SOURCE_DIR}/tests/data/dbscan_static_truth.bin")

# Locate the validator binary. CMAKE_EXECUTABLE_SUFFIX is not defined in
# script (-P) mode, so probe for the Windows ".exe" suffix explicitly.
# NOTE(review): multi-config generators (Visual Studio, Xcode) place binaries
# in a per-config subdirectory of PROJECT_BINARY_DIR — TODO confirm this test
# only runs with single-config generators, or pass the binary path in via -D.
set(VALIDATOR "${PROJECT_BINARY_DIR}/dbscan_dataset_validator")
if(NOT EXISTS "${VALIDATOR}" AND EXISTS "${VALIDATOR}.exe")
  set(VALIDATOR "${VALIDATOR}.exe")
endif()

if(NOT EXISTS "${DATA_FILE}")
  message(FATAL_ERROR "Static dataset not found: ${DATA_FILE}")
endif()

if(NOT EXISTS "${TRUTH_FILE}")
  message(FATAL_ERROR "Truth labels file not found: ${TRUTH_FILE}")
endif()

if(NOT EXISTS "${VALIDATOR}")
  message(FATAL_ERROR "dbscan_dataset_validator executable not found: ${VALIDATOR}")
endif()

# eps/min-samples are fixed here because the truth labels were generated with
# these exact parameters; changing them invalidates the comparison.
execute_process(
  COMMAND "${VALIDATOR}"
          "--data" "${DATA_FILE}"
          "--truth" "${TRUTH_FILE}"
          "--eps" "10"
          "--min-samples" "3"
          "--impl" "both"
  WORKING_DIRECTORY "${PROJECT_BINARY_DIR}"
  RESULT_VARIABLE VALIDATOR_RESULT
  COMMAND_ECHO STDOUT
)

if(NOT VALIDATOR_RESULT EQUAL 0)
  message(FATAL_ERROR "dbscan_dataset_validator reported a mismatch (exit code ${VALIDATOR_RESULT})")
endif()
Loading
Loading