diff --git a/.gitignore b/.gitignore index 42167f6..5adba1c 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,8 @@ Thumbs.db # Test data files test_data*.bin *.bin +!tests/data/dbscan_static_data.bin +!tests/data/dbscan_static_truth.bin # Executables dbscan_tests @@ -43,4 +45,4 @@ dbscan_tests *.tmp *.temp .cache/ -AGENTS.md \ No newline at end of file +AGENTS.md diff --git a/CMakeLists.txt b/CMakeLists.txt index c88060e..0ed6314 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,8 @@ endif() add_library(dbscan STATIC src/dbscan.cpp src/dbscan_optimized.cpp + src/dbscan_grid2d_l1.cpp + src/dbscan_grid2d_l1_aos.cpp ) target_include_directories(dbscan @@ -64,7 +66,9 @@ FetchContent_MakeAvailable(nanobench) add_executable(dbscan_tests tests/test_dbscan.cpp tests/test_dbscan_optimized.cpp + tests/test_dbscan_grid2d_l1.cpp tests/test_parallel_for.cpp + tests/test_parallelize.cpp tests/test_union_find.cpp ) @@ -79,6 +83,20 @@ target_include_directories(dbscan_tests include ) +add_executable(dbscan_dataset_validator + tools/dbscan_dataset_validator.cpp +) + +target_link_libraries(dbscan_dataset_validator + PRIVATE + dbscan +) + +target_include_directories(dbscan_dataset_validator + PRIVATE + include +) + # Benchmark executable add_executable(dbscan_benchmark benchmark/benchmark_dbscan.cpp @@ -101,4 +119,12 @@ target_include_directories(dbscan_benchmark # Enable testing enable_testing() -add_test(NAME dbscan_tests COMMAND dbscan_tests) \ No newline at end of file +add_test(NAME dbscan_tests COMMAND dbscan_tests) + +add_test( + NAME dbscan_dataset_validation + COMMAND ${CMAKE_COMMAND} + -DPROJECT_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR} + -DPROJECT_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR} + -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/run_dataset_validator.cmake +) diff --git a/benchmark/benchmark_dbscan.cpp b/benchmark/benchmark_dbscan.cpp index 7fb065c..55b0584 100644 --- a/benchmark/benchmark_dbscan.cpp +++ b/benchmark/benchmark_dbscan.cpp @@ -1,145 +1,119 @@ -#include "dbscan.h" -#include "dbscan_optimized.h" -#include -#include -#include +#include "dbscan_grid2d_l1.h" + +#include +#include +#include #include -#include +#include #include +#include #include #include -// Generate clustered 2D data for benchmarking -std::vector> generate_benchmark_data(size_t n_points, int n_clusters = 8) { - std::vector> points; - points.reserve(n_points); - - // Create clusters - for (int c = 0; c < n_clusters; ++c) { - double center_x = c * 5.0; - double center_y = c * 5.0; - size_t points_per_cluster = n_points / n_clusters; - - for (size_t i = 0; i < points_per_cluster; ++i) { - double x = center_x + (static_cast(rand()) / RAND_MAX - 0.5) * 2.0; - double y = center_y + (static_cast(rand()) / RAND_MAX - 0.5) * 2.0; - points.push_back({x, y}); - } - } - - // Add some noise points - size_t noise_points = n_points / 10; - for (size_t i = 0; i < noise_points; ++i) { - double x = 50.0 + (static_cast(rand()) / RAND_MAX - 0.5) * 20.0; - double y = 50.0 + (static_cast(rand()) / RAND_MAX - 0.5) * 20.0; - points.push_back({x, y}); - } - - return points; -} - -int main() { - // Seed random number generator - srand(static_cast(time(nullptr))); - - ankerl::nanobench::Bench bench; +namespace { - // Benchmark different data sizes - std::vector data_sizes = {1000, 10000, 50000, 100000}; +struct Uint32Dataset { + std::vector x; + std::vector y; +}; - for (size_t n_points : data_sizes) { - std::cout << "\n=== Benchmarking with " << n_points << " points ===" << std::endl; +Uint32Dataset generate_uint32_dataset(std::size_t cluster_count, std::size_t points_per_cluster, + std::size_t noise_points, uint32_t area_width, uint32_t cluster_sigma, + std::mt19937 &rng) { + std::uniform_real_distribution uniform_dist(0.0, static_cast(area_width)); + std::normal_distribution normal_dist(0.0, static_cast(cluster_sigma)); - // Generate test data - auto points = generate_benchmark_data(n_points); + Uint32Dataset dataset; + dataset.x.reserve(cluster_count * points_per_cluster + noise_points); + dataset.y.reserve(cluster_count * points_per_cluster + noise_points); - // Benchmark original DBSCAN - bench.title("Original DBSCAN").run("Original DBSCAN " + std::to_string(n_points) + " points", [&]() { - dbscan::DBSCAN dbscan(0.8, 5); - auto result = dbscan.cluster(points); - ankerl::nanobench::doNotOptimizeAway(result); - }); + for (std::size_t c = 0; c < cluster_count; ++c) { + const double center_x = uniform_dist(rng); + const double center_y = uniform_dist(rng); - // Benchmark optimized DBSCAN - bench.title("Optimized DBSCAN").run("Optimized DBSCAN " + std::to_string(n_points) + " points", [&]() { - dbscan::DBSCANOptimized dbscan(0.8, 5); - auto result = dbscan.cluster(points); - ankerl::nanobench::doNotOptimizeAway(result); - }); + for (std::size_t i = 0; i < points_per_cluster; ++i) { + const double sample_x = center_x + normal_dist(rng); + const double sample_y = center_y + normal_dist(rng); - // Memory usage comparison - { - dbscan::DBSCAN original_dbscan(0.8, 5); - auto original_result = original_dbscan.cluster(points); + const uint32_t clamped_x = + static_cast(std::min(static_cast(area_width - 1), std::max(0.0, std::round(sample_x)))); + const uint32_t clamped_y = + static_cast(std::min(static_cast(area_width - 1), std::max(0.0, std::round(sample_y)))); - dbscan::DBSCANOptimized optimized_dbscan(0.8, 5); - auto optimized_result = optimized_dbscan.cluster(points); - - std::cout << "Original DBSCAN found " << original_result.num_clusters << " clusters" << std::endl; - std::cout << "Optimized DBSCAN found " << optimized_result.num_clusters << " clusters" << std::endl; + dataset.x.push_back(clamped_x); + dataset.y.push_back(clamped_y); } } - // Performance comparison with different parameters - std::cout << "\n=== Parameter Sensitivity Benchmark ===" << std::endl; - - auto test_points = generate_benchmark_data(10000); - - // Different eps values - std::vector eps_values = {0.3, 0.5, 0.8, 1.2}; - - for (double eps : eps_values) { - bench.title("EPS Parameter").run("Optimized DBSCAN eps=" + std::to_string(eps), [&]() { - dbscan::DBSCANOptimized dbscan(eps, 5); - auto result = dbscan.cluster(test_points); - ankerl::nanobench::doNotOptimizeAway(result); - }); + std::uniform_int_distribution uniform_int(0, area_width - 1); + for (std::size_t i = 0; i < noise_points; ++i) { + dataset.x.push_back(uniform_int(rng)); + dataset.y.push_back(uniform_int(rng)); } - // Different min_pts values - std::vector min_pts_values = {3, 5, 10, 15}; + return dataset; +} + +} // namespace - for (int min_pts : min_pts_values) { - bench.title("MinPts Parameter").run("Optimized DBSCAN min_pts=" + std::to_string(min_pts), [&]() { - dbscan::DBSCANOptimized dbscan(0.8, min_pts); - auto result = dbscan.cluster(test_points); - ankerl::nanobench::doNotOptimizeAway(result); - }); - } +int main() { + constexpr uint32_t area_width = 1'000'000; + constexpr uint32_t cluster_sigma = 50; // Approximately 3 sigma ~ 150 px footprint + constexpr uint32_t eps = 60; + constexpr uint32_t min_samples = 16; - // Detailed performance analysis - std::cout << "\n=== Detailed Performance Analysis ===" << std::endl; - - auto large_dataset = generate_benchmark_data(50000); - - // Time both implementations on larger dataset - { - std::cout << "Running performance comparison on 50k points..." << std::endl; - - // Original DBSCAN timing - auto start_time = std::chrono::high_resolution_clock::now(); - dbscan::DBSCAN original_dbscan(0.8, 5); - auto original_result = original_dbscan.cluster(large_dataset); - auto end_time = std::chrono::high_resolution_clock::now(); - auto original_duration = std::chrono::duration_cast(end_time - start_time); - - // Optimized DBSCAN timing - start_time = std::chrono::high_resolution_clock::now(); - dbscan::DBSCANOptimized optimized_dbscan(0.8, 5); - auto optimized_result = optimized_dbscan.cluster(large_dataset); - end_time = std::chrono::high_resolution_clock::now(); - auto optimized_duration = std::chrono::duration_cast(end_time - start_time); - - std::cout << "Original DBSCAN: " << original_duration.count() << "ms, " << original_result.num_clusters - << " clusters" << std::endl; - std::cout << "Optimized DBSCAN: " << optimized_duration.count() << "ms, " << optimized_result.num_clusters - << " clusters" << std::endl; - - if (original_duration.count() > 0) { - double speedup = static_cast(original_duration.count()) / optimized_duration.count(); - std::cout << "Speedup: " << speedup << "x" << std::endl; + std::mt19937 rng(1337u); + ankerl::nanobench::Bench bench; + bench.title("DBSCANGrid2D_L1"); + bench.relative(true); + bench.warmup(2); + bench.minEpochIterations(10); + bench.unit("pt"); + + struct Scenario { + std::size_t clusters; + std::size_t points_per_cluster; + }; + + const std::vector scenarios = { + {64, 256}, // ~16K cluster points + 32K noise => ~48K total + {128, 256}, // ~32K cluster points + 64K noise => ~96K total + {256, 256}, // ~65K cluster points + 131K noise => ~196K total + {512, 256}, // ~131K cluster points + 262K noise => ~393K total + {640, 256}, // ~163K cluster points + 327K noise => ~490K total + }; + + std::cout << "Benchmarking DBSCANGrid2D_L1 with Manhattan distance" << std::endl; + std::cout << "eps=" << eps << ", min_samples=" << min_samples << std::endl; + std::cout << "Thread sweep: 0 (auto), 1, 2, 4, 8" << std::endl; + + for (const auto &scenario : scenarios) { + const std::size_t cluster_points = scenario.clusters * scenario.points_per_cluster; + const std::size_t noise_points = cluster_points * 2; // 2x noise compared to clustered points + + auto dataset = generate_uint32_dataset(scenario.clusters, scenario.points_per_cluster, noise_points, area_width, + cluster_sigma, rng); + + const std::size_t total_points = dataset.x.size(); + std::cout << "\nScenario: " << scenario.clusters << " clusters, " << scenario.points_per_cluster + << " points/cluster, total points=" << total_points << std::endl; + + bench.batch(static_cast(total_points)); + bench.context("points", std::to_string(total_points)); + + const std::vector thread_counts = {0, 1, 2, 4, 8}; + for (std::size_t thread_count : thread_counts) { + const std::string label = "grid-l1 " + std::to_string(total_points) + " pts threads=" + + (thread_count == 0 ? std::string("auto") : std::to_string(thread_count)); + bench.run(label, [&]() { + dbscan::DBSCANGrid2DL1Params params{eps, min_samples}; + params.num_threads = thread_count; + auto result = + dbscan::dbscan_grid2d_l1(dataset.x.data(), 1, dataset.y.data(), 1, total_points, params); + ankerl::nanobench::doNotOptimizeAway(result.labels); + }); } } return 0; -} \ No newline at end of file +} diff --git a/cmake/run_dataset_validator.cmake b/cmake/run_dataset_validator.cmake new file mode 100644 index 0000000..2790383 --- /dev/null +++ b/cmake/run_dataset_validator.cmake @@ -0,0 +1,31 @@ +set(DATA_FILE "${PROJECT_SOURCE_DIR}/tests/data/dbscan_static_data.bin") +set(TRUTH_FILE "${PROJECT_SOURCE_DIR}/tests/data/dbscan_static_truth.bin") +set(VALIDATOR "${PROJECT_BINARY_DIR}/dbscan_dataset_validator") + +if(NOT EXISTS "${DATA_FILE}") + message(FATAL_ERROR "Static dataset not found: ${DATA_FILE}") +endif() + +if(NOT EXISTS "${TRUTH_FILE}") + message(FATAL_ERROR "Truth labels file not found: ${TRUTH_FILE}") +endif() + +if(NOT EXISTS "${VALIDATOR}") + message(FATAL_ERROR "dbscan_dataset_validator executable not found: ${VALIDATOR}") +endif() + +execute_process( + COMMAND "${VALIDATOR}" + "--data" "${DATA_FILE}" + "--truth" "${TRUTH_FILE}" + "--eps" "10" + "--min-samples" "3" + "--impl" "both" + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}" + RESULT_VARIABLE VALIDATOR_RESULT + COMMAND_ECHO STDOUT +) + +if(NOT VALIDATOR_RESULT EQUAL 0) + message(FATAL_ERROR "dbscan_dataset_validator reported a mismatch (exit code ${VALIDATOR_RESULT})") +endif() diff --git a/generate_test_data.py b/generate_test_data.py deleted file mode 100644 index a299dfa..0000000 --- a/generate_test_data.py +++ /dev/null @@ -1,144 +0,0 @@ -#!/usr/bin/env python3 - -import numpy as np -import struct -import sys - -def generate_test_data(n_samples=300, n_centers=4, output_file='test_data.bin'): - """ - Generate test data with known clusters and save to binary file. - Creates simple clusters manually without sklearn. - """ - return generate_test_data_with_params(n_samples, n_centers, 0.8, 5, output_file) - -def load_test_data(input_file='test_data.bin'): - """ - Load test data from binary file. - Returns: points (N, 2), labels - """ - with open(input_file, 'rb') as f: - # Read number of points - n_points = struct.unpack('I', f.read(4))[0] - - points = [] - labels = [] - - # Read points - for _ in range(n_points): - x, y = struct.unpack('dd', f.read(16)) - points.append([x, y]) - - # Read labels - for _ in range(n_points): - label = struct.unpack('i', f.read(4))[0] - labels.append(label) - - return np.array(points), np.array(labels) - -def generate_multiple_test_cases(): - """Generate test cases with different data sizes and parameters""" - test_cases = [ - # (n_samples, n_centers, eps, min_samples, output_file) - (500, 3, 0.8, 5, 'test_data_500.bin'), - (10000, 5, 0.8, 5, 'test_data_10k.bin'), - (100000, 8, 0.8, 5, 'test_data_100k.bin'), # Using 100k instead of 1M for practicality - # Different eps values - (1000, 4, 0.5, 5, 'test_data_eps_0_5.bin'), - (1000, 4, 1.2, 5, 'test_data_eps_1_2.bin'), - # Different min_samples values - (1000, 4, 0.8, 3, 'test_data_minpts_3.bin'), - (1000, 4, 0.8, 10, 'test_data_minpts_10.bin'), - ] - - for n_samples, n_centers, eps, min_samples, output_file in test_cases: - print(f"Generating {n_samples} points with {n_centers} centers (eps={eps}, min_samples={min_samples})...") - generate_test_data_with_params(n_samples, n_centers, eps, min_samples, output_file) - print(f"Saved to {output_file}\n") - -def generate_test_data_with_params(n_samples=300, n_centers=4, eps=0.8, min_samples=5, output_file='test_data.bin'): - """ - Generate test data with specific DBSCAN parameters - """ - np.random.seed(42) - - # Generate clustered data manually - X = [] - true_labels = [] - - for center_idx in range(n_centers): - # Create cluster centers - center_x = np.random.uniform(-5, 5) - center_y = np.random.uniform(-5, 5) - - # Generate points around each center - cluster_size = n_samples // n_centers - for _ in range(cluster_size): - x = center_x + np.random.normal(0, 0.6) - y = center_y + np.random.normal(0, 0.6) - X.append([x, y]) - true_labels.append(center_idx) - - # Add some noise points - noise_points = n_samples // 10 - for _ in range(noise_points): - x = np.random.uniform(-10, 10) - y = np.random.uniform(-10, 10) - X.append([x, y]) - true_labels.append(-1) # Noise - - X = np.array(X) - true_labels = np.array(true_labels) - - # Simple DBSCAN-like clustering for ground truth - labels = np.full(len(X), -1) - cluster_id = 0 - - for i in range(len(X)): - if labels[i] != -1: - continue - - # Find neighbors within eps - neighbors = [] - for j in range(len(X)): - if i == j: - continue - dist = np.sqrt((X[i][0] - X[j][0])**2 + (X[i][1] - X[j][1])**2) - if dist <= eps: - neighbors.append(j) - - if len(neighbors) >= min_samples: # min_samples - labels[i] = cluster_id - # Expand cluster (simplified) - for neighbor in neighbors: - if labels[neighbor] == -1: - labels[neighbor] = cluster_id - cluster_id += 1 - else: - labels[i] = -1 - - # Save data to binary file - with open(output_file, 'wb') as f: - # Write number of points - f.write(struct.pack('I', len(X))) - - # Write points as doubles (x, y) - for point in X: - f.write(struct.pack('dd', point[0], point[1])) - - # Write computed labels for validation - for label in labels: - f.write(struct.pack('i', label)) - - print(f"Generated {len(X)} points with {n_centers} clusters") - print(f"Simple clustering found {len(set(labels)) - (1 if -1 in labels else 0)} clusters") - - return X, labels - -if __name__ == '__main__': - if len(sys.argv) > 1 and sys.argv[1] == '--multiple': - generate_multiple_test_cases() - elif len(sys.argv) > 1: - output_file = sys.argv[1] - generate_test_data(output_file=output_file) - else: - generate_test_data() \ No newline at end of file diff --git a/include/dbscan_grid2d_l1.h b/include/dbscan_grid2d_l1.h new file mode 100644 index 0000000..c3e91c1 --- /dev/null +++ b/include/dbscan_grid2d_l1.h @@ -0,0 +1,43 @@ +#pragma once + +#include +#include +#include + +#include "perf_timer.h" + +namespace dbscan { + +enum class GridExpansionMode { + Sequential, + FrontierParallel, + UnionFind +}; + +struct DBSCANGrid2DL1Params { + uint32_t eps; + uint32_t min_samples; + std::size_t num_threads = 0; + std::size_t chunk_size = 0; +}; + +struct Grid2DPoint { + uint32_t x; + uint32_t y; +}; + +struct DBSCANGrid2DL1Result { + std::vector labels; + PerfTiming perf_timing; +}; + +[[nodiscard]] DBSCANGrid2DL1Result dbscan_grid2d_l1(const uint32_t *x, std::size_t x_stride, const uint32_t *y, + std::size_t y_stride, std::size_t count, + const DBSCANGrid2DL1Params ¶ms, + GridExpansionMode mode = GridExpansionMode::Sequential); + +[[nodiscard]] DBSCANGrid2DL1Result dbscan_grid2d_l1_aos(const Grid2DPoint *points, std::size_t count, + const DBSCANGrid2DL1Params ¶ms, + GridExpansionMode mode = GridExpansionMode::Sequential); + +} // namespace dbscan diff --git a/include/parallel.hpp b/include/parallel.hpp index dc07709..5487241 100644 --- a/include/parallel.hpp +++ b/include/parallel.hpp @@ -33,4 +33,38 @@ inline void parallel_for(size_t begin, size_t end, size_t n_threads, const std:: th.join(); } -} // namespace utils \ No newline at end of file +inline void parallelize(size_t begin, size_t end, size_t num_threads, size_t chunk_size, + std::function &&chunk_processor) { + if (begin >= end) + return; + + if (num_threads == 0) + num_threads = std::thread::hardware_concurrency(); + if (num_threads == 0) + num_threads = 1; + + if (chunk_size == 0) + chunk_size = std::max(1, (end - begin + num_threads - 1) / num_threads); + + std::atomic next_begin{begin}; + std::vector workers; + workers.reserve(num_threads); + + for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) { + workers.emplace_back([&, chunk_size]() { + while (true) { + size_t start = next_begin.fetch_add(chunk_size, std::memory_order_relaxed); + if (start >= end) + break; + + size_t stop = std::min(end, start + chunk_size); + chunk_processor(start, stop); + } + }); + } + + for (auto &worker : workers) + worker.join(); +} + +} // namespace utils diff --git a/include/perf_timer.h b/include/perf_timer.h new file mode 100644 index 0000000..7bc5843 --- /dev/null +++ b/include/perf_timer.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace dbscan { + +struct PerfTimingEntry { + std::string label; + double duration_ms; +}; + +struct PerfTiming { + void clear() { entries_.clear(); } + + void add(std::string label, double duration_ms) { + entries_.push_back({std::move(label), duration_ms}); + } + + [[nodiscard]] const std::vector &entries() const { return entries_; } + +private: + std::vector entries_; + + friend class ScopedTimer; +}; + +class ScopedTimer { +public: + ScopedTimer(std::string label, PerfTiming &sink) + : sink_(sink), label_(std::move(label)), start_(std::chrono::steady_clock::now()) {} + + ScopedTimer(const ScopedTimer &) = delete; + ScopedTimer &operator=(const ScopedTimer &) = delete; + + ~ScopedTimer() { + const auto end = std::chrono::steady_clock::now(); + const auto elapsed = std::chrono::duration_cast>(end - start_); + sink_.add(std::move(label_), elapsed.count()); + } + +private: + PerfTiming &sink_; + std::string label_; + std::chrono::steady_clock::time_point start_; +}; + +inline std::ostream &operator<<(std::ostream &out, const PerfTimingEntry &entry) { + out << entry.label << ": " << entry.duration_ms << " ms"; + return out; +} + +} // namespace dbscan diff --git a/scripts/generate_test_data.py b/scripts/generate_test_data.py new file mode 100755 index 0000000..fbb362c --- /dev/null +++ b/scripts/generate_test_data.py @@ -0,0 +1,180 @@ +#!/usr/bin/env -S uv run +# /// script +# dependencies = ["numpy", "scikit-learn"] +# /// + +import argparse +from pathlib import Path + +import numpy as np +from sklearn.cluster import DBSCAN + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Generate synthetic DBSCAN test data with embedded Gaussian clusters." + ) + parser.add_argument( + "--uniform-count", + type=int, + default=200_000, + help="Number of background points drawn from a uniform distribution (default: 200_000).", + ) + parser.add_argument( + "--cluster-count", + type=int, + default=100, + help="Number of Gaussian clusters to sprinkle into the dataset (default: 100).", + ) + parser.add_argument( + "--points-per-cluster", + type=int, + default=256, + help="Number of points sampled for each Gaussian cluster (default: 256).", + ) + parser.add_argument( + "--area-width", + type=int, + default=1_000_000, + help="Width/height of the square area measured in pixels (default: 1_000_000).", + ) + parser.add_argument( + "--cluster-sigma", + type=float, + default=50.0 / 3.0, + help="Standard deviation for the Gaussian clusters in pixels (default: 50/3).", + ) + parser.add_argument( + "--eps", + type=float, + default=60.0, + help="DBSCAN epsilon radius in pixels for scikit-learn clustering (default: 60).", + ) + parser.add_argument( + "--metric", + type=str, + default="l2", + help="Distance metric: 'l2' (Euclidean) or 'l1' (Manhattan).", + ) + parser.add_argument( + "--min-samples", + type=int, + default=16, + help="DBSCAN min_samples parameter for scikit-learn clustering (default: 16).", + ) + parser.add_argument( + "--seed", + type=int, + default=42, + help="Seed for the random number generator (default: 42).", + ) + parser.add_argument( + "--data-file", + type=Path, + default=Path("data.bin"), + help="Path to the output coordinate binary file (default: data.bin).", + ) + parser.add_argument( + "--truth-file", + type=Path, + default=Path("truth.bin"), + help="Path to the output truth label binary file (default: truth.bin).", + ) + return parser.parse_args() + + +def generate_uniform_points(rng: np.random.Generator, count: int, width: int) -> np.ndarray: + if count <= 0: + return np.empty((0, 2), dtype=np.uint32) + + coords = rng.uniform(0, width, size=(count, 2)) + coords = np.rint(coords, casting="unsafe") # round to nearest integer pixel + coords = np.clip(coords, 0, width - 1) + return coords.astype(np.uint32) + + +def generate_gaussian_clusters( + rng: np.random.Generator, + cluster_count: int, + points_per_cluster: int, + width: int, + sigma: float, +) -> np.ndarray: + if cluster_count <= 0 or points_per_cluster <= 0: + return np.empty((0, 2), dtype=np.uint32) + + centers = rng.uniform(0, width, size=(cluster_count, 2)) + clusters = [] + + for center in centers: + samples = rng.normal(loc=center, scale=sigma, size=(points_per_cluster, 2)) + samples = np.rint(samples, casting="unsafe") + samples = np.clip(samples, 0, width - 1) + clusters.append(samples.astype(np.uint32)) + + if not clusters: + return np.empty((0, 2), dtype=np.uint32) + + return np.vstack(clusters) + + +def write_data_file(path: Path, coords_yx: np.ndarray) -> None: + coords_yx.astype(np.uint32).tofile(path) + + +def write_truth_file(path: Path, labels: np.ndarray) -> None: + labels.astype(np.int32).tofile(path) + + +def main() -> None: + args = parse_args() + rng = np.random.default_rng(args.seed) + + uniform_coords = generate_uniform_points(rng, args.uniform_count, args.area_width) + cluster_coords = generate_gaussian_clusters( + rng, + cluster_count=args.cluster_count, + points_per_cluster=args.points_per_cluster, + width=args.area_width, + sigma=args.cluster_sigma, + ) + + if uniform_coords.size == 0 and cluster_coords.size == 0: + raise ValueError("No points generated; adjust the generator parameters.") + + # Concatenate and shuffle to avoid ordering artifacts. + all_coords_yx = np.vstack([uniform_coords, cluster_coords]) + rng.shuffle(all_coords_yx, axis=0) + + # Prepare float coordinates for clustering (x, y order for scikit-learn). + coords_xy_float = all_coords_yx[:, [1, 0]].astype(np.float64) + + metric = args.metric.lower() + if metric == "l1": + sklearn_metric = "cityblock" + elif metric == "l2": + sklearn_metric = "euclidean" + else: + raise ValueError("Unsupported metric. Choose 'l1' or 'l2'.") + + dbscan = DBSCAN(eps=args.eps, min_samples=args.min_samples, metric=sklearn_metric, n_jobs=-1) + labels = dbscan.fit_predict(coords_xy_float) + + # Persist data in requested layout: (y, x) pairs as uint32. + write_data_file(args.data_file, all_coords_yx) + write_truth_file(args.truth_file, labels) + + unique_labels, counts = np.unique(labels, return_counts=True) + cluster_labels = unique_labels[unique_labels != -1] + noise_count = counts[unique_labels == -1].sum() if -1 in unique_labels else 0 + + print(f"Generated {all_coords_yx.shape[0]} total points.") + print(f"Uniform points: {uniform_coords.shape[0]}, clustered points: {cluster_coords.shape[0]}.") + print( + f"DBSCAN ({metric.upper()}) discovered {cluster_labels.size} clusters and {noise_count} noise points." + ) + print(f"Data written to {args.data_file.resolve()} and labels to {args.truth_file.resolve()}.") + + +if __name__ == "__main__": + main() diff --git a/src/dbscan_grid2d_l1.cpp b/src/dbscan_grid2d_l1.cpp new file mode 100644 index 0000000..8e3cdc6 --- /dev/null +++ b/src/dbscan_grid2d_l1.cpp @@ -0,0 +1,23 @@ +#include "dbscan_grid2d_l1.h" + +#include + +#include "dbscan_grid2d_l1_impl.hpp" + +namespace dbscan { + +DBSCANGrid2DL1Result dbscan_grid2d_l1(const uint32_t *x, std::size_t x_stride, const uint32_t *y, std::size_t y_stride, + std::size_t count, const DBSCANGrid2DL1Params ¶ms, GridExpansionMode mode) { + if (params.eps == 0) + throw std::invalid_argument("eps must be greater than zero for dbscan_grid2d_l1"); + if (params.min_samples == 0) + throw std::invalid_argument("min_samples must be greater than zero for dbscan_grid2d_l1"); + if (count != 0 && (x == nullptr || y == nullptr)) + throw std::invalid_argument("Input coordinate arrays must be non-null when count > 0"); + if (count != 0 && (x_stride == 0 || y_stride == 0)) + throw std::invalid_argument("Stride must be positive when count > 0"); + + return detail::dbscan_grid2d_l1_impl(x, x_stride, y, y_stride, count, params, mode); +} + +} // namespace dbscan diff --git a/src/dbscan_grid2d_l1_aos.cpp b/src/dbscan_grid2d_l1_aos.cpp new file mode 100644 index 0000000..15cc900 --- /dev/null +++ b/src/dbscan_grid2d_l1_aos.cpp @@ -0,0 +1,25 @@ +#include "dbscan_grid2d_l1.h" + +#include +#include + +namespace dbscan { + +DBSCANGrid2DL1Result dbscan_grid2d_l1_aos(const Grid2DPoint *points, std::size_t count, + const DBSCANGrid2DL1Params ¶ms, GridExpansionMode mode) { + if (params.eps == 0) + throw std::invalid_argument("eps must be greater than zero for dbscan_grid2d_l1_aos"); + if (params.min_samples == 0) + throw std::invalid_argument("min_samples must be greater than zero for dbscan_grid2d_l1_aos"); + if (count != 0 && points == nullptr) + throw std::invalid_argument("Input point array must be non-null when count > 0"); + + static_assert(sizeof(Grid2DPoint) == sizeof(uint32_t) * 2, "Grid2DPoint is expected to be tightly packed"); + const std::size_t stride = sizeof(Grid2DPoint) / sizeof(uint32_t); + + const uint32_t *x = count == 0 ? nullptr : &points[0].x; + const uint32_t *y = count == 0 ? nullptr : &points[0].y; + return dbscan_grid2d_l1(x, stride, y, stride, count, params, mode); +} + +} // namespace dbscan diff --git a/src/dbscan_grid2d_l1_impl.hpp b/src/dbscan_grid2d_l1_impl.hpp new file mode 100644 index 0000000..afa8722 --- /dev/null +++ b/src/dbscan_grid2d_l1_impl.hpp @@ -0,0 +1,459 @@ +#pragma once + +#include "dbscan_grid2d_l1.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "parallel.hpp" +#include "perf_timer.h" + +namespace dbscan::detail { + +constexpr uint64_t pack_cell(uint32_t ix, uint32_t iy) noexcept { + return (static_cast(ix) << 32U) | static_cast(iy); +} + +constexpr uint32_t cell_of(uint32_t value, uint32_t cell_size) noexcept { + return cell_size == 0 ? value : static_cast(value / cell_size); +} + +inline uint32_t load_coord(const uint32_t *ptr, std::size_t stride, std::size_t index) noexcept { + return *(ptr + index * stride); +} + +template +void for_each_neighbor(uint32_t point_index, uint32_t eps, const uint32_t *x, std::size_t x_stride, + const uint32_t *y, std::size_t y_stride, const std::vector &cell_x, + const std::vector &cell_y, const std::vector &ordered_indices, + const std::vector &cell_offsets, const std::vector &unique_keys, + Fn &&fn) { + + const uint32_t base_cx = cell_x[point_index]; + const uint32_t base_cy = cell_y[point_index]; + const uint32_t x_a = load_coord(x, x_stride, point_index); + const uint32_t y_a = load_coord(y, y_stride, point_index); + + for (int dx = -1; dx <= 1; ++dx) { + const int64_t nx = static_cast(base_cx) + dx; + if (nx < 0) + continue; + + for (int dy = -1; dy <= 1; ++dy) { + const int64_t ny = static_cast(base_cy) + dy; + if (ny < 0) + continue; + + const uint64_t key = pack_cell(static_cast(nx), static_cast(ny)); + const auto it = std::lower_bound(unique_keys.begin(), unique_keys.end(), key); + if (it == unique_keys.end() || *it != key) + continue; + + const std::size_t cell_idx = static_cast(std::distance(unique_keys.begin(), it)); + const std::size_t begin = cell_offsets[cell_idx]; + const std::size_t end = cell_offsets[cell_idx + 1]; + + for (std::size_t pos = begin; pos < end; ++pos) { + const uint32_t neighbor_idx = ordered_indices[pos]; + + const uint32_t x_b = load_coord(x, x_stride, neighbor_idx); + const uint32_t y_b = load_coord(y, y_stride, neighbor_idx); + + const uint32_t dx_abs = x_a > x_b ? x_a - x_b : x_b - x_a; + const uint32_t dy_abs = y_a > y_b ? y_a - y_b : y_b - y_a; + const uint64_t manhattan = static_cast(dx_abs) + static_cast(dy_abs); + + if (manhattan <= static_cast(eps)) { + if (!fn(neighbor_idx)) + return; + } + } + } + } +} + +struct ExpansionContext { + const uint32_t *x; + std::size_t x_stride; + const uint32_t *y; + std::size_t y_stride; + std::size_t count; + uint32_t eps; + uint32_t min_samples; + const std::vector &cell_x; + const std::vector &cell_y; + const std::vector &ordered_indices; + const std::vector &cell_offsets; + const std::vector &unique_keys; + const std::vector &is_core; + std::size_t num_threads; + std::size_t chunk_size; +}; + +void sequential_expand(const ExpansionContext &ctx, std::vector &labels) { + std::vector stack; + stack.reserve(ctx.count); + std::vector neighbor_buffer; + neighbor_buffer.reserve(64); + + int32_t next_label = 0; + for (std::size_t i = 0; i < ctx.count; ++i) { + if (!ctx.is_core[i] || labels[i] != -1) + continue; + + labels[i] = next_label; + stack.clear(); + stack.push_back(static_cast(i)); + + while (!stack.empty()) { + const uint32_t current = stack.back(); + stack.pop_back(); + + neighbor_buffer.clear(); + for_each_neighbor(current, ctx.eps, ctx.x, ctx.x_stride, ctx.y, ctx.y_stride, ctx.cell_x, ctx.cell_y, + ctx.ordered_indices, ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) { + neighbor_buffer.push_back(neighbor); + return true; + }); + + for (uint32_t neighbor : neighbor_buffer) { + if (labels[neighbor] == -1) { + labels[neighbor] = next_label; + if (ctx.is_core[neighbor]) + stack.push_back(neighbor); + } + } + } + + ++next_label; + } +} + +void frontier_expand(const ExpansionContext &ctx, std::vector &labels) { + std::vector> shared_labels(ctx.count); + for (std::size_t i = 0; i < ctx.count; ++i) + shared_labels[i].store(labels[i], std::memory_order_relaxed); + + int32_t next_label = 0; + std::vector frontier; + frontier.reserve(256); + + const std::size_t frontier_chunk = ctx.chunk_size == 0 ? 64 : ctx.chunk_size; + + for (std::size_t seed = 0; seed < ctx.count; ++seed) { + if (!ctx.is_core[seed] || shared_labels[seed].load(std::memory_order_acquire) != -1) + continue; + + const int32_t label = next_label++; + shared_labels[seed].store(label, std::memory_order_release); + frontier.clear(); + frontier.push_back(static_cast(seed)); + + while (!frontier.empty()) { + std::vector next_frontier; + std::mutex next_mutex; + + utils::parallelize(0, frontier.size(), ctx.num_threads, frontier_chunk, [&](std::size_t begin, std::size_t end) { + std::vector local_next; + local_next.reserve(32); + std::vector neighbor_buffer; + neighbor_buffer.reserve(64); + + for (std::size_t idx = begin; idx < end; ++idx) { + const uint32_t current = frontier[idx]; + + neighbor_buffer.clear(); + for_each_neighbor(current, ctx.eps, ctx.x, ctx.x_stride, ctx.y, ctx.y_stride, ctx.cell_x, ctx.cell_y, + ctx.ordered_indices, ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) { + neighbor_buffer.push_back(neighbor); + return true; + }); + + for (uint32_t neighbor : neighbor_buffer) { + if (ctx.is_core[neighbor]) { + int32_t expected = -1; + if (shared_labels[neighbor].compare_exchange_strong(expected, label, std::memory_order_acq_rel)) + local_next.push_back(neighbor); + } else { + int32_t expected = -1; + shared_labels[neighbor].compare_exchange_strong(expected, label, std::memory_order_acq_rel); + } + } + } + + if (!local_next.empty()) { + std::sort(local_next.begin(), local_next.end()); + local_next.erase(std::unique(local_next.begin(), local_next.end()), local_next.end()); + std::lock_guard lock(next_mutex); + next_frontier.insert(next_frontier.end(), local_next.begin(), local_next.end()); + } + }); + + if (next_frontier.empty()) + break; + + std::sort(next_frontier.begin(), next_frontier.end()); + next_frontier.erase(std::unique(next_frontier.begin(), next_frontier.end()), next_frontier.end()); + frontier.swap(next_frontier); + } + } + + for (std::size_t i = 0; i < ctx.count; ++i) + labels[i] = shared_labels[i].load(std::memory_order_relaxed); +} + +void union_find_expand(const ExpansionContext &ctx, std::vector &labels) { + constexpr uint32_t invalid = std::numeric_limits::max(); + std::vector> parents(ctx.count); + for (std::size_t i = 0; i < ctx.count; ++i) { + if (ctx.is_core[i]) + parents[i].store(static_cast(i), std::memory_order_relaxed); + else + parents[i].store(invalid, std::memory_order_relaxed); + } + + auto find_root = [&](uint32_t node) { + uint32_t parent = parents[node].load(std::memory_order_acquire); + if (parent == invalid) + return invalid; + + while (true) { + uint32_t grandparent = parents[parent].load(std::memory_order_acquire); + if (grandparent == parent) { + if (parent != node) + parents[node].store(parent, std::memory_order_release); + return parent; + } + parents[node].compare_exchange_strong(parent, grandparent, std::memory_order_acq_rel); + node = parent; + parent = parents[node].load(std::memory_order_acquire); + if (parent == invalid) + return invalid; + } + }; + + auto unite = [&](uint32_t a, uint32_t b) { + while (true) { + a = find_root(a); + b = find_root(b); + if (a == invalid || b == invalid || a == b) + return; + + if (a < b) { + uint32_t expected = b; + if (parents[b].compare_exchange_strong(expected, a, std::memory_order_acq_rel)) + return; + } else { + uint32_t expected = a; + if (parents[a].compare_exchange_strong(expected, b, std::memory_order_acq_rel)) + return; + } + } + }; + + const std::size_t union_chunk = ctx.chunk_size == 0 ? 512 : ctx.chunk_size; + utils::parallelize(0, ctx.count, ctx.num_threads, union_chunk, [&](std::size_t begin, std::size_t end) { + for (std::size_t idx = begin; idx < end; ++idx) { + if (!ctx.is_core[idx]) + continue; + + for_each_neighbor(static_cast(idx), ctx.eps, ctx.x, ctx.x_stride, ctx.y, ctx.y_stride, ctx.cell_x, + ctx.cell_y, ctx.ordered_indices, ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) { + if (ctx.is_core[neighbor]) + unite(static_cast(idx), neighbor); + return true; + }); + } + }); + + std::vector root_for_point(ctx.count, invalid); + for (std::size_t i = 0; i < ctx.count; ++i) { + if (ctx.is_core[i]) + root_for_point[i] = find_root(static_cast(i)); + } + + std::vector component_min(ctx.count, invalid); + for (std::size_t i = 0; i < ctx.count; ++i) { + if (!ctx.is_core[i]) + continue; + const uint32_t root = root_for_point[i]; + if (root == invalid) + continue; + if (component_min[root] > i) + component_min[root] = static_cast(i); + } + + std::vector> components; + components.reserve(ctx.count); + for (std::size_t i = 0; i < ctx.count; ++i) { + if (component_min[i] != invalid) + components.emplace_back(component_min[i], static_cast(i)); + } + + std::sort(components.begin(), components.end()); + + std::vector root_label(ctx.count, -1); + int32_t next_label = 0; + for (const auto &[min_index, root] : components) { + (void)min_index; + root_label[root] = next_label++; + } + + for (std::size_t i = 0; i < ctx.count; ++i) { + if (!ctx.is_core[i]) + continue; + const uint32_t root = root_for_point[i]; + if (root == invalid) + continue; + labels[i] = root_label[root]; + } + + for (std::size_t i = 0; i < ctx.count; ++i) { + if (ctx.is_core[i]) + continue; + + int32_t best_label = -1; + for_each_neighbor(static_cast(i), ctx.eps, ctx.x, ctx.x_stride, ctx.y, ctx.y_stride, ctx.cell_x, + ctx.cell_y, ctx.ordered_indices, ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) { + if (!ctx.is_core[neighbor]) + return true; + const int32_t candidate = labels[neighbor]; + if (candidate != -1 && (best_label == -1 || candidate < best_label)) + best_label = candidate; + return true; + }); + labels[i] = best_label; + } +} + +inline DBSCANGrid2DL1Result dbscan_grid2d_l1_impl(const uint32_t *x, std::size_t x_stride, const uint32_t *y, + std::size_t y_stride, std::size_t count, + const DBSCANGrid2DL1Params ¶ms, + GridExpansionMode expansion_mode) { + DBSCANGrid2DL1Result result; + result.perf_timing.clear(); + if (count == 0) + return result; + + ScopedTimer total_timer("total", result.perf_timing); + + const uint32_t cell_size = params.eps; + + std::vector cell_x(count); + std::vector cell_y(count); + std::vector keys(count); + std::vector ordered_indices(count); + std::iota(ordered_indices.begin(), ordered_indices.end(), 0U); + + const std::size_t index_chunk = params.chunk_size == 0 ? 1024 : params.chunk_size; + { + ScopedTimer timer("precompute_cells", result.perf_timing); + utils::parallelize(0, count, params.num_threads, index_chunk, [&](std::size_t begin, std::size_t end) { + for (std::size_t i = begin; i < end; ++i) { + const uint32_t cx = cell_of(load_coord(x, x_stride, i), cell_size); + const uint32_t cy = cell_of(load_coord(y, y_stride, i), cell_size); + cell_x[i] = cx; + cell_y[i] = cy; + keys[i] = pack_cell(cx, cy); + } + }); + } + + { + ScopedTimer timer("sort_indices", result.perf_timing); + std::sort(ordered_indices.begin(), ordered_indices.end(), [&](uint32_t lhs, uint32_t rhs) { + const uint64_t key_lhs = keys[lhs]; + const uint64_t key_rhs = keys[rhs]; + if (key_lhs == key_rhs) + return lhs < rhs; + return key_lhs < key_rhs; + }); + } + + std::vector cell_offsets; + cell_offsets.reserve(count + 1); + std::vector unique_keys; + unique_keys.reserve(count); + + { + ScopedTimer timer("build_cell_offsets", result.perf_timing); + std::size_t pos = 0; + while (pos < count) { + const uint64_t key = keys[ordered_indices[pos]]; + unique_keys.push_back(key); + cell_offsets.push_back(pos); + + do { + ++pos; + } while (pos < count && keys[ordered_indices[pos]] == key); + } + cell_offsets.push_back(count); + } + + result.labels.assign(count, -1); + std::vector is_core(count, 0U); + + const uint32_t eps_value = params.eps; + const uint32_t min_samples_value = params.min_samples; + + const std::size_t core_chunk = params.chunk_size == 0 ? 512 : params.chunk_size; + { + ScopedTimer timer("core_detection", result.perf_timing); + utils::parallelize(0, count, params.num_threads, core_chunk, [&](std::size_t begin, std::size_t end) { + for (std::size_t idx = begin; idx < end; ++idx) { + uint32_t neighbor_count = 0; + for_each_neighbor(static_cast(idx), eps_value, x, x_stride, y, y_stride, cell_x, cell_y, + ordered_indices, cell_offsets, unique_keys, [&](uint32_t) { + ++neighbor_count; + return neighbor_count < min_samples_value; + }); + + if (neighbor_count >= min_samples_value) + is_core[idx] = 1U; + } + }); + } + + const ExpansionContext context{x, + x_stride, + y, + y_stride, + count, + eps_value, + min_samples_value, + cell_x, + cell_y, + ordered_indices, + cell_offsets, + unique_keys, + is_core, + params.num_threads, + params.chunk_size}; + + { + ScopedTimer timer("cluster_expansion", result.perf_timing); + switch (expansion_mode) { + case GridExpansionMode::Sequential: + sequential_expand(context, result.labels); + break; + case GridExpansionMode::FrontierParallel: + frontier_expand(context, result.labels); + break; + case GridExpansionMode::UnionFind: + union_find_expand(context, result.labels); + break; + } + } + + return result; +} + +} // namespace dbscan::detail + diff --git a/tests/data/dbscan_static_data.bin b/tests/data/dbscan_static_data.bin new file mode 100644 index 0000000..0fe4113 Binary files /dev/null and b/tests/data/dbscan_static_data.bin differ diff --git a/tests/data/dbscan_static_truth.bin b/tests/data/dbscan_static_truth.bin new file mode 100644 index 0000000..64aa8ea Binary files /dev/null and b/tests/data/dbscan_static_truth.bin differ diff --git a/tests/test_dbscan_grid2d_l1.cpp b/tests/test_dbscan_grid2d_l1.cpp new file mode 100644 index 0000000..034240f --- /dev/null +++ b/tests/test_dbscan_grid2d_l1.cpp @@ -0,0 +1,144 @@ +#include "dbscan_grid2d_l1.h" + +#include + +#include +#include +#include +#include + +namespace { + +std::filesystem::path fixture_root() { + const std::filesystem::path candidates[] = {std::filesystem::path{"tests"} / "data", + std::filesystem::path{".."} / "tests" / "data", + std::filesystem::path{".."} / ".." / "tests" / "data"}; + for (const auto &candidate : candidates) { + if (std::filesystem::exists(candidate)) + return candidate; + } + return candidates[0]; +} + +// The binary fixtures store Y first then X to mirror the grid compaction used in production; this loader +// preserves that ordering so the test exercises identical memory access patterns as the runtime path. +void load_fixture(const std::filesystem::path &data_path, const std::filesystem::path &truth_path, + std::vector &x_out, std::vector &y_out, std::vector &truth_out) { + std::ifstream data_stream(data_path, std::ios::binary); + REQUIRE(data_stream.good()); + + data_stream.seekg(0, std::ios::end); + const auto data_size = data_stream.tellg(); + data_stream.seekg(0, std::ios::beg); + + REQUIRE(data_size % (sizeof(uint32_t) * 2) == 0); + const std::size_t point_count = static_cast(data_size) / (sizeof(uint32_t) * 2); + + x_out.assign(point_count, 0U); + y_out.assign(point_count, 0U); + + for (std::size_t i = 0; i < point_count; ++i) { + uint32_t y = 0; + uint32_t x = 0; + data_stream.read(reinterpret_cast(&y), sizeof(uint32_t)); + data_stream.read(reinterpret_cast(&x), sizeof(uint32_t)); + REQUIRE(data_stream.good()); + y_out[i] = y; + x_out[i] = x; + } + + std::ifstream truth_stream(truth_path, std::ios::binary); + REQUIRE(truth_stream.good()); + + truth_stream.seekg(0, std::ios::end); + const auto truth_size = truth_stream.tellg(); + truth_stream.seekg(0, std::ios::beg); + + REQUIRE(truth_size % sizeof(int32_t) == 0); + const std::size_t truth_count = static_cast(truth_size) / sizeof(int32_t); + REQUIRE(truth_count == point_count); + + truth_out.assign(truth_count, 0); + truth_stream.read(reinterpret_cast(truth_out.data()), + static_cast(truth_count * sizeof(int32_t))); + REQUIRE(truth_stream.good()); +} + +std::vector make_aos(const std::vector &x, const std::vector &y) { + std::vector points(x.size()); + for (std::size_t i = 0; i < points.size(); ++i) + points[i] = dbscan::Grid2DPoint{x[i], y[i]}; + return points; +} + +} // namespace + +TEST_CASE("DBSCANGrid2D_L1 clusters dense neighbors", "[dbscan][grid_l1]") { + const std::vector x = {0, 1, 2, 100}; + const std::vector y = {0, 0, 1, 200}; + + const dbscan::DBSCANGrid2DL1Params params{4, 3}; + const auto soa_result = dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params); + const auto &labels = soa_result.labels; + + REQUIRE(labels.size() == x.size()); + REQUIRE(labels[0] == labels[1]); + REQUIRE(labels[1] == labels[2]); + REQUIRE(labels[0] != -1); + REQUIRE(labels[3] == -1); + + const auto aos_points = make_aos(x, y); + const auto aos_result = dbscan::dbscan_grid2d_l1_aos(aos_points.data(), aos_points.size(), params); + REQUIRE(aos_result.labels == labels); +} + +TEST_CASE("DBSCANGrid2D_L1 respects min_samples threshold", "[dbscan][grid_l1]") { + const std::vector coords = {0, 2, 4}; + const dbscan::DBSCANGrid2DL1Params params{3, 4}; + const auto result = dbscan::dbscan_grid2d_l1(coords.data(), 1, coords.data(), 1, coords.size(), params); + REQUIRE(result.labels.size() == coords.size()); + for (int32_t label : result.labels) { + REQUIRE(label == -1); + } +} + +TEST_CASE("DBSCANGrid2D_L1 matches fixture truth", "[dbscan][grid_l1]") { + const std::filesystem::path root = fixture_root(); + const auto data_path = root / "dbscan_static_data.bin"; + const auto truth_path = root / "dbscan_static_truth.bin"; + + std::vector x; + std::vector y; + std::vector truth; + load_fixture(data_path, truth_path, x, y, truth); + + const dbscan::DBSCANGrid2DL1Params params{10, 3}; + const auto sequential = dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params); + REQUIRE(sequential.labels == truth); + + const auto frontier = + dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params, dbscan::GridExpansionMode::FrontierParallel); + REQUIRE(frontier.labels == truth); + + const auto union_find = + dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params, dbscan::GridExpansionMode::UnionFind); + REQUIRE(union_find.labels == truth); +} + +TEST_CASE("DBSCANGrid2D_L1 parallel variants align with sequential", "[dbscan][grid_l1][parallel]") { + const std::vector x = {0, 1, 2, 5, 40}; + const std::vector y = {0, 0, 1, 4, 80}; + + dbscan::DBSCANGrid2DL1Params params{6, 3}; + params.num_threads = 4; + + const auto sequential = dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params); + + const auto frontier = + dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params, dbscan::GridExpansionMode::FrontierParallel); + REQUIRE(frontier.labels == sequential.labels); + + const auto union_find = + dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params, dbscan::GridExpansionMode::UnionFind); + REQUIRE(union_find.labels == sequential.labels); +} diff --git a/tests/test_parallelize.cpp b/tests/test_parallelize.cpp new file mode 100644 index 0000000..13c2650 --- /dev/null +++ b/tests/test_parallelize.cpp @@ -0,0 +1,56 @@ +#include "parallel.hpp" + +#include +#include +#include +#include +#include + +TEST_CASE("parallelize processes full range", "[parallel]") { + constexpr std::size_t N = 10'000; + std::vector> visited(N); + for (auto &flag : visited) + flag.store(false, std::memory_order_relaxed); + + utils::parallelize(0, N, 4, 128, [&](std::size_t begin, std::size_t end) { + for (std::size_t i = begin; i < end; ++i) { + bool expected = false; + REQUIRE(visited[i].compare_exchange_strong(expected, true, std::memory_order_relaxed)); + } + }); + + for (const auto &flag : visited) { + REQUIRE(flag.load(std::memory_order_relaxed)); + } +} + +TEST_CASE("parallelize handles uneven chunks", "[parallel]") { + constexpr std::size_t N = 1'023; + std::vector> counts(N); + for (auto &value : counts) + value.store(0, std::memory_order_relaxed); + + utils::parallelize(0, N, 3, 100, [&](std::size_t begin, std::size_t end) { + for (std::size_t i = begin; i < end; ++i) { + counts[i].fetch_add(1, std::memory_order_relaxed); + } + }); + + for (const auto &value : counts) { + REQUIRE(value.load(std::memory_order_relaxed) == 1); + } +} + +TEST_CASE("parallelize default chunk size", "[parallel]") { + constexpr std::size_t N = 5'000; + std::vector out(N, 0); + + utils::parallelize(0, N, 8, 0, [&](std::size_t begin, std::size_t end) { + for (std::size_t i = begin; i < end; ++i) { + out[i] = static_cast(i); + } + }); + + for (std::size_t i = 0; i < N; ++i) + REQUIRE(out[i] == static_cast(i)); +} diff --git a/tools/dbscan_dataset_validator.cpp b/tools/dbscan_dataset_validator.cpp new file mode 100644 index 0000000..c132b4a --- /dev/null +++ b/tools/dbscan_dataset_validator.cpp @@ -0,0 +1,527 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dbscan.h" +#include "dbscan_grid2d_l1.h" +#include "dbscan_optimized.h" + +namespace { + +struct Options { + std::filesystem::path data_path{"data.bin"}; + std::filesystem::path truth_path{"truth.bin"}; + double eps{60.0}; + int32_t min_samples{16}; + + bool run_baseline{true}; + bool run_optimized{true}; + bool run_grid_l1{false}; + std::vector grid_modes; + std::optional mismatch_output_dir; +}; + +void print_usage(const char *program_name) { + std::cout << "Usage: " << program_name + << " [--data ] [--truth ] [--eps ] [--min-samples ]" + << " [--impl baseline|optimized|grid|grid_frontier|grid_union|grid_all|both|all]" + << " [--dump-mismatches ]\n"; +} + +Options parse_arguments(int argc, char **argv) { + Options options; + + for (int i = 1; i < argc; ++i) { + const std::string arg{argv[i]}; + + if (arg == "--help" || arg == "-h") { + print_usage(argv[0]); + std::exit(0); + } else if (arg == "--data") { + if (i + 1 >= argc) + throw std::invalid_argument("--data expects a path argument"); + options.data_path = argv[++i]; + } else if (arg == "--truth") { + if (i + 1 >= argc) + throw std::invalid_argument("--truth expects a path argument"); + options.truth_path = argv[++i]; + } else if (arg == "--eps") { + if (i + 1 >= argc) + throw std::invalid_argument("--eps expects a numeric argument"); + options.eps = std::stod(argv[++i]); + } else if (arg == "--min-samples") { + if (i + 1 >= argc) + throw std::invalid_argument("--min-samples expects an integer argument"); + options.min_samples = static_cast(std::stoi(argv[++i])); + } else if (arg == "--impl") { + if (i + 1 >= argc) + throw std::invalid_argument( + "--impl expects one of: baseline, optimized, grid, grid_frontier, grid_union, grid_all, both, all"); + const std::string value{argv[++i]}; + if (value == "baseline") { + options.run_baseline = true; + options.run_optimized = false; + options.run_grid_l1 = false; + options.grid_modes.clear(); + } else if (value == "optimized") { + options.run_baseline = false; + options.run_optimized = true; + options.run_grid_l1 = false; + options.grid_modes.clear(); + } else if (value == "grid" || value == "grid_l1") { + options.run_baseline = false; + options.run_optimized = false; + options.run_grid_l1 = true; + options.grid_modes = {dbscan::GridExpansionMode::Sequential}; + } else if (value == "grid_frontier") { + options.run_baseline = false; + options.run_optimized = false; + options.run_grid_l1 = true; + options.grid_modes = {dbscan::GridExpansionMode::FrontierParallel}; + } else if (value == "grid_union" || value == "grid_union_find") { + options.run_baseline = false; + options.run_optimized = false; + options.run_grid_l1 = true; + options.grid_modes = {dbscan::GridExpansionMode::UnionFind}; + } else if (value == "grid_all") { + options.run_baseline = false; + options.run_optimized = false; + options.run_grid_l1 = true; + options.grid_modes = {dbscan::GridExpansionMode::Sequential, dbscan::GridExpansionMode::FrontierParallel, + dbscan::GridExpansionMode::UnionFind}; + } else if (value == "both") { + options.run_baseline = true; + options.run_optimized = true; + options.run_grid_l1 = false; + options.grid_modes.clear(); + } else if (value == "all") { + options.run_baseline = true; + options.run_optimized = true; + options.run_grid_l1 = true; + if (options.grid_modes.empty()) + options.grid_modes = {dbscan::GridExpansionMode::Sequential}; + } else { + throw std::invalid_argument( + "--impl expects one of: baseline, optimized, grid, grid_frontier, grid_union, grid_all, both, all"); + } + } else if (arg == "--dump-mismatches") { + if (i + 1 >= argc) + throw std::invalid_argument("--dump-mismatches expects a directory path"); + options.mismatch_output_dir = std::filesystem::path{argv[++i]}; + } else { + throw std::invalid_argument("Unknown argument: " + arg); + } + } + + if (options.eps <= 0.0) + throw std::invalid_argument("--eps must be positive"); + if (options.min_samples <= 0) + throw std::invalid_argument("--min-samples must be positive"); + if (options.run_grid_l1 && options.grid_modes.empty()) + options.grid_modes = {dbscan::GridExpansionMode::Sequential}; + + return options; +} + +std::vector> load_points(const std::filesystem::path &path, std::vector *x_out, + std::vector *y_out) { + std::ifstream stream(path, std::ios::binary); + if (!stream) + throw std::runtime_error("Failed to open data file: " + path.string()); + + stream.seekg(0, std::ios::end); + const auto file_size = stream.tellg(); + stream.seekg(0, std::ios::beg); + + if (file_size % (sizeof(uint32_t) * 2) != 0) + throw std::runtime_error("Data file does not contain a whole number of (y, x) uint32 pairs: " + path.string()); + + const std::size_t num_values = static_cast(file_size) / sizeof(uint32_t); + const std::size_t num_points = num_values / 2; + + std::vector raw(num_values); + stream.read(reinterpret_cast(raw.data()), static_cast(raw.size() * sizeof(uint32_t))); + if (!stream) + throw std::runtime_error("Failed to read data file: " + path.string()); + + std::vector> points(num_points); + if (x_out) + x_out->assign(num_points, 0U); + if (y_out) + y_out->assign(num_points, 0U); + + for (std::size_t i = 0; i < num_points; ++i) { + const uint32_t y = raw[2 * i]; + const uint32_t x = raw[2 * i + 1]; + points[i] = dbscan::Point{static_cast(x), static_cast(y)}; + if (x_out) + (*x_out)[i] = x; + if (y_out) + (*y_out)[i] = y; + } + + return points; +} + +std::vector load_labels(const std::filesystem::path &path) { + std::ifstream stream(path, std::ios::binary); + if (!stream) + throw std::runtime_error("Failed to open truth file: " + path.string()); + + stream.seekg(0, std::ios::end); + const auto file_size = stream.tellg(); + stream.seekg(0, std::ios::beg); + + if (file_size % sizeof(int32_t) != 0) + throw std::runtime_error("Truth file does not contain a whole number of int32 labels: " + path.string()); + + const std::size_t num_labels = static_cast(file_size) / sizeof(int32_t); + std::vector labels(num_labels); + stream.read(reinterpret_cast(labels.data()), static_cast(labels.size() * sizeof(int32_t))); + if (!stream) + throw std::runtime_error("Failed to read truth file: " + path.string()); + + return labels; +} + +std::size_t count_clusters(const std::vector &labels) { + std::unordered_set clusters; + clusters.reserve(labels.size()); + for (int32_t label : labels) { + if (label != -1) + clusters.insert(label); + } + return clusters.size(); +} + +std::size_t count_noise(const std::vector &labels) { + return static_cast(std::count(labels.begin(), labels.end(), -1)); +} + +struct LabelIndex { + std::unordered_map to_index; + std::vector values; +}; + +LabelIndex make_index(const std::vector &labels) { + LabelIndex index; + index.values.reserve(labels.size()); + index.to_index.reserve(labels.size()); + + for (int32_t label : labels) { + if (!index.to_index.contains(label)) { + const std::size_t idx = index.values.size(); + index.values.push_back(label); + index.to_index.emplace(label, idx); + } + } + + return index; +} + +double combination2(std::int64_t n) { + if (n <= 1) + return 0.0; + return static_cast(n) * static_cast(n - 1) / 2.0; +} + +struct EvaluationMetrics { + double adjusted_rand{0.0}; + double remapped_accuracy{0.0}; + std::size_t mismatched_points{0}; + std::size_t predicted_clusters{0}; + std::size_t truth_clusters{0}; + std::size_t predicted_noise{0}; + std::size_t truth_noise{0}; + bool passed{false}; +}; + +EvaluationMetrics evaluate(const std::vector &predicted, const std::vector &truth, + std::vector *mismatch_indices = nullptr) { + if (predicted.size() != truth.size()) + throw std::runtime_error("Predicted labels and truth labels must have the same length"); + + const std::size_t total_points = truth.size(); + const auto predicted_index = make_index(predicted); + const auto truth_index = make_index(truth); + + const std::size_t predicted_size = predicted_index.values.size(); + const std::size_t truth_size = truth_index.values.size(); + + std::vector contingency(predicted_size * truth_size, 0); + std::vector predicted_counts(predicted_size, 0); + std::vector truth_counts(truth_size, 0); + + for (std::size_t i = 0; i < total_points; ++i) { + const auto predicted_it = predicted_index.to_index.find(predicted[i]); + const auto truth_it = truth_index.to_index.find(truth[i]); + const std::size_t predicted_row = predicted_it->second; + const std::size_t truth_col = truth_it->second; + + const std::size_t cell_index = predicted_row * truth_size + truth_col; + ++contingency[cell_index]; + ++predicted_counts[predicted_row]; + ++truth_counts[truth_col]; + } + + double sum_combination = 0.0; + for (std::int64_t count : contingency) + sum_combination += combination2(count); + + double predicted_combination = 0.0; + for (std::int64_t count : predicted_counts) + predicted_combination += combination2(count); + + double truth_combination = 0.0; + for (std::int64_t count : truth_counts) + truth_combination += combination2(count); + + const double total_pairs = combination2(static_cast(total_points)); + double expected_index = 0.0; + if (total_pairs > 0.0) + expected_index = (predicted_combination * truth_combination) / total_pairs; + + const double max_index = 0.5 * (predicted_combination + truth_combination); + const double denominator = max_index - expected_index; + + EvaluationMetrics metrics; + if (denominator == 0.0) { + metrics.adjusted_rand = 1.0; + } else { + metrics.adjusted_rand = (sum_combination - expected_index) / denominator; + } + + std::unordered_map remap; + remap.reserve(predicted_size); + for (std::size_t row = 0; row < predicted_size; ++row) { + const int32_t predicted_label = predicted_index.values[row]; + if (predicted_label == -1) { + remap.emplace(predicted_label, -1); + continue; + } + + const std::int64_t *row_ptr = contingency.data() + row * truth_size; + std::size_t best_col = 0; + std::int64_t best_count = -1; + for (std::size_t col = 0; col < truth_size; ++col) { + if (row_ptr[col] > best_count) { + best_count = row_ptr[col]; + best_col = col; + } + } + remap.emplace(predicted_label, truth_index.values[best_col]); + } + + if (mismatch_indices) + mismatch_indices->clear(); + + std::size_t matches = 0; + for (std::size_t i = 0; i < total_points; ++i) { + const int32_t predicted_label = predicted[i]; + const auto mapping_it = remap.find(predicted_label); + const int32_t mapped_label = mapping_it != remap.end() ? mapping_it->second : predicted_label; + if (mapped_label == truth[i]) + ++matches; + else if (mismatch_indices) + mismatch_indices->push_back(i); + } + + metrics.remapped_accuracy = + total_points == 0 ? 1.0 : static_cast(matches) / static_cast(total_points); + metrics.mismatched_points = mismatch_indices ? mismatch_indices->size() : (total_points - matches); + + metrics.predicted_clusters = count_clusters(predicted); + metrics.truth_clusters = count_clusters(truth); + metrics.predicted_noise = count_noise(predicted); + metrics.truth_noise = count_noise(truth); + metrics.passed = metrics.mismatched_points == 0 && metrics.predicted_clusters == metrics.truth_clusters; + + return metrics; +} + +struct RunResult { + std::string name; + EvaluationMetrics metrics; +}; + +} // namespace + +int main(int argc, char **argv) { + try { + const Options options = parse_arguments(argc, argv); + + std::vector x_coords; + std::vector y_coords; + const auto points = load_points(options.data_path, &x_coords, &y_coords); + const auto truth_labels = load_labels(options.truth_path); + + if (points.size() != truth_labels.size()) + throw std::runtime_error("Point count and truth label count differ"); + + std::cout << "Loaded " << points.size() << " points from " << options.data_path << "\n"; + std::cout << "Using eps=" << options.eps << ", min_samples=" << options.min_samples << "\n"; + + const auto truth_cluster_count = count_clusters(truth_labels); + const auto truth_noise_count = count_noise(truth_labels); + std::cout << "Ground truth clusters: " << truth_cluster_count << "; noise points: " << truth_noise_count << "\n"; + + std::vector results; + results.reserve(3); + + if (options.run_baseline) { + std::cout << "\n[baseline] Running clustering..." << std::flush; + const auto start = std::chrono::steady_clock::now(); + dbscan::DBSCAN baseline(options.eps, options.min_samples); + const auto clustering = baseline.cluster(points); + std::vector mismatches; + const auto metrics = + evaluate(clustering.labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr); + const auto end = std::chrono::steady_clock::now(); + const auto elapsed_ms = std::chrono::duration_cast(end - start).count(); + std::cout << " done in " << elapsed_ms << " ms" << std::endl; + results.push_back({"baseline", metrics}); + + if (options.mismatch_output_dir && !mismatches.empty()) { + std::filesystem::create_directories(*options.mismatch_output_dir); + auto file_path = *options.mismatch_output_dir / "baseline_mismatches.txt"; + std::ofstream out(file_path); + if (!out) + throw std::runtime_error("Failed to open mismatch output file: " + file_path.string()); + for (std::size_t index : mismatches) + out << index << '\n'; + std::cout << "[baseline] Wrote " << mismatches.size() << " mismatches to " << file_path << "\n"; + } + } + + if (options.run_optimized) { + std::cout << "\n[optimized] Running clustering..." << std::flush; + const auto start = std::chrono::steady_clock::now(); + dbscan::DBSCANOptimized optimized(options.eps, options.min_samples); + const auto clustering = optimized.cluster(points); + std::vector mismatches; + const auto metrics = + evaluate(clustering.labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr); + const auto end = std::chrono::steady_clock::now(); + const auto elapsed_ms = std::chrono::duration_cast(end - start).count(); + std::cout << " done in " << elapsed_ms << " ms" << std::endl; + results.push_back({"optimized", metrics}); + + if (options.mismatch_output_dir && !mismatches.empty()) { + std::filesystem::create_directories(*options.mismatch_output_dir); + auto file_path = *options.mismatch_output_dir / "optimized_mismatches.txt"; + std::ofstream out(file_path); + if (!out) + throw std::runtime_error("Failed to open mismatch output file: " + file_path.string()); + for (std::size_t index : mismatches) + out << index << '\n'; + std::cout << "[optimized] Wrote " << mismatches.size() << " mismatches to " << file_path << "\n"; + } + } + + auto grid_variant_info = [](dbscan::GridExpansionMode mode) { + struct VariantInfo { + std::string label; + std::string slug; + }; + switch (mode) { + case dbscan::GridExpansionMode::Sequential: + return VariantInfo{"grid_l1", "grid_l1"}; + case dbscan::GridExpansionMode::FrontierParallel: + return VariantInfo{"grid_l1_frontier", "grid_l1_frontier"}; + case dbscan::GridExpansionMode::UnionFind: + return VariantInfo{"grid_l1_union", "grid_l1_union"}; + } + return VariantInfo{"grid_l1", "grid_l1"}; + }; + + if (options.run_grid_l1) { + const auto eps_int = static_cast(std::llround(options.eps)); + if (std::fabs(options.eps - static_cast(eps_int)) > 1e-6) { + throw std::invalid_argument("grid_l1 implementation requires integer eps value"); + } + if (x_coords.size() != y_coords.size()) + throw std::runtime_error("Mismatch between x and y coordinate counts"); + dbscan::DBSCANGrid2DL1Params params{eps_int, static_cast(options.min_samples)}; + std::vector aos_points; + aos_points.reserve(x_coords.size()); + for (std::size_t i = 0; i < x_coords.size(); ++i) + aos_points.push_back(dbscan::Grid2DPoint{x_coords[i], y_coords[i]}); + for (auto mode : options.grid_modes) { + const auto info = grid_variant_info(mode); + const auto run_layout = [&](const std::string &layout, auto &&invoke) { + const std::string display_label = info.label + "/" + layout; + std::cout << "\n[" << display_label << "] Running clustering..." << std::flush; + const auto start = std::chrono::steady_clock::now(); + const auto result = invoke(); + const auto end = std::chrono::steady_clock::now(); + const auto elapsed_ms = std::chrono::duration_cast(end - start).count(); + std::cout << " done in " << elapsed_ms << " ms" << std::endl; + + std::vector mismatches; + const auto metrics = + evaluate(result.labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr); + + if (!result.perf_timing.entries().empty()) { + std::cout << "[" << display_label << "] component timings" << std::endl; + for (const auto &entry : result.perf_timing.entries()) + std::cout << " " << entry << std::endl; + } + results.push_back({display_label, metrics}); + + if (options.mismatch_output_dir && !mismatches.empty()) { + std::filesystem::create_directories(*options.mismatch_output_dir); + auto file_path = *options.mismatch_output_dir / (info.slug + "_" + layout + "_mismatches.txt"); + std::ofstream out(file_path); + if (!out) + throw std::runtime_error("Failed to open mismatch output file: " + file_path.string()); + for (std::size_t index : mismatches) + out << index << '\n'; + std::cout << "[" << display_label << "] Wrote " << mismatches.size() << " mismatches to " << file_path + << "\n"; + } + }; + + run_layout("soa", [&]() { + return dbscan::dbscan_grid2d_l1(x_coords.data(), 1, y_coords.data(), 1, x_coords.size(), params, mode); + }); + + run_layout("aos", [&]() { + return dbscan::dbscan_grid2d_l1_aos(aos_points.data(), aos_points.size(), params, mode); + }); + } + } + + std::cout << std::fixed << std::setprecision(6); + + bool all_passed = true; + for (const auto &result : results) { + const auto &m = result.metrics; + std::cout << "\nImplementation: " << result.name << "\n"; + std::cout << " clusters: " << m.predicted_clusters << " (truth " << m.truth_clusters << ")\n"; + std::cout << " noise points: " << m.predicted_noise << " (truth " << m.truth_noise << ")\n"; + std::cout << " adjusted rand index: " << m.adjusted_rand << "\n"; + std::cout << " remapped accuracy: " << m.remapped_accuracy * 100.0 << "%\n"; + std::cout << " mismatched points: " << m.mismatched_points << "\n"; + std::cout << " status: " << (m.passed ? "PASS" : "FAIL") << "\n"; + all_passed = all_passed && m.passed; + } + + return all_passed ? 0 : 1; + } catch (const std::exception &ex) { + std::cerr << "Error: " << ex.what() << "\n"; + print_usage(argv[0]); + return 1; + } +}