diff --git a/.gitignore b/.gitignore
index 42167f6..5adba1c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,6 +33,8 @@ Thumbs.db
 # Test data files
 test_data*.bin
 *.bin
+!tests/data/dbscan_static_data.bin
+!tests/data/dbscan_static_truth.bin
 
 # Executables
 dbscan_tests
@@ -43,4 +45,4 @@ dbscan_tests
 *.tmp
 *.temp
 .cache/
-AGENTS.md
\ No newline at end of file
+AGENTS.md
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c88060e..0ed6314 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,6 +19,8 @@ endif()
 add_library(dbscan STATIC
     src/dbscan.cpp
     src/dbscan_optimized.cpp
+    src/dbscan_grid2d_l1.cpp
+    src/dbscan_grid2d_l1_aos.cpp
 )
 
 target_include_directories(dbscan
@@ -64,7 +66,9 @@ FetchContent_MakeAvailable(nanobench)
 add_executable(dbscan_tests
     tests/test_dbscan.cpp
     tests/test_dbscan_optimized.cpp
+    tests/test_dbscan_grid2d_l1.cpp
     tests/test_parallel_for.cpp
+    tests/test_parallelize.cpp
     tests/test_union_find.cpp
 )
 
@@ -79,6 +83,20 @@ target_include_directories(dbscan_tests
         include
 )
 
+add_executable(dbscan_dataset_validator
+    tools/dbscan_dataset_validator.cpp
+)
+
+target_link_libraries(dbscan_dataset_validator
+    PRIVATE
+        dbscan
+)
+
+target_include_directories(dbscan_dataset_validator
+    PRIVATE
+        include
+)
+
 # Benchmark executable
 add_executable(dbscan_benchmark
     benchmark/benchmark_dbscan.cpp
@@ -101,4 +119,12 @@ target_include_directories(dbscan_benchmark
 
 # Enable testing
 enable_testing()
-add_test(NAME dbscan_tests COMMAND dbscan_tests)
\ No newline at end of file
+add_test(NAME dbscan_tests COMMAND dbscan_tests)
+
+add_test(
+    NAME dbscan_dataset_validation
+    COMMAND ${CMAKE_COMMAND}
+        -DPROJECT_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}
+        -DPROJECT_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}
+        -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/run_dataset_validator.cmake
+)
diff --git a/benchmark/benchmark_dbscan.cpp b/benchmark/benchmark_dbscan.cpp
index 7fb065c..55b0584 100644
--- a/benchmark/benchmark_dbscan.cpp
+++ b/benchmark/benchmark_dbscan.cpp
@@ -1,145 +1,119 @@
-#include "dbscan.h"
-#include "dbscan_optimized.h"
-#include <chrono>
-#include <cstdlib>
-#include <ctime>
+#include "dbscan_grid2d_l1.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
 #include <iostream>
-#include <memory>
+#include <limits>
 #include <nanobench.h>
+#include <random>
 #include <string>
 #include <vector>
 
-// Generate clustered 2D data for benchmarking
-std::vector<dbscan::Point<double>> generate_benchmark_data(size_t n_points, int n_clusters = 8) {
-  std::vector<dbscan::Point<double>> points;
-  points.reserve(n_points);
-
-  // Create clusters
-  for (int c = 0; c < n_clusters; ++c) {
-    double center_x = c * 5.0;
-    double center_y = c * 5.0;
-    size_t points_per_cluster = n_points / n_clusters;
-
-    for (size_t i = 0; i < points_per_cluster; ++i) {
-      double x = center_x + (static_cast<double>(rand()) / RAND_MAX - 0.5) * 2.0;
-      double y = center_y + (static_cast<double>(rand()) / RAND_MAX - 0.5) * 2.0;
-      points.push_back({x, y});
-    }
-  }
-
-  // Add some noise points
-  size_t noise_points = n_points / 10;
-  for (size_t i = 0; i < noise_points; ++i) {
-    double x = 50.0 + (static_cast<double>(rand()) / RAND_MAX - 0.5) * 20.0;
-    double y = 50.0 + (static_cast<double>(rand()) / RAND_MAX - 0.5) * 20.0;
-    points.push_back({x, y});
-  }
-
-  return points;
-}
-
-int main() {
-  // Seed random number generator
-  srand(static_cast<unsigned int>(time(nullptr)));
-
-  ankerl::nanobench::Bench bench;
+namespace {
 
-  // Benchmark different data sizes
-  std::vector<size_t> data_sizes = {1000, 10000, 50000, 100000};
+struct Uint32Dataset {
+  std::vector<uint32_t> x;
+  std::vector<uint32_t> y;
+};
 
-  for (size_t n_points : data_sizes) {
-    std::cout << "\n=== Benchmarking with " << n_points << " points ===" << std::endl;
+Uint32Dataset generate_uint32_dataset(std::size_t cluster_count, std::size_t points_per_cluster,
+                                      std::size_t noise_points, uint32_t area_width, uint32_t cluster_sigma,
+                                      std::mt19937 &rng) {
+  std::uniform_real_distribution<double> uniform_dist(0.0, static_cast<double>(area_width));
+  std::normal_distribution<double> normal_dist(0.0, static_cast<double>(cluster_sigma));
 
-    // Generate test data
-    auto points = generate_benchmark_data(n_points);
+  Uint32Dataset dataset;
+  dataset.x.reserve(cluster_count * points_per_cluster + noise_points);
+  dataset.y.reserve(cluster_count * points_per_cluster + noise_points);
 
-    // Benchmark original DBSCAN
-    bench.title("Original DBSCAN").run("Original DBSCAN " + std::to_string(n_points) + " points", [&]() {
-      dbscan::DBSCAN<double> dbscan(0.8, 5);
-      auto result = dbscan.cluster(points);
-      ankerl::nanobench::doNotOptimizeAway(result);
-    });
+  for (std::size_t c = 0; c < cluster_count; ++c) {
+    const double center_x = uniform_dist(rng);
+    const double center_y = uniform_dist(rng);
 
-    // Benchmark optimized DBSCAN
-    bench.title("Optimized DBSCAN").run("Optimized DBSCAN " + std::to_string(n_points) + " points", [&]() {
-      dbscan::DBSCANOptimized<double> dbscan(0.8, 5);
-      auto result = dbscan.cluster(points);
-      ankerl::nanobench::doNotOptimizeAway(result);
-    });
+    for (std::size_t i = 0; i < points_per_cluster; ++i) {
+      const double sample_x = center_x + normal_dist(rng);
+      const double sample_y = center_y + normal_dist(rng);
 
-    // Memory usage comparison
-    {
-      dbscan::DBSCAN<double> original_dbscan(0.8, 5);
-      auto original_result = original_dbscan.cluster(points);
+      const uint32_t clamped_x =
+          static_cast<uint32_t>(std::min(static_cast<double>(area_width - 1), std::max(0.0, std::round(sample_x))));
+      const uint32_t clamped_y =
+          static_cast<uint32_t>(std::min(static_cast<double>(area_width - 1), std::max(0.0, std::round(sample_y))));
 
-      dbscan::DBSCANOptimized<double> optimized_dbscan(0.8, 5);
-      auto optimized_result = optimized_dbscan.cluster(points);
-
-      std::cout << "Original DBSCAN found " << original_result.num_clusters << " clusters" << std::endl;
-      std::cout << "Optimized DBSCAN found " << optimized_result.num_clusters << " clusters" << std::endl;
+      dataset.x.push_back(clamped_x);
+      dataset.y.push_back(clamped_y);
     }
   }
 
-  // Performance comparison with different parameters
-  std::cout << "\n=== Parameter Sensitivity Benchmark ===" << std::endl;
-
-  auto test_points = generate_benchmark_data(10000);
-
-  // Different eps values
-  std::vector<double> eps_values = {0.3, 0.5, 0.8, 1.2};
-
-  for (double eps : eps_values) {
-    bench.title("EPS Parameter").run("Optimized DBSCAN eps=" + std::to_string(eps), [&]() {
-      dbscan::DBSCANOptimized<double> dbscan(eps, 5);
-      auto result = dbscan.cluster(test_points);
-      ankerl::nanobench::doNotOptimizeAway(result);
-    });
+  std::uniform_int_distribution<uint32_t> uniform_int(0, area_width - 1);
+  for (std::size_t i = 0; i < noise_points; ++i) {
+    dataset.x.push_back(uniform_int(rng));
+    dataset.y.push_back(uniform_int(rng));
   }
 
-  // Different min_pts values
-  std::vector<int> min_pts_values = {3, 5, 10, 15};
+  return dataset;
+}
+
+} // namespace
 
-  for (int min_pts : min_pts_values) {
-    bench.title("MinPts Parameter").run("Optimized DBSCAN min_pts=" + std::to_string(min_pts), [&]() {
-      dbscan::DBSCANOptimized<double> dbscan(0.8, min_pts);
-      auto result = dbscan.cluster(test_points);
-      ankerl::nanobench::doNotOptimizeAway(result);
-    });
-  }
+int main() {
+  constexpr uint32_t area_width = 1'000'000;
+  constexpr uint32_t cluster_sigma = 50; // Approximately 3 sigma ~ 150 px footprint
+  constexpr uint32_t eps = 60;
+  constexpr uint32_t min_samples = 16;
 
-  // Detailed performance analysis
-  std::cout << "\n=== Detailed Performance Analysis ===" << std::endl;
-
-  auto large_dataset = generate_benchmark_data(50000);
-
-  // Time both implementations on larger dataset
-  {
-    std::cout << "Running performance comparison on 50k points..." << std::endl;
-
-    // Original DBSCAN timing
-    auto start_time = std::chrono::high_resolution_clock::now();
-    dbscan::DBSCAN<double> original_dbscan(0.8, 5);
-    auto original_result = original_dbscan.cluster(large_dataset);
-    auto end_time = std::chrono::high_resolution_clock::now();
-    auto original_duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
-
-    // Optimized DBSCAN timing
-    start_time = std::chrono::high_resolution_clock::now();
-    dbscan::DBSCANOptimized<double> optimized_dbscan(0.8, 5);
-    auto optimized_result = optimized_dbscan.cluster(large_dataset);
-    end_time = std::chrono::high_resolution_clock::now();
-    auto optimized_duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
-
-    std::cout << "Original DBSCAN: " << original_duration.count() << "ms, " << original_result.num_clusters
-              << " clusters" << std::endl;
-    std::cout << "Optimized DBSCAN: " << optimized_duration.count() << "ms, " << optimized_result.num_clusters
-              << " clusters" << std::endl;
-
-    if (original_duration.count() > 0) {
-      double speedup = static_cast<double>(original_duration.count()) / optimized_duration.count();
-      std::cout << "Speedup: " << speedup << "x" << std::endl;
+  std::mt19937 rng(1337u);
+  ankerl::nanobench::Bench bench;
+  bench.title("DBSCANGrid2D_L1");
+  bench.relative(true);
+  bench.warmup(2);
+  bench.minEpochIterations(10);
+  bench.unit("pt");
+
+  struct Scenario {
+    std::size_t clusters;
+    std::size_t points_per_cluster;
+  };
+
+  const std::vector<Scenario> scenarios = {
+      {64, 256},  // ~16K cluster points + 32K noise => ~48K total
+      {128, 256}, // ~32K cluster points + 64K noise => ~96K total
+      {256, 256}, // ~65K cluster points + 131K noise => ~196K total
+      {512, 256}, // ~131K cluster points + 262K noise => ~393K total
+      {640, 256}, // ~163K cluster points + 327K noise => ~490K total
+  };
+
+  std::cout << "Benchmarking DBSCANGrid2D_L1 with Manhattan distance" << std::endl;
+  std::cout << "eps=" << eps << ", min_samples=" << min_samples << std::endl;
+  std::cout << "Thread sweep: 0 (auto), 1, 2, 4, 8" << std::endl;
+
+  for (const auto &scenario : scenarios) {
+    const std::size_t cluster_points = scenario.clusters * scenario.points_per_cluster;
+    const std::size_t noise_points = cluster_points * 2; // 2x noise compared to clustered points
+
+    auto dataset = generate_uint32_dataset(scenario.clusters, scenario.points_per_cluster, noise_points, area_width,
+                                           cluster_sigma, rng);
+
+    const std::size_t total_points = dataset.x.size();
+    std::cout << "\nScenario: " << scenario.clusters << " clusters, " << scenario.points_per_cluster
+              << " points/cluster, total points=" << total_points << std::endl;
+
+    bench.batch(static_cast<double>(total_points));
+    bench.context("points", std::to_string(total_points));
+
+    const std::vector<std::size_t> thread_counts = {0, 1, 2, 4, 8};
+    for (std::size_t thread_count : thread_counts) {
+      const std::string label = "grid-l1 " + std::to_string(total_points) + " pts threads=" +
+                                (thread_count == 0 ? std::string("auto") : std::to_string(thread_count));
+      bench.run(label, [&]() {
+        dbscan::DBSCANGrid2DL1Params params{eps, min_samples};
+        params.num_threads = thread_count;
+        auto result =
+            dbscan::dbscan_grid2d_l1(dataset.x.data(), 1, dataset.y.data(), 1, total_points, params);
+        ankerl::nanobench::doNotOptimizeAway(result.labels);
+      });
     }
   }
 
   return 0;
-}
\ No newline at end of file
+}
diff --git a/cmake/run_dataset_validator.cmake b/cmake/run_dataset_validator.cmake
new file mode 100644
index 0000000..2790383
--- /dev/null
+++ b/cmake/run_dataset_validator.cmake
@@ -0,0 +1,31 @@
+set(DATA_FILE "${PROJECT_SOURCE_DIR}/tests/data/dbscan_static_data.bin")
+set(TRUTH_FILE "${PROJECT_SOURCE_DIR}/tests/data/dbscan_static_truth.bin")
+set(VALIDATOR "${PROJECT_BINARY_DIR}/dbscan_dataset_validator")
+
+if(NOT EXISTS "${DATA_FILE}")
+  message(FATAL_ERROR "Static dataset not found: ${DATA_FILE}")
+endif()
+
+if(NOT EXISTS "${TRUTH_FILE}")
+  message(FATAL_ERROR "Truth labels file not found: ${TRUTH_FILE}")
+endif()
+
+if(NOT EXISTS "${VALIDATOR}")
+  message(FATAL_ERROR "dbscan_dataset_validator executable not found: ${VALIDATOR}")
+endif()
+
+execute_process(
+  COMMAND "${VALIDATOR}"
+    "--data" "${DATA_FILE}"
+    "--truth" "${TRUTH_FILE}"
+    "--eps" "10"
+    "--min-samples" "3"
+    "--impl" "both"
+  WORKING_DIRECTORY "${PROJECT_BINARY_DIR}"
+  RESULT_VARIABLE VALIDATOR_RESULT
+  COMMAND_ECHO STDOUT
+)
+
+if(NOT VALIDATOR_RESULT EQUAL 0)
+  message(FATAL_ERROR "dbscan_dataset_validator reported a mismatch (exit code ${VALIDATOR_RESULT})")
+endif()
diff --git a/generate_test_data.py b/generate_test_data.py
deleted file mode 100644
index a299dfa..0000000
--- a/generate_test_data.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/env python3
-
-import numpy as np
-import struct
-import sys
-
-def generate_test_data(n_samples=300, n_centers=4, output_file='test_data.bin'):
-    """
-    Generate test data with known clusters and save to binary file.
-    Creates simple clusters manually without sklearn.
-    """
-    return generate_test_data_with_params(n_samples, n_centers, 0.8, 5, output_file)
-
-def load_test_data(input_file='test_data.bin'):
-    """
-    Load test data from binary file.
-    Returns: points (N, 2), labels
-    """
-    with open(input_file, 'rb') as f:
-        # Read number of points
-        n_points = struct.unpack('I', f.read(4))[0]
-
-        points = []
-        labels = []
-
-        # Read points
-        for _ in range(n_points):
-            x, y = struct.unpack('dd', f.read(16))
-            points.append([x, y])
-
-        # Read labels
-        for _ in range(n_points):
-            label = struct.unpack('i', f.read(4))[0]
-            labels.append(label)
-
-    return np.array(points), np.array(labels)
-
-def generate_multiple_test_cases():
-    """Generate test cases with different data sizes and parameters"""
-    test_cases = [
-        # (n_samples, n_centers, eps, min_samples, output_file)
-        (500, 3, 0.8, 5, 'test_data_500.bin'),
-        (10000, 5, 0.8, 5, 'test_data_10k.bin'),
-        (100000, 8, 0.8, 5, 'test_data_100k.bin'),  # Using 100k instead of 1M for practicality
-        # Different eps values
-        (1000, 4, 0.5, 5, 'test_data_eps_0_5.bin'),
-        (1000, 4, 1.2, 5, 'test_data_eps_1_2.bin'),
-        # Different min_samples values
-        (1000, 4, 0.8, 3, 'test_data_minpts_3.bin'),
-        (1000, 4, 0.8, 10, 'test_data_minpts_10.bin'),
-    ]
-
-    for n_samples, n_centers, eps, min_samples, output_file in test_cases:
-        print(f"Generating {n_samples} points with {n_centers} centers (eps={eps}, min_samples={min_samples})...")
-        generate_test_data_with_params(n_samples, n_centers, eps, min_samples, output_file)
-        print(f"Saved to {output_file}\n")
-
-def generate_test_data_with_params(n_samples=300, n_centers=4, eps=0.8, min_samples=5, output_file='test_data.bin'):
-    """
-    Generate test data with specific DBSCAN parameters
-    """
-    np.random.seed(42)
-
-    # Generate clustered data manually
-    X = []
-    true_labels = []
-
-    for center_idx in range(n_centers):
-        # Create cluster centers
-        center_x = np.random.uniform(-5, 5)
-        center_y = np.random.uniform(-5, 5)
-
-        # Generate points around each center
-        cluster_size = n_samples // n_centers
-        for _ in range(cluster_size):
-            x = center_x + np.random.normal(0, 0.6)
-            y = center_y + np.random.normal(0, 0.6)
-            X.append([x, y])
-            true_labels.append(center_idx)
-
-    # Add some noise points
-    noise_points = n_samples // 10
-    for _ in range(noise_points):
-        x = np.random.uniform(-10, 10)
-        y = np.random.uniform(-10, 10)
-        X.append([x, y])
-        true_labels.append(-1)  # Noise
-
-    X = np.array(X)
-    true_labels = np.array(true_labels)
-
-    # Simple DBSCAN-like clustering for ground truth
-    labels = np.full(len(X), -1)
-    cluster_id = 0
-
-    for i in range(len(X)):
-        if labels[i] != -1:
-            continue
-
-        # Find neighbors within eps
-        neighbors = []
-        for j in range(len(X)):
-            if i == j:
-                continue
-            dist = np.sqrt((X[i][0] - X[j][0])**2 + (X[i][1] - X[j][1])**2)
-            if dist <= eps:
-                neighbors.append(j)
-
-        if len(neighbors) >= min_samples:  # min_samples
-            labels[i] = cluster_id
-            # Expand cluster (simplified)
-            for neighbor in neighbors:
-                if labels[neighbor] == -1:
-                    labels[neighbor] = cluster_id
-            cluster_id += 1
-        else:
-            labels[i] = -1
-
-    # Save data to binary file
-    with open(output_file, 'wb') as f:
-        # Write number of points
-        f.write(struct.pack('I', len(X)))
-
-        # Write points as doubles (x, y)
-        for point in X:
-            f.write(struct.pack('dd', point[0], point[1]))
-
-        # Write computed labels for validation
-        for label in labels:
-            f.write(struct.pack('i', label))
-
-    print(f"Generated {len(X)} points with {n_centers} clusters")
-    print(f"Simple clustering found {len(set(labels)) - (1 if -1 in labels else 0)} clusters")
-
-    return X, labels
-
-if __name__ == '__main__':
-    if len(sys.argv) > 1 and sys.argv[1] == '--multiple':
-        generate_multiple_test_cases()
-    elif len(sys.argv) > 1:
-        output_file = sys.argv[1]
-        generate_test_data(output_file=output_file)
-    else:
-        generate_test_data()
\ No newline at end of file
diff --git a/include/dbscan_grid2d_l1.h b/include/dbscan_grid2d_l1.h
new file mode 100644
index 0000000..c3e91c1
--- /dev/null
+++ b/include/dbscan_grid2d_l1.h
@@ -0,0 +1,43 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "perf_timer.h"
+
+namespace dbscan {
+
+enum class GridExpansionMode {
+  Sequential,
+  FrontierParallel,
+  UnionFind
+};
+
+struct DBSCANGrid2DL1Params {
+  uint32_t eps;
+  uint32_t min_samples;
+  std::size_t num_threads = 0;
+  std::size_t chunk_size = 0;
+};
+
+struct Grid2DPoint {
+  uint32_t x;
+  uint32_t y;
+};
+
+struct DBSCANGrid2DL1Result {
+  std::vector<int32_t> labels;
+  PerfTiming perf_timing;
+};
+
+[[nodiscard]] DBSCANGrid2DL1Result dbscan_grid2d_l1(const uint32_t *x, std::size_t x_stride, const uint32_t *y,
+                                                    std::size_t y_stride, std::size_t count,
+                                                    const DBSCANGrid2DL1Params &params,
+                                                    GridExpansionMode mode = GridExpansionMode::Sequential);
+
+[[nodiscard]] DBSCANGrid2DL1Result dbscan_grid2d_l1_aos(const Grid2DPoint *points, std::size_t count,
+                                                        const DBSCANGrid2DL1Params &params,
+                                                        GridExpansionMode mode = GridExpansionMode::Sequential);
+
+} // namespace dbscan
diff --git a/include/parallel.hpp b/include/parallel.hpp
index dc07709..5487241 100644
--- a/include/parallel.hpp
+++ b/include/parallel.hpp
@@ -33,4 +33,38 @@ inline void parallel_for(size_t begin, size_t end, size_t n_threads, const std::
     th.join();
 }
 
-} // namespace utils
\ No newline at end of file
+inline void parallelize(size_t begin, size_t end, size_t num_threads, size_t chunk_size,
+                        std::function<void(size_t, size_t)> &&chunk_processor) {
+  if (begin >= end)
+    return;
+
+  if (num_threads == 0)
+    num_threads = std::thread::hardware_concurrency();
+  if (num_threads == 0)
+    num_threads = 1;
+
+  if (chunk_size == 0)
+    chunk_size = std::max<std::size_t>(1, (end - begin + num_threads - 1) / num_threads);
+
+  std::atomic<size_t> next_begin{begin};
+  std::vector<std::thread> workers;
+  workers.reserve(num_threads);
+
+  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
+    workers.emplace_back([&, chunk_size]() {
+      while (true) {
+        size_t start = next_begin.fetch_add(chunk_size, std::memory_order_relaxed);
+        if (start >= end)
+          break;
+
+        size_t stop = std::min(end, start + chunk_size);
+        chunk_processor(start, stop);
+      }
+    });
+  }
+
+  for (auto &worker : workers)
+    worker.join();
+}
+
+} // namespace utils
diff --git a/include/perf_timer.h b/include/perf_timer.h
new file mode 100644
index 0000000..7bc5843
--- /dev/null
+++ b/include/perf_timer.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <chrono>
+#include <ostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace dbscan {
+
+struct PerfTimingEntry {
+  std::string label;
+  double duration_ms;
+};
+
+struct PerfTiming {
+  void clear() { entries_.clear(); }
+
+  void add(std::string label, double duration_ms) {
+    entries_.push_back({std::move(label), duration_ms});
+  }
+
+  [[nodiscard]] const std::vector<PerfTimingEntry> &entries() const { return entries_; }
+
+private:
+  std::vector<PerfTimingEntry> entries_;
+
+  friend class ScopedTimer;
+};
+
+class ScopedTimer {
+public:
+  ScopedTimer(std::string label, PerfTiming &sink)
+      : sink_(sink), label_(std::move(label)), start_(std::chrono::steady_clock::now()) {}
+
+  ScopedTimer(const ScopedTimer &) = delete;
+  ScopedTimer &operator=(const ScopedTimer &) = delete;
+
+  ~ScopedTimer() {
+    const auto end = std::chrono::steady_clock::now();
+    const auto elapsed = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(end - start_);
+    sink_.add(std::move(label_), elapsed.count());
+  }
+
+private:
+  PerfTiming &sink_;
+  std::string label_;
+  std::chrono::steady_clock::time_point start_;
+};
+
+inline std::ostream &operator<<(std::ostream &out, const PerfTimingEntry &entry) {
+  out << entry.label << ": " << entry.duration_ms << " ms";
+  return out;
+}
+
+} // namespace dbscan
diff --git a/scripts/generate_test_data.py b/scripts/generate_test_data.py
new file mode 100755
index 0000000..fbb362c
--- /dev/null
+++ b/scripts/generate_test_data.py
@@ -0,0 +1,180 @@
+#!/usr/bin/env -S uv run
+# /// script
+# dependencies = ["numpy", "scikit-learn"]
+# ///
+
+import argparse
+from pathlib import Path
+
+import numpy as np
+from sklearn.cluster import DBSCAN
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Generate synthetic DBSCAN test data with embedded Gaussian clusters."
+    )
+    parser.add_argument(
+        "--uniform-count",
+        type=int,
+        default=200_000,
+        help="Number of background points drawn from a uniform distribution (default: 200_000).",
+    )
+    parser.add_argument(
+        "--cluster-count",
+        type=int,
+        default=100,
+        help="Number of Gaussian clusters to sprinkle into the dataset (default: 100).",
+    )
+    parser.add_argument(
+        "--points-per-cluster",
+        type=int,
+        default=256,
+        help="Number of points sampled for each Gaussian cluster (default: 256).",
+    )
+    parser.add_argument(
+        "--area-width",
+        type=int,
+        default=1_000_000,
+        help="Width/height of the square area measured in pixels (default: 1_000_000).",
+    )
+    parser.add_argument(
+        "--cluster-sigma",
+        type=float,
+        default=50.0 / 3.0,
+        help="Standard deviation for the Gaussian clusters in pixels (default: 50/3).",
+    )
+    parser.add_argument(
+        "--eps",
+        type=float,
+        default=60.0,
+        help="DBSCAN epsilon radius in pixels for scikit-learn clustering (default: 60).",
+    )
+    parser.add_argument(
+        "--metric",
+        type=str,
+        default="l2",
+        help="Distance metric: 'l2' (Euclidean) or 'l1' (Manhattan).",
+    )
+    parser.add_argument(
+        "--min-samples",
+        type=int,
+        default=16,
+        help="DBSCAN min_samples parameter for scikit-learn clustering (default: 16).",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="Seed for the random number generator (default: 42).",
+    )
+    parser.add_argument(
+        "--data-file",
+        type=Path,
+        default=Path("data.bin"),
+        help="Path to the output coordinate binary file (default: data.bin).",
+    )
+    parser.add_argument(
+        "--truth-file",
+        type=Path,
+        default=Path("truth.bin"),
+        help="Path to the output truth label binary file (default: truth.bin).",
+    )
+    return parser.parse_args()
+
+
+def generate_uniform_points(rng: np.random.Generator, count: int, width: int) -> np.ndarray:
+    if count <= 0:
+        return np.empty((0, 2), dtype=np.uint32)
+
+    coords = rng.uniform(0, width, size=(count, 2))
+    coords = np.rint(coords, casting="unsafe")  # round to nearest integer pixel
+    coords = np.clip(coords, 0, width - 1)
+    return coords.astype(np.uint32)
+
+
+def generate_gaussian_clusters(
+    rng: np.random.Generator,
+    cluster_count: int,
+    points_per_cluster: int,
+    width: int,
+    sigma: float,
+) -> np.ndarray:
+    if cluster_count <= 0 or points_per_cluster <= 0:
+        return np.empty((0, 2), dtype=np.uint32)
+
+    centers = rng.uniform(0, width, size=(cluster_count, 2))
+    clusters = []
+
+    for center in centers:
+        samples = rng.normal(loc=center, scale=sigma, size=(points_per_cluster, 2))
+        samples = np.rint(samples, casting="unsafe")
+        samples = np.clip(samples, 0, width - 1)
+        clusters.append(samples.astype(np.uint32))
+
+    if not clusters:
+        return np.empty((0, 2), dtype=np.uint32)
+
+    return np.vstack(clusters)
+
+
+def write_data_file(path: Path, coords_yx: np.ndarray) -> None:
+    coords_yx.astype(np.uint32).tofile(path)
+
+
+def write_truth_file(path: Path, labels: np.ndarray) -> None:
+    labels.astype(np.int32).tofile(path)
+
+
+def main() -> None:
+    args = parse_args()
+    rng = np.random.default_rng(args.seed)
+
+    uniform_coords = generate_uniform_points(rng, args.uniform_count, args.area_width)
+    cluster_coords = generate_gaussian_clusters(
+        rng,
+        cluster_count=args.cluster_count,
+        points_per_cluster=args.points_per_cluster,
+        width=args.area_width,
+        sigma=args.cluster_sigma,
+    )
+
+    if uniform_coords.size == 0 and cluster_coords.size == 0:
+        raise ValueError("No points generated; adjust the generator parameters.")
+
+    # Concatenate and shuffle to avoid ordering artifacts.
+    all_coords_yx = np.vstack([uniform_coords, cluster_coords])
+    rng.shuffle(all_coords_yx, axis=0)
+
+    # Prepare float coordinates for clustering (x, y order for scikit-learn).
+    coords_xy_float = all_coords_yx[:, [1, 0]].astype(np.float64)
+
+    metric = args.metric.lower()
+    if metric == "l1":
+        sklearn_metric = "cityblock"
+    elif metric == "l2":
+        sklearn_metric = "euclidean"
+    else:
+        raise ValueError("Unsupported metric. Choose 'l1' or 'l2'.")
+
+    dbscan = DBSCAN(eps=args.eps, min_samples=args.min_samples, metric=sklearn_metric, n_jobs=-1)
+    labels = dbscan.fit_predict(coords_xy_float)
+
+    # Persist data in requested layout: (y, x) pairs as uint32.
+    write_data_file(args.data_file, all_coords_yx)
+    write_truth_file(args.truth_file, labels)
+
+    unique_labels, counts = np.unique(labels, return_counts=True)
+    cluster_labels = unique_labels[unique_labels != -1]
+    noise_count = counts[unique_labels == -1].sum() if -1 in unique_labels else 0
+
+    print(f"Generated {all_coords_yx.shape[0]} total points.")
+    print(f"Uniform points: {uniform_coords.shape[0]}, clustered points: {cluster_coords.shape[0]}.")
+    print(
+        f"DBSCAN ({metric.upper()}) discovered {cluster_labels.size} clusters and {noise_count} noise points."
+    )
+    print(f"Data written to {args.data_file.resolve()} and labels to {args.truth_file.resolve()}.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/dbscan_grid2d_l1.cpp b/src/dbscan_grid2d_l1.cpp
new file mode 100644
index 0000000..8e3cdc6
--- /dev/null
+++ b/src/dbscan_grid2d_l1.cpp
@@ -0,0 +1,23 @@
+#include "dbscan_grid2d_l1.h"
+
+#include <stdexcept>
+
+#include "dbscan_grid2d_l1_impl.hpp"
+
+namespace dbscan {
+
+DBSCANGrid2DL1Result dbscan_grid2d_l1(const uint32_t *x, std::size_t x_stride, const uint32_t *y, std::size_t y_stride,
+                                      std::size_t count, const DBSCANGrid2DL1Params &params, GridExpansionMode mode) {
+  if (params.eps == 0)
+    throw std::invalid_argument("eps must be greater than zero for dbscan_grid2d_l1");
+  if (params.min_samples == 0)
+    throw std::invalid_argument("min_samples must be greater than zero for dbscan_grid2d_l1");
+  if (count != 0 && (x == nullptr || y == nullptr))
+    throw std::invalid_argument("Input coordinate arrays must be non-null when count > 0");
+  if (count != 0 && (x_stride == 0 || y_stride == 0))
+    throw std::invalid_argument("Stride must be positive when count > 0");
+
+  return detail::dbscan_grid2d_l1_impl(x, x_stride, y, y_stride, count, params, mode);
+}
+
+} // namespace dbscan
diff --git a/src/dbscan_grid2d_l1_aos.cpp b/src/dbscan_grid2d_l1_aos.cpp
new file mode 100644
index 0000000..15cc900
--- /dev/null
+++ b/src/dbscan_grid2d_l1_aos.cpp
@@ -0,0 +1,25 @@
+#include "dbscan_grid2d_l1.h"
+
+#include <cstddef>
+#include <stdexcept>
+
+namespace dbscan {
+
+DBSCANGrid2DL1Result dbscan_grid2d_l1_aos(const Grid2DPoint *points, std::size_t count,
+                                          const DBSCANGrid2DL1Params &params, GridExpansionMode mode) {
+  if (params.eps == 0)
+    throw std::invalid_argument("eps must be greater than zero for dbscan_grid2d_l1_aos");
+  if (params.min_samples == 0)
+    throw std::invalid_argument("min_samples must be greater than zero for dbscan_grid2d_l1_aos");
+  if (count != 0 && points == nullptr)
+    throw std::invalid_argument("Input point array must be non-null when count > 0");
+
+  static_assert(sizeof(Grid2DPoint) == sizeof(uint32_t) * 2, "Grid2DPoint is expected to be tightly packed");
+  const std::size_t stride = sizeof(Grid2DPoint) / sizeof(uint32_t);
+
+  const uint32_t *x = count == 0 ? nullptr : &points[0].x;
+  const uint32_t *y = count == 0 ? nullptr : &points[0].y;
+  return dbscan_grid2d_l1(x, stride, y, stride, count, params, mode);
+}
+
+} // namespace dbscan
diff --git a/src/dbscan_grid2d_l1_impl.hpp b/src/dbscan_grid2d_l1_impl.hpp
new file mode 100644
index 0000000..afa8722
--- /dev/null
+++ b/src/dbscan_grid2d_l1_impl.hpp
@@ -0,0 +1,459 @@
+#pragma once
+
+#include "dbscan_grid2d_l1.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <mutex>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "parallel.hpp"
+#include "perf_timer.h"
+
+namespace dbscan::detail {
+
+constexpr uint64_t pack_cell(uint32_t ix, uint32_t iy) noexcept {
+  return (static_cast<uint64_t>(ix) << 32U) | static_cast<uint64_t>(iy);
+}
+
+constexpr uint32_t cell_of(uint32_t value, uint32_t cell_size) noexcept {
+  return cell_size == 0 ? value : static_cast<uint32_t>(value / cell_size);
+}
+
+inline uint32_t load_coord(const uint32_t *ptr, std::size_t stride, std::size_t index) noexcept {
+  return *(ptr + index * stride);
+}
+
+template <typename Fn>
+void for_each_neighbor(uint32_t point_index, uint32_t eps, const uint32_t *x, std::size_t x_stride,
+                       const uint32_t *y, std::size_t y_stride, const std::vector<uint32_t> &cell_x,
+                       const std::vector<uint32_t> &cell_y, const std::vector<uint32_t> &ordered_indices,
+                       const std::vector<std::size_t> &cell_offsets, const std::vector<uint64_t> &unique_keys,
+                       Fn &&fn) {
+
+  const uint32_t base_cx = cell_x[point_index];
+  const uint32_t base_cy = cell_y[point_index];
+  const uint32_t x_a = load_coord(x, x_stride, point_index);
+  const uint32_t y_a = load_coord(y, y_stride, point_index);
+
+  for (int dx = -1; dx <= 1; ++dx) {
+    const int64_t nx = static_cast<int64_t>(base_cx) + dx;
+    if (nx < 0)
+      continue;
+
+    for (int dy = -1; dy <= 1; ++dy) {
+      const int64_t ny = static_cast<int64_t>(base_cy) + dy;
+      if (ny < 0)
+        continue;
+
+      const uint64_t key = pack_cell(static_cast<uint32_t>(nx), static_cast<uint32_t>(ny));
+      const auto it = std::lower_bound(unique_keys.begin(), unique_keys.end(), key);
+      if (it == unique_keys.end() || *it != key)
+        continue;
+
+      const std::size_t cell_idx = static_cast<std::size_t>(std::distance(unique_keys.begin(), it));
+      const std::size_t begin = cell_offsets[cell_idx];
+      const std::size_t end = cell_offsets[cell_idx + 1];
+
+      for (std::size_t pos = begin; pos < end; ++pos) {
+        const uint32_t neighbor_idx = ordered_indices[pos];
+
+        const uint32_t x_b = load_coord(x, x_stride, neighbor_idx);
+        const uint32_t y_b = load_coord(y, y_stride, neighbor_idx);
+
+        const uint32_t dx_abs = x_a > x_b ? x_a - x_b : x_b - x_a;
+        const uint32_t dy_abs = y_a > y_b ? y_a - y_b : y_b - y_a;
+        const uint64_t manhattan = static_cast<uint64_t>(dx_abs) + static_cast<uint64_t>(dy_abs);
+
+        if (manhattan <= static_cast<uint64_t>(eps)) {
+          if (!fn(neighbor_idx))
+            return;
+        }
+      }
+    }
+  }
+}
+
+struct ExpansionContext {
+  const uint32_t *x;
+  std::size_t x_stride;
+  const uint32_t *y;
+  std::size_t y_stride;
+  std::size_t count;
+  uint32_t eps;
+  uint32_t min_samples;
+  const std::vector<uint32_t> &cell_x;
+  const std::vector<uint32_t> &cell_y;
+  const std::vector<uint32_t> &ordered_indices;
+  const std::vector<std::size_t> &cell_offsets;
+  const std::vector<uint64_t> &unique_keys;
+  const std::vector<uint8_t> &is_core;
+  std::size_t num_threads;
+  std::size_t chunk_size;
+};
+
+void sequential_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels) {
+  std::vector<uint32_t> stack;
+  stack.reserve(ctx.count);
+  std::vector<uint32_t> neighbor_buffer;
+  neighbor_buffer.reserve(64);
+
+  int32_t next_label = 0;
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (!ctx.is_core[i] || labels[i] != -1)
+      continue;
+
+    labels[i] = next_label;
+    stack.clear();
+    stack.push_back(static_cast<uint32_t>(i));
+
+    while (!stack.empty()) {
+      const uint32_t current = stack.back();
+      stack.pop_back();
+
+      neighbor_buffer.clear();
+      for_each_neighbor(current, ctx.eps, ctx.x, ctx.x_stride, ctx.y, ctx.y_stride, ctx.cell_x, ctx.cell_y,
+                        ctx.ordered_indices, ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) {
+                          neighbor_buffer.push_back(neighbor);
+                          return true;
+                        });
+
+      for (uint32_t neighbor : neighbor_buffer) {
+        if (labels[neighbor] == -1) {
+          labels[neighbor] = next_label;
+          if (ctx.is_core[neighbor])
+            stack.push_back(neighbor);
+        }
+      }
+    }
+
+    ++next_label;
+  }
+}
+
+void frontier_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels) {
+  std::vector<std::atomic<int32_t>> shared_labels(ctx.count);
+  for (std::size_t i = 0; i < ctx.count; ++i)
+    shared_labels[i].store(labels[i], std::memory_order_relaxed);
+
+  int32_t next_label = 0;
+  std::vector<uint32_t> frontier;
+  frontier.reserve(256);
+
+  const std::size_t frontier_chunk = ctx.chunk_size == 0 ? 64 : ctx.chunk_size;
+
+  for (std::size_t seed = 0; seed < ctx.count; ++seed) {
+    if (!ctx.is_core[seed] || shared_labels[seed].load(std::memory_order_acquire) != -1)
+      continue;
+
+    const int32_t label = next_label++;
+    shared_labels[seed].store(label, std::memory_order_release);
+    frontier.clear();
+    frontier.push_back(static_cast<uint32_t>(seed));
+
+    while (!frontier.empty()) {
+      std::vector<uint32_t> next_frontier;
+      std::mutex next_mutex;
+
+      utils::parallelize(0, frontier.size(), ctx.num_threads, frontier_chunk, [&](std::size_t begin, std::size_t end) {
+        std::vector<uint32_t> local_next;
+        local_next.reserve(32);
+        std::vector<uint32_t> neighbor_buffer;
+        neighbor_buffer.reserve(64);
+
+        for (std::size_t idx = begin; idx < end; ++idx) {
+          const uint32_t current = frontier[idx];
+
+          neighbor_buffer.clear();
+          for_each_neighbor(current, ctx.eps, ctx.x, ctx.x_stride, ctx.y, ctx.y_stride, ctx.cell_x, ctx.cell_y,
+                            ctx.ordered_indices, ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) {
+                              neighbor_buffer.push_back(neighbor);
+                              return true;
+                            });
+
+          for (uint32_t neighbor : neighbor_buffer) {
+            if (ctx.is_core[neighbor]) {
+              int32_t expected = -1;
+              if (shared_labels[neighbor].compare_exchange_strong(expected, label, std::memory_order_acq_rel))
+                local_next.push_back(neighbor);
+            } else {
+              int32_t expected = -1;
+              shared_labels[neighbor].compare_exchange_strong(expected, label, std::memory_order_acq_rel);
+            }
+          }
+        }
+
+        if (!local_next.empty()) {
+          std::sort(local_next.begin(), local_next.end());
+          local_next.erase(std::unique(local_next.begin(), local_next.end()), local_next.end());
+          std::lock_guard<std::mutex> lock(next_mutex);
+          next_frontier.insert(next_frontier.end(), local_next.begin(), local_next.end());
+        }
+      });
+
+      if (next_frontier.empty())
+        break;
+
+      std::sort(next_frontier.begin(), next_frontier.end());
+      next_frontier.erase(std::unique(next_frontier.begin(), next_frontier.end()), next_frontier.end());
+      frontier.swap(next_frontier);
+    }
+  }
+
+  for (std::size_t i = 0; i < ctx.count; ++i)
+    labels[i] = shared_labels[i].load(std::memory_order_relaxed);
+}
+
+void union_find_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels) {
+  constexpr uint32_t invalid = std::numeric_limits<uint32_t>::max();
+  std::vector<std::atomic<uint32_t>> parents(ctx.count);
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (ctx.is_core[i])
+      parents[i].store(static_cast<uint32_t>(i), std::memory_order_relaxed);
+    else
+      parents[i].store(invalid, std::memory_order_relaxed);
+  }
+
+  auto find_root = [&](uint32_t node) {
+    uint32_t parent = parents[node].load(std::memory_order_acquire);
+    if (parent == invalid)
+      return invalid;
+
+    while (true) {
+      uint32_t grandparent = parents[parent].load(std::memory_order_acquire);
+      if (grandparent == parent) {
+        if (parent != node)
+          parents[node].store(parent, std::memory_order_release);
+        return parent;
+      }
+      parents[node].compare_exchange_strong(parent, grandparent, std::memory_order_acq_rel);
+      node = parent;
+      parent = parents[node].load(std::memory_order_acquire);
+      if (parent == invalid)
+        return invalid;
+    }
+  };
+
+  auto unite = [&](uint32_t a, uint32_t b) {
+    while (true) {
+      a = find_root(a);
+      b = find_root(b);
+      if (a == invalid || b == invalid || a == b)
+        return;
+
+      if (a < b) {
+        uint32_t expected = b;
+        if (parents[b].compare_exchange_strong(expected, a, std::memory_order_acq_rel))
+          return;
+      } else {
+        uint32_t expected = a;
+        if (parents[a].compare_exchange_strong(expected, b, std::memory_order_acq_rel))
+          return;
+      }
+    }
+  };
+
+  const std::size_t union_chunk = ctx.chunk_size == 0 ? 512 : ctx.chunk_size;
+  utils::parallelize(0, ctx.count, ctx.num_threads, union_chunk, [&](std::size_t begin, std::size_t end) {
+    for (std::size_t idx = begin; idx < end; ++idx) {
+      if (!ctx.is_core[idx])
+        continue;
+
+      for_each_neighbor(static_cast<uint32_t>(idx), ctx.eps, ctx.x, ctx.x_stride, ctx.y, ctx.y_stride, ctx.cell_x,
+                        ctx.cell_y, ctx.ordered_indices, ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) {
+                          if (ctx.is_core[neighbor])
+                            unite(static_cast<uint32_t>(idx), neighbor);
+                          return true;
+                        });
+    }
+  });
+
+  std::vector<uint32_t> root_for_point(ctx.count, invalid);
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (ctx.is_core[i])
+      root_for_point[i] = find_root(static_cast<uint32_t>(i));
+  }
+
+  std::vector<uint32_t> component_min(ctx.count, invalid);
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (!ctx.is_core[i])
+      continue;
+    const uint32_t root = root_for_point[i];
+    if (root == invalid)
+      continue;
+    if (component_min[root] > i)
+      component_min[root] = static_cast<uint32_t>(i);
+  }
+
+  std::vector<std::pair<uint32_t, uint32_t>> components;
+  components.reserve(ctx.count);
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (component_min[i] != invalid)
+      components.emplace_back(component_min[i], static_cast<uint32_t>(i));
+  }
+
+  std::sort(components.begin(), components.end());
+
+  std::vector<int32_t> root_label(ctx.count, -1);
+  int32_t next_label = 0;
+  for (const auto &[min_index, root] : components) {
+    (void)min_index;
+    root_label[root] = next_label++;
+  }
+
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (!ctx.is_core[i])
+      continue;
+    const uint32_t root = root_for_point[i];
+    if (root == invalid)
+      continue;
+    labels[i] = root_label[root];
+  }
+
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (ctx.is_core[i])
+      continue;
+
+    int32_t best_label = -1;
+    for_each_neighbor(static_cast<uint32_t>(i), ctx.eps, ctx.x, ctx.x_stride, ctx.y, ctx.y_stride, ctx.cell_x,
+                      ctx.cell_y, ctx.ordered_indices, ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) {
+                        if (!ctx.is_core[neighbor])
+                          return true;
+                        const int32_t candidate = labels[neighbor];
+                        if (candidate != -1 && (best_label == -1 || candidate < best_label))
+                          best_label = candidate;
+                        return true;
+                      });
+    labels[i] = best_label;
+  }
+}
+
+inline DBSCANGrid2DL1Result dbscan_grid2d_l1_impl(const uint32_t *x, std::size_t x_stride, const uint32_t *y,
+                                                  std::size_t y_stride, std::size_t count,
+                                                  const DBSCANGrid2DL1Params &params,
+                                                  GridExpansionMode expansion_mode) {
+  DBSCANGrid2DL1Result result;
+  result.perf_timing.clear();
+  if (count == 0)
+    return result;
+
+  ScopedTimer total_timer("total", result.perf_timing);
+
+  const uint32_t cell_size = params.eps;
+
+  std::vector<uint32_t> cell_x(count);
+  std::vector<uint32_t> cell_y(count);
+  std::vector<uint64_t> keys(count);
+  std::vector<uint32_t> ordered_indices(count);
+  std::iota(ordered_indices.begin(), ordered_indices.end(), 0U);
+
+  const std::size_t index_chunk = params.chunk_size == 0 ? 1024 : params.chunk_size;
+  {
+    ScopedTimer timer("precompute_cells", result.perf_timing);
+    utils::parallelize(0, count, params.num_threads, index_chunk, [&](std::size_t begin, std::size_t end) {
+      for (std::size_t i = begin; i < end; ++i) {
+        const uint32_t cx = cell_of(load_coord(x, x_stride, i), cell_size);
+        const uint32_t cy = cell_of(load_coord(y, y_stride, i), cell_size);
+        cell_x[i] = cx;
+        cell_y[i] = cy;
+        keys[i] = pack_cell(cx, cy);
+      }
+    });
+  }
+
+  {
+    ScopedTimer timer("sort_indices", result.perf_timing);
+    std::sort(ordered_indices.begin(), ordered_indices.end(), [&](uint32_t lhs, uint32_t rhs) {
+      const uint64_t key_lhs = keys[lhs];
+      const uint64_t key_rhs = keys[rhs];
+      if (key_lhs == key_rhs)
+        return lhs < rhs;
+      return key_lhs < key_rhs;
+    });
+  }
+
+  std::vector<std::size_t> cell_offsets;
+  cell_offsets.reserve(count + 1);
+  std::vector<uint64_t> unique_keys;
+  unique_keys.reserve(count);
+
+  {
+    ScopedTimer timer("build_cell_offsets", result.perf_timing);
+    std::size_t pos = 0;
+    while (pos < count) {
+      const uint64_t key = keys[ordered_indices[pos]];
+      unique_keys.push_back(key);
+      cell_offsets.push_back(pos);
+
+      do {
+        ++pos;
+      } while (pos < count && keys[ordered_indices[pos]] == key);
+    }
+    cell_offsets.push_back(count);
+  }
+
+  result.labels.assign(count, -1);
+  std::vector<uint8_t> is_core(count, 0U);
+
+  const uint32_t eps_value = params.eps;
+  const uint32_t min_samples_value = params.min_samples;
+
+  const std::size_t core_chunk = params.chunk_size == 0 ? 512 : params.chunk_size;
+  {
+    ScopedTimer timer("core_detection", result.perf_timing);
+    utils::parallelize(0, count, params.num_threads, core_chunk, [&](std::size_t begin, std::size_t end) {
+      for (std::size_t idx = begin; idx < end; ++idx) {
+        uint32_t neighbor_count = 0;
+        for_each_neighbor(static_cast<uint32_t>(idx), eps_value, x, x_stride, y, y_stride, cell_x, cell_y,
+                          ordered_indices, cell_offsets, unique_keys, [&](uint32_t) {
+                            ++neighbor_count;
+                            return neighbor_count < min_samples_value;
+                          });
+
+        if (neighbor_count >= min_samples_value)
+          is_core[idx] = 1U;
+      }
+    });
+  }
+
+  const ExpansionContext context{x,
+                                 x_stride,
+                                 y,
+                                 y_stride,
+                                 count,
+                                 eps_value,
+                                 min_samples_value,
+                                 cell_x,
+                                 cell_y,
+                                 ordered_indices,
+                                 cell_offsets,
+                                 unique_keys,
+                                 is_core,
+                                 params.num_threads,
+                                 params.chunk_size};
+
+  {
+    ScopedTimer timer("cluster_expansion", result.perf_timing);
+    switch (expansion_mode) {
+    case GridExpansionMode::Sequential:
+      sequential_expand(context, result.labels);
+      break;
+    case GridExpansionMode::FrontierParallel:
+      frontier_expand(context, result.labels);
+      break;
+    case GridExpansionMode::UnionFind:
+      union_find_expand(context, result.labels);
+      break;
+    }
+  }
+
+  return result;
+}
+
+} // namespace dbscan::detail
+
diff --git a/tests/data/dbscan_static_data.bin b/tests/data/dbscan_static_data.bin
new file mode 100644
index 0000000..0fe4113
Binary files /dev/null and b/tests/data/dbscan_static_data.bin differ
diff --git a/tests/data/dbscan_static_truth.bin b/tests/data/dbscan_static_truth.bin
new file mode 100644
index 0000000..64aa8ea
Binary files /dev/null and b/tests/data/dbscan_static_truth.bin differ
diff --git a/tests/test_dbscan_grid2d_l1.cpp b/tests/test_dbscan_grid2d_l1.cpp
new file mode 100644
index 0000000..034240f
--- /dev/null
+++ b/tests/test_dbscan_grid2d_l1.cpp
@@ -0,0 +1,144 @@
+#include "dbscan_grid2d_l1.h"
+
+#include <catch2/catch_test_macros.hpp>
+
+#include <cstdint>
+#include <filesystem>
+#include <fstream>
+#include <vector>
+
+namespace {
+
+std::filesystem::path fixture_root() {
+  const std::filesystem::path candidates[] = {std::filesystem::path{"tests"} / "data",
+                                              std::filesystem::path{".."} / "tests" / "data",
+                                              std::filesystem::path{".."} / ".." / "tests" / "data"};
+  for (const auto &candidate : candidates) {
+    if (std::filesystem::exists(candidate))
+      return candidate;
+  }
+  return candidates[0];
+}
+
+// The binary fixtures store Y first then X to mirror the grid compaction used in production; this loader
+// preserves that ordering so the test exercises identical memory access patterns as the runtime path.
+void load_fixture(const std::filesystem::path &data_path, const std::filesystem::path &truth_path,
+                  std::vector<uint32_t> &x_out, std::vector<uint32_t> &y_out, std::vector<int32_t> &truth_out) {
+  std::ifstream data_stream(data_path, std::ios::binary);
+  REQUIRE(data_stream.good());
+
+  data_stream.seekg(0, std::ios::end);
+  const auto data_size = data_stream.tellg();
+  data_stream.seekg(0, std::ios::beg);
+
+  REQUIRE(data_size % (sizeof(uint32_t) * 2) == 0);
+  const std::size_t point_count = static_cast<std::size_t>(data_size) / (sizeof(uint32_t) * 2);
+
+  x_out.assign(point_count, 0U);
+  y_out.assign(point_count, 0U);
+
+  for (std::size_t i = 0; i < point_count; ++i) {
+    uint32_t y = 0;
+    uint32_t x = 0;
+    data_stream.read(reinterpret_cast<char *>(&y), sizeof(uint32_t));
+    data_stream.read(reinterpret_cast<char *>(&x), sizeof(uint32_t));
+    REQUIRE(data_stream.good());
+    y_out[i] = y;
+    x_out[i] = x;
+  }
+
+  std::ifstream truth_stream(truth_path, std::ios::binary);
+  REQUIRE(truth_stream.good());
+
+  truth_stream.seekg(0, std::ios::end);
+  const auto truth_size = truth_stream.tellg();
+  truth_stream.seekg(0, std::ios::beg);
+
+  REQUIRE(truth_size % sizeof(int32_t) == 0);
+  const std::size_t truth_count = static_cast<std::size_t>(truth_size) / sizeof(int32_t);
+  REQUIRE(truth_count == point_count);
+
+  truth_out.assign(truth_count, 0);
+  truth_stream.read(reinterpret_cast<char *>(truth_out.data()),
+                    static_cast<std::streamsize>(truth_count * sizeof(int32_t)));
+  REQUIRE(truth_stream.good());
+}
+
+std::vector<dbscan::Grid2DPoint> make_aos(const std::vector<uint32_t> &x, const std::vector<uint32_t> &y) {
+  std::vector<dbscan::Grid2DPoint> points(x.size());
+  for (std::size_t i = 0; i < points.size(); ++i)
+    points[i] = dbscan::Grid2DPoint{x[i], y[i]};
+  return points;
+}
+
+} // namespace
+
+TEST_CASE("DBSCANGrid2D_L1 clusters dense neighbors", "[dbscan][grid_l1]") {
+  const std::vector<uint32_t> x = {0, 1, 2, 100};
+  const std::vector<uint32_t> y = {0, 0, 1, 200};
+
+  const dbscan::DBSCANGrid2DL1Params params{4, 3};
+  const auto soa_result = dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params);
+  const auto &labels = soa_result.labels;
+
+  REQUIRE(labels.size() == x.size());
+  REQUIRE(labels[0] == labels[1]);
+  REQUIRE(labels[1] == labels[2]);
+  REQUIRE(labels[0] != -1);
+  REQUIRE(labels[3] == -1);
+
+  const auto aos_points = make_aos(x, y);
+  const auto aos_result = dbscan::dbscan_grid2d_l1_aos(aos_points.data(), aos_points.size(), params);
+  REQUIRE(aos_result.labels == labels);
+}
+
+TEST_CASE("DBSCANGrid2D_L1 respects min_samples threshold", "[dbscan][grid_l1]") {
+  const std::vector<uint32_t> coords = {0, 2, 4};
+  const dbscan::DBSCANGrid2DL1Params params{3, 4};
+  const auto result = dbscan::dbscan_grid2d_l1(coords.data(), 1, coords.data(), 1, coords.size(), params);
+  REQUIRE(result.labels.size() == coords.size());
+  for (int32_t label : result.labels) {
+    REQUIRE(label == -1);
+  }
+}
+
+TEST_CASE("DBSCANGrid2D_L1 matches fixture truth", "[dbscan][grid_l1]") {
+  const std::filesystem::path root = fixture_root();
+  const auto data_path = root / "dbscan_static_data.bin";
+  const auto truth_path = root / "dbscan_static_truth.bin";
+
+  std::vector<uint32_t> x;
+  std::vector<uint32_t> y;
+  std::vector<int32_t> truth;
+  load_fixture(data_path, truth_path, x, y, truth);
+
+  const dbscan::DBSCANGrid2DL1Params params{10, 3};
+  const auto sequential = dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params);
+  REQUIRE(sequential.labels == truth);
+
+  const auto frontier =
+      dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params, dbscan::GridExpansionMode::FrontierParallel);
+  REQUIRE(frontier.labels == truth);
+
+  const auto union_find =
+      dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params, dbscan::GridExpansionMode::UnionFind);
+  REQUIRE(union_find.labels == truth);
+}
+
+TEST_CASE("DBSCANGrid2D_L1 parallel variants align with sequential", "[dbscan][grid_l1][parallel]") {
+  const std::vector<uint32_t> x = {0, 1, 2, 5, 40};
+  const std::vector<uint32_t> y = {0, 0, 1, 4, 80};
+
+  dbscan::DBSCANGrid2DL1Params params{6, 3};
+  params.num_threads = 4;
+
+  const auto sequential = dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params);
+
+  const auto frontier =
+      dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params, dbscan::GridExpansionMode::FrontierParallel);
+  REQUIRE(frontier.labels == sequential.labels);
+
+  const auto union_find =
+      dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params, dbscan::GridExpansionMode::UnionFind);
+  REQUIRE(union_find.labels == sequential.labels);
+}
diff --git a/tests/test_parallelize.cpp b/tests/test_parallelize.cpp
new file mode 100644
index 0000000..13c2650
--- /dev/null
+++ b/tests/test_parallelize.cpp
@@ -0,0 +1,56 @@
+#include "parallel.hpp"
+
+#include <atomic>
+#include <catch2/catch_test_macros.hpp>
+#include <cstddef>
+#include <numeric>
+#include <vector>
+
+TEST_CASE("parallelize processes full range", "[parallel]") {
+  constexpr std::size_t N = 10'000;
+  std::vector<std::atomic<bool>> visited(N);
+  for (auto &flag : visited)
+    flag.store(false, std::memory_order_relaxed);
+
+  utils::parallelize(0, N, 4, 128, [&](std::size_t begin, std::size_t end) {
+    for (std::size_t i = begin; i < end; ++i) {
+      bool expected = false;
+      REQUIRE(visited[i].compare_exchange_strong(expected, true, std::memory_order_relaxed));
+    }
+  });
+
+  for (const auto &flag : visited) {
+    REQUIRE(flag.load(std::memory_order_relaxed));
+  }
+}
+
+TEST_CASE("parallelize handles uneven chunks", "[parallel]") {
+  constexpr std::size_t N = 1'023;
+  std::vector<std::atomic<int>> counts(N);
+  for (auto &value : counts)
+    value.store(0, std::memory_order_relaxed);
+
+  utils::parallelize(0, N, 3, 100, [&](std::size_t begin, std::size_t end) {
+    for (std::size_t i = begin; i < end; ++i) {
+      counts[i].fetch_add(1, std::memory_order_relaxed);
+    }
+  });
+
+  for (const auto &value : counts) {
+    REQUIRE(value.load(std::memory_order_relaxed) == 1);
+  }
+}
+
+TEST_CASE("parallelize default chunk size", "[parallel]") {
+  constexpr std::size_t N = 5'000;
+  std::vector<int> out(N, 0);
+
+  utils::parallelize(0, N, 8, 0, [&](std::size_t begin, std::size_t end) {
+    for (std::size_t i = begin; i < end; ++i) {
+      out[i] = static_cast<int>(i);
+    }
+  });
+
+  for (std::size_t i = 0; i < N; ++i)
+    REQUIRE(out[i] == static_cast<int>(i));
+}
diff --git a/tools/dbscan_dataset_validator.cpp b/tools/dbscan_dataset_validator.cpp
new file mode 100644
index 0000000..c132b4a
--- /dev/null
+++ b/tools/dbscan_dataset_validator.cpp
@@ -0,0 +1,527 @@
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "dbscan.h"
+#include "dbscan_grid2d_l1.h"
+#include "dbscan_optimized.h"
+
+namespace {
+
+struct Options {
+  std::filesystem::path data_path{"data.bin"};
+  std::filesystem::path truth_path{"truth.bin"};
+  double eps{60.0};
+  int32_t min_samples{16};
+
+  bool run_baseline{true};
+  bool run_optimized{true};
+  bool run_grid_l1{false};
+  std::vector<dbscan::GridExpansionMode> grid_modes;
+  std::optional<std::filesystem::path> mismatch_output_dir;
+};
+
+void print_usage(const char *program_name) {
+  std::cout << "Usage: " << program_name
+            << " [--data <data.bin>] [--truth <truth.bin>] [--eps <value>] [--min-samples <value>]"
+            << " [--impl baseline|optimized|grid|grid_frontier|grid_union|grid_all|both|all]"
+            << " [--dump-mismatches <directory>]\n";
+}
+
+Options parse_arguments(int argc, char **argv) {
+  Options options;
+
+  for (int i = 1; i < argc; ++i) {
+    const std::string arg{argv[i]};
+
+    if (arg == "--help" || arg == "-h") {
+      print_usage(argv[0]);
+      std::exit(0);
+    } else if (arg == "--data") {
+      if (i + 1 >= argc)
+        throw std::invalid_argument("--data expects a path argument");
+      options.data_path = argv[++i];
+    } else if (arg == "--truth") {
+      if (i + 1 >= argc)
+        throw std::invalid_argument("--truth expects a path argument");
+      options.truth_path = argv[++i];
+    } else if (arg == "--eps") {
+      if (i + 1 >= argc)
+        throw std::invalid_argument("--eps expects a numeric argument");
+      options.eps = std::stod(argv[++i]);
+    } else if (arg == "--min-samples") {
+      if (i + 1 >= argc)
+        throw std::invalid_argument("--min-samples expects an integer argument");
+      options.min_samples = static_cast<int32_t>(std::stoi(argv[++i]));
+    } else if (arg == "--impl") {
+      if (i + 1 >= argc)
+        throw std::invalid_argument(
+            "--impl expects one of: baseline, optimized, grid, grid_frontier, grid_union, grid_all, both, all");
+      const std::string value{argv[++i]};
+      if (value == "baseline") {
+        options.run_baseline = true;
+        options.run_optimized = false;
+        options.run_grid_l1 = false;
+        options.grid_modes.clear();
+      } else if (value == "optimized") {
+        options.run_baseline = false;
+        options.run_optimized = true;
+        options.run_grid_l1 = false;
+        options.grid_modes.clear();
+      } else if (value == "grid" || value == "grid_l1") {
+        options.run_baseline = false;
+        options.run_optimized = false;
+        options.run_grid_l1 = true;
+        options.grid_modes = {dbscan::GridExpansionMode::Sequential};
+      } else if (value == "grid_frontier") {
+        options.run_baseline = false;
+        options.run_optimized = false;
+        options.run_grid_l1 = true;
+        options.grid_modes = {dbscan::GridExpansionMode::FrontierParallel};
+      } else if (value == "grid_union" || value == "grid_union_find") {
+        options.run_baseline = false;
+        options.run_optimized = false;
+        options.run_grid_l1 = true;
+        options.grid_modes = {dbscan::GridExpansionMode::UnionFind};
+      } else if (value == "grid_all") {
+        options.run_baseline = false;
+        options.run_optimized = false;
+        options.run_grid_l1 = true;
+        options.grid_modes = {dbscan::GridExpansionMode::Sequential, dbscan::GridExpansionMode::FrontierParallel,
+                              dbscan::GridExpansionMode::UnionFind};
+      } else if (value == "both") {
+        options.run_baseline = true;
+        options.run_optimized = true;
+        options.run_grid_l1 = false;
+        options.grid_modes.clear();
+      } else if (value == "all") {
+        options.run_baseline = true;
+        options.run_optimized = true;
+        options.run_grid_l1 = true;
+        if (options.grid_modes.empty())
+          options.grid_modes = {dbscan::GridExpansionMode::Sequential};
+      } else {
+        throw std::invalid_argument(
+            "--impl expects one of: baseline, optimized, grid, grid_frontier, grid_union, grid_all, both, all");
+      }
+    } else if (arg == "--dump-mismatches") {
+      if (i + 1 >= argc)
+        throw std::invalid_argument("--dump-mismatches expects a directory path");
+      options.mismatch_output_dir = std::filesystem::path{argv[++i]};
+    } else {
+      throw std::invalid_argument("Unknown argument: " + arg);
+    }
+  }
+
+  if (options.eps <= 0.0)
+    throw std::invalid_argument("--eps must be positive");
+  if (options.min_samples <= 0)
+    throw std::invalid_argument("--min-samples must be positive");
+  if (options.run_grid_l1 && options.grid_modes.empty())
+    options.grid_modes = {dbscan::GridExpansionMode::Sequential};
+
+  return options;
+}
+
+std::vector<dbscan::Point<double>> load_points(const std::filesystem::path &path, std::vector<uint32_t> *x_out,
+                                               std::vector<uint32_t> *y_out) {
+  std::ifstream stream(path, std::ios::binary);
+  if (!stream)
+    throw std::runtime_error("Failed to open data file: " + path.string());
+
+  stream.seekg(0, std::ios::end);
+  const auto file_size = stream.tellg();
+  stream.seekg(0, std::ios::beg);
+
+  if (file_size % (sizeof(uint32_t) * 2) != 0)
+    throw std::runtime_error("Data file does not contain a whole number of (y, x) uint32 pairs: " + path.string());
+
+  const std::size_t num_values = static_cast<std::size_t>(file_size) / sizeof(uint32_t);
+  const std::size_t num_points = num_values / 2;
+
+  std::vector<uint32_t> raw(num_values);
+  stream.read(reinterpret_cast<char *>(raw.data()), static_cast<std::streamsize>(raw.size() * sizeof(uint32_t)));
+  if (!stream)
+    throw std::runtime_error("Failed to read data file: " + path.string());
+
+  std::vector<dbscan::Point<double>> points(num_points);
+  if (x_out)
+    x_out->assign(num_points, 0U);
+  if (y_out)
+    y_out->assign(num_points, 0U);
+
+  for (std::size_t i = 0; i < num_points; ++i) {
+    const uint32_t y = raw[2 * i];
+    const uint32_t x = raw[2 * i + 1];
+    points[i] = dbscan::Point<double>{static_cast<double>(x), static_cast<double>(y)};
+    if (x_out)
+      (*x_out)[i] = x;
+    if (y_out)
+      (*y_out)[i] = y;
+  }
+
+  return points;
+}
+
+std::vector<int32_t> load_labels(const std::filesystem::path &path) {
+  std::ifstream stream(path, std::ios::binary);
+  if (!stream)
+    throw std::runtime_error("Failed to open truth file: " + path.string());
+
+  stream.seekg(0, std::ios::end);
+  const auto file_size = stream.tellg();
+  stream.seekg(0, std::ios::beg);
+
+  if (file_size % sizeof(int32_t) != 0)
+    throw std::runtime_error("Truth file does not contain a whole number of int32 labels: " + path.string());
+
+  const std::size_t num_labels = static_cast<std::size_t>(file_size) / sizeof(int32_t);
+  std::vector<int32_t> labels(num_labels);
+  stream.read(reinterpret_cast<char *>(labels.data()), static_cast<std::streamsize>(labels.size() * sizeof(int32_t)));
+  if (!stream)
+    throw std::runtime_error("Failed to read truth file: " + path.string());
+
+  return labels;
+}
+
+std::size_t count_clusters(const std::vector<int32_t> &labels) {
+  std::unordered_set<int32_t> clusters;
+  clusters.reserve(labels.size());
+  for (int32_t label : labels) {
+    if (label != -1)
+      clusters.insert(label);
+  }
+  return clusters.size();
+}
+
+std::size_t count_noise(const std::vector<int32_t> &labels) {
+  return static_cast<std::size_t>(std::count(labels.begin(), labels.end(), -1));
+}
+
+struct LabelIndex {
+  std::unordered_map<int32_t, std::size_t> to_index;
+  std::vector<int32_t> values;
+};
+
+LabelIndex make_index(const std::vector<int32_t> &labels) {
+  LabelIndex index;
+  index.values.reserve(labels.size());
+  index.to_index.reserve(labels.size());
+
+  for (int32_t label : labels) {
+    if (!index.to_index.contains(label)) {
+      const std::size_t idx = index.values.size();
+      index.values.push_back(label);
+      index.to_index.emplace(label, idx);
+    }
+  }
+
+  return index;
+}
+
+double combination2(std::int64_t n) {
+  if (n <= 1)
+    return 0.0;
+  return static_cast<double>(n) * static_cast<double>(n - 1) / 2.0;
+}
+
+struct EvaluationMetrics {
+  double adjusted_rand{0.0};
+  double remapped_accuracy{0.0};
+  std::size_t mismatched_points{0};
+  std::size_t predicted_clusters{0};
+  std::size_t truth_clusters{0};
+  std::size_t predicted_noise{0};
+  std::size_t truth_noise{0};
+  bool passed{false};
+};
+
+EvaluationMetrics evaluate(const std::vector<int32_t> &predicted, const std::vector<int32_t> &truth,
+                           std::vector<std::size_t> *mismatch_indices = nullptr) {
+  if (predicted.size() != truth.size())
+    throw std::runtime_error("Predicted labels and truth labels must have the same length");
+
+  const std::size_t total_points = truth.size();
+  const auto predicted_index = make_index(predicted);
+  const auto truth_index = make_index(truth);
+
+  const std::size_t predicted_size = predicted_index.values.size();
+  const std::size_t truth_size = truth_index.values.size();
+
+  std::vector<std::int64_t> contingency(predicted_size * truth_size, 0);
+  std::vector<std::int64_t> predicted_counts(predicted_size, 0);
+  std::vector<std::int64_t> truth_counts(truth_size, 0);
+
+  for (std::size_t i = 0; i < total_points; ++i) {
+    const auto predicted_it = predicted_index.to_index.find(predicted[i]);
+    const auto truth_it = truth_index.to_index.find(truth[i]);
+    const std::size_t predicted_row = predicted_it->second;
+    const std::size_t truth_col = truth_it->second;
+
+    const std::size_t cell_index = predicted_row * truth_size + truth_col;
+    ++contingency[cell_index];
+    ++predicted_counts[predicted_row];
+    ++truth_counts[truth_col];
+  }
+
+  double sum_combination = 0.0;
+  for (std::int64_t count : contingency)
+    sum_combination += combination2(count);
+
+  double predicted_combination = 0.0;
+  for (std::int64_t count : predicted_counts)
+    predicted_combination += combination2(count);
+
+  double truth_combination = 0.0;
+  for (std::int64_t count : truth_counts)
+    truth_combination += combination2(count);
+
+  const double total_pairs = combination2(static_cast<std::int64_t>(total_points));
+  double expected_index = 0.0;
+  if (total_pairs > 0.0)
+    expected_index = (predicted_combination * truth_combination) / total_pairs;
+
+  const double max_index = 0.5 * (predicted_combination + truth_combination);
+  const double denominator = max_index - expected_index;
+
+  EvaluationMetrics metrics;
+  if (denominator == 0.0) {
+    metrics.adjusted_rand = 1.0;
+  } else {
+    metrics.adjusted_rand = (sum_combination - expected_index) / denominator;
+  }
+
+  std::unordered_map<int32_t, int32_t> remap;
+  remap.reserve(predicted_size);
+  for (std::size_t row = 0; row < predicted_size; ++row) {
+    const int32_t predicted_label = predicted_index.values[row];
+    if (predicted_label == -1) {
+      remap.emplace(predicted_label, -1);
+      continue;
+    }
+
+    const std::int64_t *row_ptr = contingency.data() + row * truth_size;
+    std::size_t best_col = 0;
+    std::int64_t best_count = -1;
+    for (std::size_t col = 0; col < truth_size; ++col) {
+      if (row_ptr[col] > best_count) {
+        best_count = row_ptr[col];
+        best_col = col;
+      }
+    }
+    remap.emplace(predicted_label, truth_index.values[best_col]);
+  }
+
+  if (mismatch_indices)
+    mismatch_indices->clear();
+
+  std::size_t matches = 0;
+  for (std::size_t i = 0; i < total_points; ++i) {
+    const int32_t predicted_label = predicted[i];
+    const auto mapping_it = remap.find(predicted_label);
+    const int32_t mapped_label = mapping_it != remap.end() ? mapping_it->second : predicted_label;
+    if (mapped_label == truth[i])
+      ++matches;
+    else if (mismatch_indices)
+      mismatch_indices->push_back(i);
+  }
+
+  metrics.remapped_accuracy =
+      total_points == 0 ? 1.0 : static_cast<double>(matches) / static_cast<double>(total_points);
+  metrics.mismatched_points = mismatch_indices ? mismatch_indices->size() : (total_points - matches);
+
+  metrics.predicted_clusters = count_clusters(predicted);
+  metrics.truth_clusters = count_clusters(truth);
+  metrics.predicted_noise = count_noise(predicted);
+  metrics.truth_noise = count_noise(truth);
+  metrics.passed = metrics.mismatched_points == 0 && metrics.predicted_clusters == metrics.truth_clusters;
+
+  return metrics;
+}
+
+struct RunResult {
+  std::string name;
+  EvaluationMetrics metrics;
+};
+
+} // namespace
+
+int main(int argc, char **argv) {
+  try {
+    const Options options = parse_arguments(argc, argv);
+
+    std::vector<uint32_t> x_coords;
+    std::vector<uint32_t> y_coords;
+    const auto points = load_points(options.data_path, &x_coords, &y_coords);
+    const auto truth_labels = load_labels(options.truth_path);
+
+    if (points.size() != truth_labels.size())
+      throw std::runtime_error("Point count and truth label count differ");
+
+    std::cout << "Loaded " << points.size() << " points from " << options.data_path << "\n";
+    std::cout << "Using eps=" << options.eps << ", min_samples=" << options.min_samples << "\n";
+
+    const auto truth_cluster_count = count_clusters(truth_labels);
+    const auto truth_noise_count = count_noise(truth_labels);
+    std::cout << "Ground truth clusters: " << truth_cluster_count << "; noise points: " << truth_noise_count << "\n";
+
+    std::vector<RunResult> results;
+    results.reserve(3);
+
+    if (options.run_baseline) {
+      std::cout << "\n[baseline] Running clustering..." << std::flush;
+      const auto start = std::chrono::steady_clock::now();
+      dbscan::DBSCAN<double> baseline(options.eps, options.min_samples);
+      const auto clustering = baseline.cluster(points);
+      std::vector<std::size_t> mismatches;
+      const auto metrics =
+          evaluate(clustering.labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr);
+      const auto end = std::chrono::steady_clock::now();
+      const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+      std::cout << " done in " << elapsed_ms << " ms" << std::endl;
+      results.push_back({"baseline", metrics});
+
+      if (options.mismatch_output_dir && !mismatches.empty()) {
+        std::filesystem::create_directories(*options.mismatch_output_dir);
+        auto file_path = *options.mismatch_output_dir / "baseline_mismatches.txt";
+        std::ofstream out(file_path);
+        if (!out)
+          throw std::runtime_error("Failed to open mismatch output file: " + file_path.string());
+        for (std::size_t index : mismatches)
+          out << index << '\n';
+        std::cout << "[baseline] Wrote " << mismatches.size() << " mismatches to " << file_path << "\n";
+      }
+    }
+
+    if (options.run_optimized) {
+      std::cout << "\n[optimized] Running clustering..." << std::flush;
+      const auto start = std::chrono::steady_clock::now();
+      dbscan::DBSCANOptimized<double> optimized(options.eps, options.min_samples);
+      const auto clustering = optimized.cluster(points);
+      std::vector<std::size_t> mismatches;
+      const auto metrics =
+          evaluate(clustering.labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr);
+      const auto end = std::chrono::steady_clock::now();
+      const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+      std::cout << " done in " << elapsed_ms << " ms" << std::endl;
+      results.push_back({"optimized", metrics});
+
+      if (options.mismatch_output_dir && !mismatches.empty()) {
+        std::filesystem::create_directories(*options.mismatch_output_dir);
+        auto file_path = *options.mismatch_output_dir / "optimized_mismatches.txt";
+        std::ofstream out(file_path);
+        if (!out)
+          throw std::runtime_error("Failed to open mismatch output file: " + file_path.string());
+        for (std::size_t index : mismatches)
+          out << index << '\n';
+        std::cout << "[optimized] Wrote " << mismatches.size() << " mismatches to " << file_path << "\n";
+      }
+    }
+
+    auto grid_variant_info = [](dbscan::GridExpansionMode mode) {
+      struct VariantInfo {
+        std::string label;
+        std::string slug;
+      };
+      switch (mode) {
+      case dbscan::GridExpansionMode::Sequential:
+        return VariantInfo{"grid_l1", "grid_l1"};
+      case dbscan::GridExpansionMode::FrontierParallel:
+        return VariantInfo{"grid_l1_frontier", "grid_l1_frontier"};
+      case dbscan::GridExpansionMode::UnionFind:
+        return VariantInfo{"grid_l1_union", "grid_l1_union"};
+      }
+      return VariantInfo{"grid_l1", "grid_l1"};
+    };
+
+    if (options.run_grid_l1) {
+      const auto eps_int = static_cast<uint32_t>(std::llround(options.eps));
+      if (std::fabs(options.eps - static_cast<double>(eps_int)) > 1e-6) {
+        throw std::invalid_argument("grid_l1 implementation requires integer eps value");
+      }
+      if (x_coords.size() != y_coords.size())
+        throw std::runtime_error("Mismatch between x and y coordinate counts");
+      dbscan::DBSCANGrid2DL1Params params{eps_int, static_cast<uint32_t>(options.min_samples)};
+      std::vector<dbscan::Grid2DPoint> aos_points;
+      aos_points.reserve(x_coords.size());
+      for (std::size_t i = 0; i < x_coords.size(); ++i)
+        aos_points.push_back(dbscan::Grid2DPoint{x_coords[i], y_coords[i]});
+      for (auto mode : options.grid_modes) {
+        const auto info = grid_variant_info(mode);
+        const auto run_layout = [&](const std::string &layout, auto &&invoke) {
+          const std::string display_label = info.label + "/" + layout;
+          std::cout << "\n[" << display_label << "] Running clustering..." << std::flush;
+          const auto start = std::chrono::steady_clock::now();
+          const auto result = invoke();
+          const auto end = std::chrono::steady_clock::now();
+          const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+          std::cout << " done in " << elapsed_ms << " ms" << std::endl;
+
+          std::vector<std::size_t> mismatches;
+          const auto metrics =
+              evaluate(result.labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr);
+
+          if (!result.perf_timing.entries().empty()) {
+            std::cout << "[" << display_label << "] component timings" << std::endl;
+            for (const auto &entry : result.perf_timing.entries())
+              std::cout << "  " << entry << std::endl;
+          }
+          results.push_back({display_label, metrics});
+
+          if (options.mismatch_output_dir && !mismatches.empty()) {
+            std::filesystem::create_directories(*options.mismatch_output_dir);
+            auto file_path = *options.mismatch_output_dir / (info.slug + "_" + layout + "_mismatches.txt");
+            std::ofstream out(file_path);
+            if (!out)
+              throw std::runtime_error("Failed to open mismatch output file: " + file_path.string());
+            for (std::size_t index : mismatches)
+              out << index << '\n';
+            std::cout << "[" << display_label << "] Wrote " << mismatches.size() << " mismatches to " << file_path
+                      << "\n";
+          }
+        };
+
+        run_layout("soa", [&]() {
+          return dbscan::dbscan_grid2d_l1(x_coords.data(), 1, y_coords.data(), 1, x_coords.size(), params, mode);
+        });
+
+        run_layout("aos", [&]() {
+          return dbscan::dbscan_grid2d_l1_aos(aos_points.data(), aos_points.size(), params, mode);
+        });
+      }
+    }
+
+    std::cout << std::fixed << std::setprecision(6);
+
+    bool all_passed = true;
+    for (const auto &result : results) {
+      const auto &m = result.metrics;
+      std::cout << "\nImplementation: " << result.name << "\n";
+      std::cout << "  clusters: " << m.predicted_clusters << " (truth " << m.truth_clusters << ")\n";
+      std::cout << "  noise points: " << m.predicted_noise << " (truth " << m.truth_noise << ")\n";
+      std::cout << "  adjusted rand index: " << m.adjusted_rand << "\n";
+      std::cout << "  remapped accuracy: " << m.remapped_accuracy * 100.0 << "%\n";
+      std::cout << "  mismatched points: " << m.mismatched_points << "\n";
+      std::cout << "  status: " << (m.passed ? "PASS" : "FAIL") << "\n";
+      all_passed = all_passed && m.passed;
+    }
+
+    return all_passed ? 0 : 1;
+  } catch (const std::exception &ex) {
+    std::cerr << "Error: " << ex.what() << "\n";
+    print_usage(argv[0]);
+    return 1;
+  }
+}