From 2db43027b6cd2b90b18fc2e95e6934c7c61fd5d2 Mon Sep 17 00:00:00 2001
From: Bo Lu <bo.lu@utexas.edu>
Date: Tue, 30 Sep 2025 22:07:14 -0700
Subject: [PATCH 01/12] add validator

---
 .gitignore                         |   4 +-
 CMakeLists.txt                     |  24 +-
 cmake/run_dataset_validator.cmake  |  31 +++
 generate_test_data.py              | 144 -----------
 scripts/generate_test_data.py      | 164 ++++++++++++
 tools/dbscan_dataset_validator.cpp | 398 +++++++++++++++++++++++++++++
 6 files changed, 619 insertions(+), 146 deletions(-)
 create mode 100644 cmake/run_dataset_validator.cmake
 delete mode 100644 generate_test_data.py
 create mode 100755 scripts/generate_test_data.py
 create mode 100644 tools/dbscan_dataset_validator.cpp

diff --git a/.gitignore b/.gitignore
index 42167f6..5adba1c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,6 +33,8 @@ Thumbs.db
 # Test data files
 test_data*.bin
 *.bin
+!tests/data/dbscan_static_data.bin
+!tests/data/dbscan_static_truth.bin
 
 # Executables
 dbscan_tests
@@ -43,4 +45,4 @@ dbscan_tests
 *.tmp
 *.temp
 .cache/
-AGENTS.md
\ No newline at end of file
+AGENTS.md
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c88060e..df35fb7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,6 +79,20 @@ target_include_directories(dbscan_tests
         include
 )
 
+add_executable(dbscan_dataset_validator
+    tools/dbscan_dataset_validator.cpp
+)
+
+target_link_libraries(dbscan_dataset_validator
+    PRIVATE
+        dbscan
+)
+
+target_include_directories(dbscan_dataset_validator
+    PRIVATE
+        include
+)
+
 # Benchmark executable
 add_executable(dbscan_benchmark
     benchmark/benchmark_dbscan.cpp
@@ -101,4 +115,12 @@ target_include_directories(dbscan_benchmark
 
 # Enable testing
 enable_testing()
-add_test(NAME dbscan_tests COMMAND dbscan_tests)
\ No newline at end of file
+add_test(NAME dbscan_tests COMMAND dbscan_tests)
+
+add_test(
+    NAME dbscan_dataset_validation
+    COMMAND ${CMAKE_COMMAND}
+        -DPROJECT_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}
+        -DPROJECT_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}
+        -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/run_dataset_validator.cmake
+)
diff --git a/cmake/run_dataset_validator.cmake b/cmake/run_dataset_validator.cmake
new file mode 100644
index 0000000..2790383
--- /dev/null
+++ b/cmake/run_dataset_validator.cmake
@@ -0,0 +1,31 @@
+set(DATA_FILE "${PROJECT_SOURCE_DIR}/tests/data/dbscan_static_data.bin")
+set(TRUTH_FILE "${PROJECT_SOURCE_DIR}/tests/data/dbscan_static_truth.bin")
+set(VALIDATOR "${PROJECT_BINARY_DIR}/dbscan_dataset_validator")
+
+if(NOT EXISTS "${DATA_FILE}")
+  message(FATAL_ERROR "Static dataset not found: ${DATA_FILE}")
+endif()
+
+if(NOT EXISTS "${TRUTH_FILE}")
+  message(FATAL_ERROR "Truth labels file not found: ${TRUTH_FILE}")
+endif()
+
+if(NOT EXISTS "${VALIDATOR}")
+  message(FATAL_ERROR "dbscan_dataset_validator executable not found: ${VALIDATOR}")
+endif()
+
+execute_process(
+  COMMAND "${VALIDATOR}"
+    "--data" "${DATA_FILE}"
+    "--truth" "${TRUTH_FILE}"
+    "--eps" "10"
+    "--min-samples" "3"
+    "--impl" "both"
+  WORKING_DIRECTORY "${PROJECT_BINARY_DIR}"
+  RESULT_VARIABLE VALIDATOR_RESULT
+  COMMAND_ECHO STDOUT
+)
+
+if(NOT VALIDATOR_RESULT EQUAL 0)
+  message(FATAL_ERROR "dbscan_dataset_validator reported a mismatch (exit code ${VALIDATOR_RESULT})")
+endif()
diff --git a/generate_test_data.py b/generate_test_data.py
deleted file mode 100644
index a299dfa..0000000
--- a/generate_test_data.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/env python3
-
-import numpy as np
-import struct
-import sys
-
-def generate_test_data(n_samples=300, n_centers=4, output_file='test_data.bin'):
-    """
-    Generate test data with known clusters and save to binary file.
-    Creates simple clusters manually without sklearn.
-    """
-    return generate_test_data_with_params(n_samples, n_centers, 0.8, 5, output_file)
-
-def load_test_data(input_file='test_data.bin'):
-    """
-    Load test data from binary file.
-    Returns: points (N, 2), labels
-    """
-    with open(input_file, 'rb') as f:
-        # Read number of points
-        n_points = struct.unpack('I', f.read(4))[0]
-
-        points = []
-        labels = []
-
-        # Read points
-        for _ in range(n_points):
-            x, y = struct.unpack('dd', f.read(16))
-            points.append([x, y])
-
-        # Read labels
-        for _ in range(n_points):
-            label = struct.unpack('i', f.read(4))[0]
-            labels.append(label)
-
-    return np.array(points), np.array(labels)
-
-def generate_multiple_test_cases():
-    """Generate test cases with different data sizes and parameters"""
-    test_cases = [
-        # (n_samples, n_centers, eps, min_samples, output_file)
-        (500, 3, 0.8, 5, 'test_data_500.bin'),
-        (10000, 5, 0.8, 5, 'test_data_10k.bin'),
-        (100000, 8, 0.8, 5, 'test_data_100k.bin'),  # Using 100k instead of 1M for practicality
-        # Different eps values
-        (1000, 4, 0.5, 5, 'test_data_eps_0_5.bin'),
-        (1000, 4, 1.2, 5, 'test_data_eps_1_2.bin'),
-        # Different min_samples values
-        (1000, 4, 0.8, 3, 'test_data_minpts_3.bin'),
-        (1000, 4, 0.8, 10, 'test_data_minpts_10.bin'),
-    ]
-
-    for n_samples, n_centers, eps, min_samples, output_file in test_cases:
-        print(f"Generating {n_samples} points with {n_centers} centers (eps={eps}, min_samples={min_samples})...")
-        generate_test_data_with_params(n_samples, n_centers, eps, min_samples, output_file)
-        print(f"Saved to {output_file}\n")
-
-def generate_test_data_with_params(n_samples=300, n_centers=4, eps=0.8, min_samples=5, output_file='test_data.bin'):
-    """
-    Generate test data with specific DBSCAN parameters
-    """
-    np.random.seed(42)
-
-    # Generate clustered data manually
-    X = []
-    true_labels = []
-
-    for center_idx in range(n_centers):
-        # Create cluster centers
-        center_x = np.random.uniform(-5, 5)
-        center_y = np.random.uniform(-5, 5)
-
-        # Generate points around each center
-        cluster_size = n_samples // n_centers
-        for _ in range(cluster_size):
-            x = center_x + np.random.normal(0, 0.6)
-            y = center_y + np.random.normal(0, 0.6)
-            X.append([x, y])
-            true_labels.append(center_idx)
-
-    # Add some noise points
-    noise_points = n_samples // 10
-    for _ in range(noise_points):
-        x = np.random.uniform(-10, 10)
-        y = np.random.uniform(-10, 10)
-        X.append([x, y])
-        true_labels.append(-1)  # Noise
-
-    X = np.array(X)
-    true_labels = np.array(true_labels)
-
-    # Simple DBSCAN-like clustering for ground truth
-    labels = np.full(len(X), -1)
-    cluster_id = 0
-
-    for i in range(len(X)):
-        if labels[i] != -1:
-            continue
-
-        # Find neighbors within eps
-        neighbors = []
-        for j in range(len(X)):
-            if i == j:
-                continue
-            dist = np.sqrt((X[i][0] - X[j][0])**2 + (X[i][1] - X[j][1])**2)
-            if dist <= eps:
-                neighbors.append(j)
-
-        if len(neighbors) >= min_samples:  # min_samples
-            labels[i] = cluster_id
-            # Expand cluster (simplified)
-            for neighbor in neighbors:
-                if labels[neighbor] == -1:
-                    labels[neighbor] = cluster_id
-            cluster_id += 1
-        else:
-            labels[i] = -1
-
-    # Save data to binary file
-    with open(output_file, 'wb') as f:
-        # Write number of points
-        f.write(struct.pack('I', len(X)))
-
-        # Write points as doubles (x, y)
-        for point in X:
-            f.write(struct.pack('dd', point[0], point[1]))
-
-        # Write computed labels for validation
-        for label in labels:
-            f.write(struct.pack('i', label))
-
-    print(f"Generated {len(X)} points with {n_centers} clusters")
-    print(f"Simple clustering found {len(set(labels)) - (1 if -1 in labels else 0)} clusters")
-
-    return X, labels
-
-if __name__ == '__main__':
-    if len(sys.argv) > 1 and sys.argv[1] == '--multiple':
-        generate_multiple_test_cases()
-    elif len(sys.argv) > 1:
-        output_file = sys.argv[1]
-        generate_test_data(output_file=output_file)
-    else:
-        generate_test_data()
\ No newline at end of file
diff --git a/scripts/generate_test_data.py b/scripts/generate_test_data.py
new file mode 100755
index 0000000..16a893b
--- /dev/null
+++ b/scripts/generate_test_data.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env -S uv run
+# /// script
+# dependencies = ["numpy", "scikit-learn"]
+# ///
+
+import argparse
+from pathlib import Path
+
+import numpy as np
+from sklearn.cluster import DBSCAN
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Generate synthetic DBSCAN test data with embedded Gaussian clusters."
+    )
+    parser.add_argument(
+        "--uniform-count",
+        type=int,
+        default=200_000,
+        help="Number of background points drawn from a uniform distribution (default: 200_000).",
+    )
+    parser.add_argument(
+        "--cluster-count",
+        type=int,
+        default=100,
+        help="Number of Gaussian clusters to sprinkle into the dataset (default: 100).",
+    )
+    parser.add_argument(
+        "--points-per-cluster",
+        type=int,
+        default=256,
+        help="Number of points sampled for each Gaussian cluster (default: 256).",
+    )
+    parser.add_argument(
+        "--area-width",
+        type=int,
+        default=1_000_000,
+        help="Width/height of the square area measured in pixels (default: 1_000_000).",
+    )
+    parser.add_argument(
+        "--cluster-sigma",
+        type=float,
+        default=50.0 / 3.0,
+        help="Standard deviation for the Gaussian clusters in pixels (default: 50/3).",
+    )
+    parser.add_argument(
+        "--eps",
+        type=float,
+        default=60.0,
+        help="DBSCAN epsilon radius in pixels for scikit-learn clustering (default: 60).",
+    )
+    parser.add_argument(
+        "--min-samples",
+        type=int,
+        default=16,
+        help="DBSCAN min_samples parameter for scikit-learn clustering (default: 16).",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="Seed for the random number generator (default: 42).",
+    )
+    parser.add_argument(
+        "--data-file",
+        type=Path,
+        default=Path("data.bin"),
+        help="Path to the output coordinate binary file (default: data.bin).",
+    )
+    parser.add_argument(
+        "--truth-file",
+        type=Path,
+        default=Path("truth.bin"),
+        help="Path to the output truth label binary file (default: truth.bin).",
+    )
+    return parser.parse_args()
+
+
+def generate_uniform_points(rng: np.random.Generator, count: int, width: int) -> np.ndarray:
+    if count <= 0:
+        return np.empty((0, 2), dtype=np.uint32)
+
+    coords = rng.uniform(0, width, size=(count, 2))
+    coords = np.rint(coords, casting="unsafe")  # round to nearest integer pixel
+    coords = np.clip(coords, 0, width - 1)
+    return coords.astype(np.uint32)
+
+
+def generate_gaussian_clusters(
+    rng: np.random.Generator,
+    cluster_count: int,
+    points_per_cluster: int,
+    width: int,
+    sigma: float,
+) -> np.ndarray:
+    if cluster_count <= 0 or points_per_cluster <= 0:
+        return np.empty((0, 2), dtype=np.uint32)
+
+    centers = rng.uniform(0, width, size=(cluster_count, 2))
+    clusters = []
+
+    for center in centers:
+        samples = rng.normal(loc=center, scale=sigma, size=(points_per_cluster, 2))
+        samples = np.rint(samples, casting="unsafe")
+        samples = np.clip(samples, 0, width - 1)
+        clusters.append(samples.astype(np.uint32))
+
+    if not clusters:
+        return np.empty((0, 2), dtype=np.uint32)
+
+    return np.vstack(clusters)
+
+
+def write_data_file(path: Path, coords_yx: np.ndarray) -> None:
+    coords_yx.astype(np.uint32).tofile(path)
+
+
+def write_truth_file(path: Path, labels: np.ndarray) -> None:
+    labels.astype(np.int32).tofile(path)
+
+
+def main() -> None:
+    args = parse_args()
+    rng = np.random.default_rng(args.seed)
+
+    uniform_coords = generate_uniform_points(rng, args.uniform_count, args.area_width)
+    cluster_coords = generate_gaussian_clusters(
+        rng,
+        cluster_count=args.cluster_count,
+        points_per_cluster=args.points_per_cluster,
+        width=args.area_width,
+        sigma=args.cluster_sigma,
+    )
+
+    if uniform_coords.size == 0 and cluster_coords.size == 0:
+        raise ValueError("No points generated; adjust the generator parameters.")
+
+    # Concatenate and shuffle to avoid ordering artifacts.
+    all_coords_yx = np.vstack([uniform_coords, cluster_coords])
+    rng.shuffle(all_coords_yx, axis=0)
+
+    # Prepare float coordinates for clustering (x, y order for scikit-learn).
+    coords_xy_float = all_coords_yx[:, [1, 0]].astype(np.float64)
+
+    dbscan = DBSCAN(eps=args.eps, min_samples=args.min_samples, n_jobs=-1)
+    labels = dbscan.fit_predict(coords_xy_float)
+
+    # Persist data in requested layout: (y, x) pairs as uint32.
+    write_data_file(args.data_file, all_coords_yx)
+    write_truth_file(args.truth_file, labels)
+
+    unique_labels, counts = np.unique(labels, return_counts=True)
+    cluster_labels = unique_labels[unique_labels != -1]
+    noise_count = counts[unique_labels == -1].sum() if -1 in unique_labels else 0
+
+    print(f"Generated {all_coords_yx.shape[0]} total points.")
+    print(f"Uniform points: {uniform_coords.shape[0]}, clustered points: {cluster_coords.shape[0]}.")
+    print(f"DBSCAN discovered {cluster_labels.size} clusters and {noise_count} noise points.")
+    print(f"Data written to {args.data_file.resolve()} and labels to {args.truth_file.resolve()}.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/dbscan_dataset_validator.cpp b/tools/dbscan_dataset_validator.cpp
new file mode 100644
index 0000000..fe246d8
--- /dev/null
+++ b/tools/dbscan_dataset_validator.cpp
@@ -0,0 +1,398 @@
+#include <algorithm>
+#include <chrono>
+#include <cstdint>
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "dbscan.h"
+#include "dbscan_optimized.h"
+
+namespace {
+
+struct Options {
+  std::filesystem::path data_path{"data.bin"};
+  std::filesystem::path truth_path{"truth.bin"};
+  double eps{60.0};
+  int32_t min_samples{16};
+
+  enum class Implementation { Baseline, Optimized, Both } implementation{Implementation::Both};
+  std::optional<std::filesystem::path> mismatch_output_dir;
+};
+
+void print_usage(const char *program_name) {
+  std::cout << "Usage: " << program_name
+            << " [--data <data.bin>] [--truth <truth.bin>] [--eps <value>] [--min-samples <value>]"
+            << " [--impl baseline|optimized|both] [--dump-mismatches <directory>]\n";
+}
+
+Options parse_arguments(int argc, char **argv) {
+  Options options;
+
+  for (int i = 1; i < argc; ++i) {
+    const std::string arg{argv[i]};
+
+    if (arg == "--help" || arg == "-h") {
+      print_usage(argv[0]);
+      std::exit(0);
+    } else if (arg == "--data") {
+      if (i + 1 >= argc)
+        throw std::invalid_argument("--data expects a path argument");
+      options.data_path = argv[++i];
+    } else if (arg == "--truth") {
+      if (i + 1 >= argc)
+        throw std::invalid_argument("--truth expects a path argument");
+      options.truth_path = argv[++i];
+    } else if (arg == "--eps") {
+      if (i + 1 >= argc)
+        throw std::invalid_argument("--eps expects a numeric argument");
+      options.eps = std::stod(argv[++i]);
+    } else if (arg == "--min-samples") {
+      if (i + 1 >= argc)
+        throw std::invalid_argument("--min-samples expects an integer argument");
+      options.min_samples = static_cast<int32_t>(std::stoi(argv[++i]));
+    } else if (arg == "--impl") {
+      if (i + 1 >= argc)
+        throw std::invalid_argument("--impl expects one of: baseline, optimized, both");
+      const std::string value{argv[++i]};
+      if (value == "baseline") {
+        options.implementation = Options::Implementation::Baseline;
+      } else if (value == "optimized") {
+        options.implementation = Options::Implementation::Optimized;
+      } else if (value == "both") {
+        options.implementation = Options::Implementation::Both;
+      } else {
+        throw std::invalid_argument("--impl expects one of: baseline, optimized, both");
+      }
+    } else if (arg == "--dump-mismatches") {
+      if (i + 1 >= argc)
+        throw std::invalid_argument("--dump-mismatches expects a directory path");
+      options.mismatch_output_dir = std::filesystem::path{argv[++i]};
+    } else {
+      throw std::invalid_argument("Unknown argument: " + arg);
+    }
+  }
+
+  if (options.eps <= 0.0)
+    throw std::invalid_argument("--eps must be positive");
+  if (options.min_samples <= 0)
+    throw std::invalid_argument("--min-samples must be positive");
+
+  return options;
+}
+
+std::vector<dbscan::Point<double>> load_points(const std::filesystem::path &path) {
+  std::ifstream stream(path, std::ios::binary);
+  if (!stream)
+    throw std::runtime_error("Failed to open data file: " + path.string());
+
+  stream.seekg(0, std::ios::end);
+  const auto file_size = stream.tellg();
+  stream.seekg(0, std::ios::beg);
+
+  if (file_size % (sizeof(uint32_t) * 2) != 0)
+    throw std::runtime_error("Data file does not contain a whole number of (y, x) uint32 pairs: " + path.string());
+
+  const std::size_t num_values = static_cast<std::size_t>(file_size) / sizeof(uint32_t);
+  const std::size_t num_points = num_values / 2;
+
+  std::vector<uint32_t> raw(num_values);
+  stream.read(reinterpret_cast<char *>(raw.data()), static_cast<std::streamsize>(raw.size() * sizeof(uint32_t)));
+  if (!stream)
+    throw std::runtime_error("Failed to read data file: " + path.string());
+
+  std::vector<dbscan::Point<double>> points(num_points);
+  for (std::size_t i = 0; i < num_points; ++i) {
+    const uint32_t y = raw[2 * i];
+    const uint32_t x = raw[2 * i + 1];
+    points[i] = dbscan::Point<double>{static_cast<double>(x), static_cast<double>(y)};
+  }
+
+  return points;
+}
+
+std::vector<int32_t> load_labels(const std::filesystem::path &path) {
+  std::ifstream stream(path, std::ios::binary);
+  if (!stream)
+    throw std::runtime_error("Failed to open truth file: " + path.string());
+
+  stream.seekg(0, std::ios::end);
+  const auto file_size = stream.tellg();
+  stream.seekg(0, std::ios::beg);
+
+  if (file_size % sizeof(int32_t) != 0)
+    throw std::runtime_error("Truth file does not contain a whole number of int32 labels: " + path.string());
+
+  const std::size_t num_labels = static_cast<std::size_t>(file_size) / sizeof(int32_t);
+  std::vector<int32_t> labels(num_labels);
+  stream.read(reinterpret_cast<char *>(labels.data()), static_cast<std::streamsize>(labels.size() * sizeof(int32_t)));
+  if (!stream)
+    throw std::runtime_error("Failed to read truth file: " + path.string());
+
+  return labels;
+}
+
+std::size_t count_clusters(const std::vector<int32_t> &labels) {
+  std::unordered_set<int32_t> clusters;
+  clusters.reserve(labels.size());
+  for (int32_t label : labels) {
+    if (label != -1)
+      clusters.insert(label);
+  }
+  return clusters.size();
+}
+
+std::size_t count_noise(const std::vector<int32_t> &labels) {
+  return static_cast<std::size_t>(std::count(labels.begin(), labels.end(), -1));
+}
+
+struct LabelIndex {
+  std::unordered_map<int32_t, std::size_t> to_index;
+  std::vector<int32_t> values;
+};
+
+LabelIndex make_index(const std::vector<int32_t> &labels) {
+  LabelIndex index;
+  index.values.reserve(labels.size());
+  index.to_index.reserve(labels.size());
+
+  for (int32_t label : labels) {
+    if (!index.to_index.contains(label)) {
+      const std::size_t idx = index.values.size();
+      index.values.push_back(label);
+      index.to_index.emplace(label, idx);
+    }
+  }
+
+  return index;
+}
+
+double combination2(std::int64_t n) {
+  if (n <= 1)
+    return 0.0;
+  return static_cast<double>(n) * static_cast<double>(n - 1) / 2.0;
+}
+
+struct EvaluationMetrics {
+  double adjusted_rand{0.0};
+  double remapped_accuracy{0.0};
+  std::size_t mismatched_points{0};
+  std::size_t predicted_clusters{0};
+  std::size_t truth_clusters{0};
+  std::size_t predicted_noise{0};
+  std::size_t truth_noise{0};
+  bool passed{false};
+};
+
+EvaluationMetrics evaluate(const std::vector<int32_t> &predicted, const std::vector<int32_t> &truth,
+                           std::vector<std::size_t> *mismatch_indices = nullptr) {
+  if (predicted.size() != truth.size())
+    throw std::runtime_error("Predicted labels and truth labels must have the same length");
+
+  const std::size_t total_points = truth.size();
+  const auto predicted_index = make_index(predicted);
+  const auto truth_index = make_index(truth);
+
+  const std::size_t predicted_size = predicted_index.values.size();
+  const std::size_t truth_size = truth_index.values.size();
+
+  std::vector<std::int64_t> contingency(predicted_size * truth_size, 0);
+  std::vector<std::int64_t> predicted_counts(predicted_size, 0);
+  std::vector<std::int64_t> truth_counts(truth_size, 0);
+
+  for (std::size_t i = 0; i < total_points; ++i) {
+    const auto predicted_it = predicted_index.to_index.find(predicted[i]);
+    const auto truth_it = truth_index.to_index.find(truth[i]);
+    const std::size_t predicted_row = predicted_it->second;
+    const std::size_t truth_col = truth_it->second;
+
+    const std::size_t cell_index = predicted_row * truth_size + truth_col;
+    ++contingency[cell_index];
+    ++predicted_counts[predicted_row];
+    ++truth_counts[truth_col];
+  }
+
+  double sum_combination = 0.0;
+  for (std::int64_t count : contingency)
+    sum_combination += combination2(count);
+
+  double predicted_combination = 0.0;
+  for (std::int64_t count : predicted_counts)
+    predicted_combination += combination2(count);
+
+  double truth_combination = 0.0;
+  for (std::int64_t count : truth_counts)
+    truth_combination += combination2(count);
+
+  const double total_pairs = combination2(static_cast<std::int64_t>(total_points));
+  double expected_index = 0.0;
+  if (total_pairs > 0.0)
+    expected_index = (predicted_combination * truth_combination) / total_pairs;
+
+  const double max_index = 0.5 * (predicted_combination + truth_combination);
+  const double denominator = max_index - expected_index;
+
+  EvaluationMetrics metrics;
+  if (denominator == 0.0) {
+    metrics.adjusted_rand = 1.0;
+  } else {
+    metrics.adjusted_rand = (sum_combination - expected_index) / denominator;
+  }
+
+  std::unordered_map<int32_t, int32_t> remap;
+  remap.reserve(predicted_size);
+  for (std::size_t row = 0; row < predicted_size; ++row) {
+    const int32_t predicted_label = predicted_index.values[row];
+    if (predicted_label == -1) {
+      remap.emplace(predicted_label, -1);
+      continue;
+    }
+
+    const std::int64_t *row_ptr = contingency.data() + row * truth_size;
+    std::size_t best_col = 0;
+    std::int64_t best_count = -1;
+    for (std::size_t col = 0; col < truth_size; ++col) {
+      if (row_ptr[col] > best_count) {
+        best_count = row_ptr[col];
+        best_col = col;
+      }
+    }
+    remap.emplace(predicted_label, truth_index.values[best_col]);
+  }
+
+  if (mismatch_indices)
+    mismatch_indices->clear();
+
+  std::size_t matches = 0;
+  for (std::size_t i = 0; i < total_points; ++i) {
+    const int32_t predicted_label = predicted[i];
+    const auto mapping_it = remap.find(predicted_label);
+    const int32_t mapped_label = mapping_it != remap.end() ? mapping_it->second : predicted_label;
+    if (mapped_label == truth[i])
+      ++matches;
+    else if (mismatch_indices)
+      mismatch_indices->push_back(i);
+  }
+
+  metrics.remapped_accuracy =
+      total_points == 0 ? 1.0 : static_cast<double>(matches) / static_cast<double>(total_points);
+  metrics.mismatched_points = mismatch_indices ? mismatch_indices->size() : (total_points - matches);
+
+  metrics.predicted_clusters = count_clusters(predicted);
+  metrics.truth_clusters = count_clusters(truth);
+  metrics.predicted_noise = count_noise(predicted);
+  metrics.truth_noise = count_noise(truth);
+  metrics.passed = metrics.mismatched_points == 0 && metrics.predicted_clusters == metrics.truth_clusters;
+
+  return metrics;
+}
+
+struct RunResult {
+  std::string name;
+  EvaluationMetrics metrics;
+};
+
+} // namespace
+
+int main(int argc, char **argv) {
+  try {
+    const Options options = parse_arguments(argc, argv);
+
+    const auto points = load_points(options.data_path);
+    const auto truth_labels = load_labels(options.truth_path);
+
+    if (points.size() != truth_labels.size())
+      throw std::runtime_error("Point count and truth label count differ");
+
+    std::cout << "Loaded " << points.size() << " points from " << options.data_path << "\n";
+    std::cout << "Using eps=" << options.eps << ", min_samples=" << options.min_samples << "\n";
+
+    const auto truth_cluster_count = count_clusters(truth_labels);
+    const auto truth_noise_count = count_noise(truth_labels);
+    std::cout << "Ground truth clusters: " << truth_cluster_count << "; noise points: " << truth_noise_count << "\n";
+
+    std::vector<RunResult> results;
+    results.reserve(2);
+
+    if (options.implementation == Options::Implementation::Baseline ||
+        options.implementation == Options::Implementation::Both) {
+      std::cout << "\n[baseline] Running clustering..." << std::flush;
+      const auto start = std::chrono::steady_clock::now();
+      dbscan::DBSCAN<double> baseline(options.eps, options.min_samples);
+      const auto clustering = baseline.cluster(points);
+      std::vector<std::size_t> mismatches;
+      const auto metrics =
+          evaluate(clustering.labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr);
+      const auto end = std::chrono::steady_clock::now();
+      const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+      std::cout << " done in " << elapsed_ms << " ms" << std::endl;
+      results.push_back({"baseline", metrics});
+
+      if (options.mismatch_output_dir && !mismatches.empty()) {
+        std::filesystem::create_directories(*options.mismatch_output_dir);
+        auto file_path = *options.mismatch_output_dir / "baseline_mismatches.txt";
+        std::ofstream out(file_path);
+        if (!out)
+          throw std::runtime_error("Failed to open mismatch output file: " + file_path.string());
+        for (std::size_t index : mismatches)
+          out << index << '\n';
+        std::cout << "[baseline] Wrote " << mismatches.size() << " mismatches to " << file_path << "\n";
+      }
+    }
+
+    if (options.implementation == Options::Implementation::Optimized ||
+        options.implementation == Options::Implementation::Both) {
+      std::cout << "\n[optimized] Running clustering..." << std::flush;
+      const auto start = std::chrono::steady_clock::now();
+      dbscan::DBSCANOptimized<double> optimized(options.eps, options.min_samples);
+      const auto clustering = optimized.cluster(points);
+      std::vector<std::size_t> mismatches;
+      const auto metrics =
+          evaluate(clustering.labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr);
+      const auto end = std::chrono::steady_clock::now();
+      const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+      std::cout << " done in " << elapsed_ms << " ms" << std::endl;
+      results.push_back({"optimized", metrics});
+
+      if (options.mismatch_output_dir && !mismatches.empty()) {
+        std::filesystem::create_directories(*options.mismatch_output_dir);
+        auto file_path = *options.mismatch_output_dir / "optimized_mismatches.txt";
+        std::ofstream out(file_path);
+        if (!out)
+          throw std::runtime_error("Failed to open mismatch output file: " + file_path.string());
+        for (std::size_t index : mismatches)
+          out << index << '\n';
+        std::cout << "[optimized] Wrote " << mismatches.size() << " mismatches to " << file_path << "\n";
+      }
+    }
+
+    std::cout << std::fixed << std::setprecision(6);
+
+    bool all_passed = true;
+    for (const auto &result : results) {
+      const auto &m = result.metrics;
+      std::cout << "\nImplementation: " << result.name << "\n";
+      std::cout << "  clusters: " << m.predicted_clusters << " (truth " << m.truth_clusters << ")\n";
+      std::cout << "  noise points: " << m.predicted_noise << " (truth " << m.truth_noise << ")\n";
+      std::cout << "  adjusted rand index: " << m.adjusted_rand << "\n";
+      std::cout << "  remapped accuracy: " << m.remapped_accuracy * 100.0 << "%\n";
+      std::cout << "  mismatched points: " << m.mismatched_points << "\n";
+      std::cout << "  status: " << (m.passed ? "PASS" : "FAIL") << "\n";
+      all_passed = all_passed && m.passed;
+    }
+
+    return all_passed ? 0 : 1;
+  } catch (const std::exception &ex) {
+    std::cerr << "Error: " << ex.what() << "\n";
+    print_usage(argv[0]);
+    return 1;
+  }
+}

From 13fd4c6a3a79d6307f40763303e5f7b65284d3bf Mon Sep 17 00:00:00 2001
From: Bo Lu <bo.lu@utexas.edu>
Date: Tue, 30 Sep 2025 22:20:30 -0700
Subject: [PATCH 02/12] add l1 impl

---
 CMakeLists.txt                     |   2 +
 include/dbscan_grid2d_l1.h         |  18 +++
 src/dbscan_grid2d_l1.cpp           | 189 +++++++++++++++++++++++++++++
 tests/test_dbscan_grid2d_l1.cpp    |  96 +++++++++++++++
 tools/dbscan_dataset_validator.cpp |  87 ++++++++++---
 5 files changed, 378 insertions(+), 14 deletions(-)
 create mode 100644 include/dbscan_grid2d_l1.h
 create mode 100644 src/dbscan_grid2d_l1.cpp
 create mode 100644 tests/test_dbscan_grid2d_l1.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index df35fb7..2bcc52b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -19,6 +19,7 @@ endif()
 add_library(dbscan STATIC
     src/dbscan.cpp
     src/dbscan_optimized.cpp
+    src/dbscan_grid2d_l1.cpp
 )
 
 target_include_directories(dbscan
@@ -64,6 +65,7 @@ FetchContent_MakeAvailable(nanobench)
 add_executable(dbscan_tests
     tests/test_dbscan.cpp
     tests/test_dbscan_optimized.cpp
+    tests/test_dbscan_grid2d_l1.cpp
     tests/test_parallel_for.cpp
     tests/test_union_find.cpp
 )
diff --git a/include/dbscan_grid2d_l1.h b/include/dbscan_grid2d_l1.h
new file mode 100644
index 0000000..7527c04
--- /dev/null
+++ b/include/dbscan_grid2d_l1.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+namespace dbscan {
+
+struct DBSCANGrid2D_L1 {
+  uint32_t eps;
+  uint32_t min_samples;
+
+  DBSCANGrid2D_L1(uint32_t eps_value, uint32_t min_samples_value);
+
+  [[nodiscard]] std::vector<int32_t> fit_predict(const uint32_t *x, const uint32_t *y, std::size_t count) const;
+};
+
+} // namespace dbscan
diff --git a/src/dbscan_grid2d_l1.cpp b/src/dbscan_grid2d_l1.cpp
new file mode 100644
index 0000000..a0d8575
--- /dev/null
+++ b/src/dbscan_grid2d_l1.cpp
@@ -0,0 +1,189 @@
+#include "dbscan_grid2d_l1.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <numeric>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+namespace dbscan {
+
+namespace {
+
+constexpr uint64_t pack_cell(uint32_t ix, uint32_t iy) noexcept {
+  return (static_cast<uint64_t>(ix) << 32U) | static_cast<uint64_t>(iy);
+}
+
+constexpr uint32_t cell_of(uint32_t value, uint32_t cell_size) noexcept {
+  return cell_size == 0 ? value : static_cast<uint32_t>(value / cell_size);
+}
+
+template <typename Fn>
+void for_each_neighbor(uint32_t point_index, uint32_t eps, const uint32_t *x, const uint32_t *y,
+                       const std::vector<uint32_t> &cell_x, const std::vector<uint32_t> &cell_y,
+                       const std::vector<uint32_t> &ordered_indices, const std::vector<size_t> &cell_offsets,
+                       const std::vector<uint64_t> &unique_keys, Fn &&fn) {
+
+  const uint32_t base_cx = cell_x[point_index];
+  const uint32_t base_cy = cell_y[point_index];
+
+  for (int dx = -1; dx <= 1; ++dx) {
+    const int64_t nx = static_cast<int64_t>(base_cx) + dx;
+    if (nx < 0)
+      continue;
+
+    for (int dy = -1; dy <= 1; ++dy) {
+      const int64_t ny = static_cast<int64_t>(base_cy) + dy;
+      if (ny < 0)
+        continue;
+
+      const uint64_t key = pack_cell(static_cast<uint32_t>(nx), static_cast<uint32_t>(ny));
+      const auto it = std::lower_bound(unique_keys.begin(), unique_keys.end(), key);
+      if (it == unique_keys.end() || *it != key)
+        continue;
+
+      const std::size_t cell_idx = static_cast<std::size_t>(std::distance(unique_keys.begin(), it));
+      const std::size_t begin = cell_offsets[cell_idx];
+      const std::size_t end = cell_offsets[cell_idx + 1];
+
+      for (std::size_t pos = begin; pos < end; ++pos) {
+        const uint32_t neighbor_idx = ordered_indices[pos];
+
+        const uint32_t x_a = x[point_index];
+        const uint32_t y_a = y[point_index];
+        const uint32_t x_b = x[neighbor_idx];
+        const uint32_t y_b = y[neighbor_idx];
+
+        const uint32_t dx_abs = x_a > x_b ? x_a - x_b : x_b - x_a;
+        const uint32_t dy_abs = y_a > y_b ? y_a - y_b : y_b - y_a;
+        const uint64_t manhattan = static_cast<uint64_t>(dx_abs) + static_cast<uint64_t>(dy_abs);
+
+        if (manhattan <= static_cast<uint64_t>(eps)) {
+          if (!fn(neighbor_idx))
+            return;
+        }
+      }
+    }
+  }
+}
+
+} // namespace
+
+DBSCANGrid2D_L1::DBSCANGrid2D_L1(uint32_t eps_value, uint32_t min_samples_value)
+    : eps(eps_value), min_samples(min_samples_value) {
+  if (eps == 0)
+    throw std::invalid_argument("eps must be greater than zero for DBSCANGrid2D_L1");
+  if (min_samples == 0)
+    throw std::invalid_argument("min_samples must be greater than zero for DBSCANGrid2D_L1");
+}
+
+std::vector<int32_t> DBSCANGrid2D_L1::fit_predict(const uint32_t *x, const uint32_t *y, std::size_t count) const {
+  if (x == nullptr || y == nullptr)
+    throw std::invalid_argument("Input coordinate arrays must be non-null");
+
+  if (count == 0)
+    return {};
+
+  const uint32_t cell_size = eps;
+
+  std::vector<uint32_t> cell_x(count);
+  std::vector<uint32_t> cell_y(count);
+  std::vector<uint64_t> keys(count);
+  std::vector<uint32_t> ordered_indices(count);
+  std::iota(ordered_indices.begin(), ordered_indices.end(), 0U);
+
+  for (std::size_t i = 0; i < count; ++i) {
+    const uint32_t cx = cell_of(x[i], cell_size);
+    const uint32_t cy = cell_of(y[i], cell_size);
+    cell_x[i] = cx;
+    cell_y[i] = cy;
+    keys[i] = pack_cell(cx, cy);
+  }
+
+  std::sort(ordered_indices.begin(), ordered_indices.end(), [&](uint32_t lhs, uint32_t rhs) {
+    const uint64_t key_lhs = keys[lhs];
+    const uint64_t key_rhs = keys[rhs];
+    if (key_lhs == key_rhs)
+      return lhs < rhs;
+    return key_lhs < key_rhs;
+  });
+
+  std::vector<std::size_t> cell_offsets;
+  cell_offsets.reserve(count + 1);
+  std::vector<uint64_t> unique_keys;
+  unique_keys.reserve(count);
+
+  std::size_t pos = 0;
+  while (pos < count) {
+    const uint64_t key = keys[ordered_indices[pos]];
+    unique_keys.push_back(key);
+    cell_offsets.push_back(pos);
+
+    do {
+      ++pos;
+    } while (pos < count && keys[ordered_indices[pos]] == key);
+  }
+  cell_offsets.push_back(count);
+
+  std::vector<int32_t> labels(count, -1);
+  std::vector<uint8_t> is_core(count, 0U);
+
+  const uint32_t eps_value = eps;
+  const uint32_t min_samples_value = min_samples;
+
+  for (std::size_t i = 0; i < count; ++i) {
+    uint32_t neighbor_count = 0;
+    for_each_neighbor(static_cast<uint32_t>(i), eps_value, x, y, cell_x, cell_y, ordered_indices, cell_offsets,
+                      unique_keys, [&](uint32_t) {
+                        ++neighbor_count;
+                        return neighbor_count < min_samples_value;
+                      });
+
+    if (neighbor_count >= min_samples_value)
+      is_core[i] = 1U;
+  }
+
+  std::vector<uint32_t> stack;
+  stack.reserve(count);
+  std::vector<uint32_t> neighbor_buffer;
+  neighbor_buffer.reserve(64);
+
+  int32_t next_label = 0;
+  for (std::size_t i = 0; i < count; ++i) {
+    if (!is_core[i] || labels[i] != -1)
+      continue;
+
+    labels[i] = next_label;
+    stack.clear();
+    stack.push_back(static_cast<uint32_t>(i));
+
+    while (!stack.empty()) {
+      const uint32_t current = stack.back();
+      stack.pop_back();
+
+      neighbor_buffer.clear();
+      for_each_neighbor(current, eps_value, x, y, cell_x, cell_y, ordered_indices, cell_offsets, unique_keys,
+                        [&](uint32_t neighbor) {
+                          neighbor_buffer.push_back(neighbor);
+                          return true;
+                        });
+
+      for (uint32_t neighbor : neighbor_buffer) {
+        if (labels[neighbor] == -1) {
+          labels[neighbor] = next_label;
+          if (is_core[neighbor])
+            stack.push_back(neighbor);
+        }
+      }
+    }
+
+    ++next_label;
+  }
+
+  return labels;
+}
+
+} // namespace dbscan
diff --git a/tests/test_dbscan_grid2d_l1.cpp b/tests/test_dbscan_grid2d_l1.cpp
new file mode 100644
index 0000000..bf6100a
--- /dev/null
+++ b/tests/test_dbscan_grid2d_l1.cpp
@@ -0,0 +1,96 @@
+#include "dbscan_grid2d_l1.h"
+
+#include <catch2/catch_test_macros.hpp>
+
+#include <cstdint>
+#include <filesystem>
+#include <fstream>
+#include <vector>
+
+namespace {
+
+void load_fixture(const std::filesystem::path &data_path, const std::filesystem::path &truth_path,
+                  std::vector<uint32_t> &x_out, std::vector<uint32_t> &y_out, std::vector<int32_t> &truth_out) {
+  std::ifstream data_stream(data_path, std::ios::binary);
+  REQUIRE(data_stream.good());
+
+  data_stream.seekg(0, std::ios::end);
+  const auto data_size = data_stream.tellg();
+  data_stream.seekg(0, std::ios::beg);
+
+  REQUIRE(data_size % (sizeof(uint32_t) * 2) == 0);
+  const std::size_t point_count = static_cast<std::size_t>(data_size) / (sizeof(uint32_t) * 2);
+
+  x_out.assign(point_count, 0U);
+  y_out.assign(point_count, 0U);
+
+  for (std::size_t i = 0; i < point_count; ++i) {
+    uint32_t y = 0;
+    uint32_t x = 0;
+    data_stream.read(reinterpret_cast<char *>(&y), sizeof(uint32_t));
+    data_stream.read(reinterpret_cast<char *>(&x), sizeof(uint32_t));
+    REQUIRE(data_stream.good());
+    y_out[i] = y;
+    x_out[i] = x;
+  }
+
+  std::ifstream truth_stream(truth_path, std::ios::binary);
+  REQUIRE(truth_stream.good());
+
+  truth_stream.seekg(0, std::ios::end);
+  const auto truth_size = truth_stream.tellg();
+  truth_stream.seekg(0, std::ios::beg);
+
+  REQUIRE(truth_size % sizeof(int32_t) == 0);
+  const std::size_t truth_count = static_cast<std::size_t>(truth_size) / sizeof(int32_t);
+  REQUIRE(truth_count == point_count);
+
+  truth_out.assign(truth_count, 0);
+  truth_stream.read(reinterpret_cast<char *>(truth_out.data()),
+                    static_cast<std::streamsize>(truth_count * sizeof(int32_t)));
+  REQUIRE(truth_stream.good());
+}
+
+} // namespace
+
+TEST_CASE("DBSCANGrid2D_L1 clusters dense neighbors", "[dbscan][grid_l1]") {
+  const std::vector<uint32_t> x = {0, 1, 2, 100};
+  const std::vector<uint32_t> y = {0, 0, 1, 200};
+
+  dbscan::DBSCANGrid2D_L1 algo(4, 3);
+  auto labels = algo.fit_predict(x.data(), y.data(), x.size());
+
+  REQUIRE(labels.size() == x.size());
+  REQUIRE(labels[0] == labels[1]);
+  REQUIRE(labels[1] == labels[2]);
+  REQUIRE(labels[0] != -1);
+  REQUIRE(labels[3] == -1);
+}
+
+TEST_CASE("DBSCANGrid2D_L1 respects min_samples threshold", "[dbscan][grid_l1]") {
+  const std::vector<uint32_t> coords = {0, 2, 4};
+  dbscan::DBSCANGrid2D_L1 algo(3, 4);
+  auto labels = algo.fit_predict(coords.data(), coords.data(), coords.size());
+
+  REQUIRE(labels.size() == coords.size());
+  for (int32_t label : labels) {
+    REQUIRE(label == -1);
+  }
+}
+
+TEST_CASE("DBSCANGrid2D_L1 matches fixture truth", "[dbscan][grid_l1]") {
+  const std::filesystem::path root = std::filesystem::path{"tests"} / "data";
+  const auto data_path = root / "dbscan_static_data.bin";
+  const auto truth_path = root / "dbscan_static_truth.bin";
+
+  std::vector<uint32_t> x;
+  std::vector<uint32_t> y;
+  std::vector<int32_t> truth;
+  load_fixture(data_path, truth_path, x, y, truth);
+
+  dbscan::DBSCANGrid2D_L1 algo(10, 3);
+  auto labels = algo.fit_predict(x.data(), y.data(), x.size());
+
+  REQUIRE(labels.size() == truth.size());
+  REQUIRE(labels == truth);
+}
diff --git a/tools/dbscan_dataset_validator.cpp b/tools/dbscan_dataset_validator.cpp
index fe246d8..2c5a26e 100644
--- a/tools/dbscan_dataset_validator.cpp
+++ b/tools/dbscan_dataset_validator.cpp
@@ -1,5 +1,6 @@
 #include <algorithm>
 #include <chrono>
+#include <cmath>
 #include <cstdint>
 #include <cstdlib>
 #include <filesystem>
@@ -14,6 +15,7 @@
 #include <vector>
 
 #include "dbscan.h"
+#include "dbscan_grid2d_l1.h"
 #include "dbscan_optimized.h"
 
 namespace {
@@ -24,14 +26,16 @@ struct Options {
   double eps{60.0};
   int32_t min_samples{16};
 
-  enum class Implementation { Baseline, Optimized, Both } implementation{Implementation::Both};
+  bool run_baseline{true};
+  bool run_optimized{true};
+  bool run_grid_l1{false};
   std::optional<std::filesystem::path> mismatch_output_dir;
 };
 
 void print_usage(const char *program_name) {
   std::cout << "Usage: " << program_name
             << " [--data <data.bin>] [--truth <truth.bin>] [--eps <value>] [--min-samples <value>]"
-            << " [--impl baseline|optimized|both] [--dump-mismatches <directory>]\n";
+            << " [--impl baseline|optimized|grid|both|all] [--dump-mismatches <directory>]\n";
 }
 
 Options parse_arguments(int argc, char **argv) {
@@ -61,16 +65,30 @@ Options parse_arguments(int argc, char **argv) {
       options.min_samples = static_cast<int32_t>(std::stoi(argv[++i]));
     } else if (arg == "--impl") {
       if (i + 1 >= argc)
-        throw std::invalid_argument("--impl expects one of: baseline, optimized, both");
+        throw std::invalid_argument("--impl expects one of: baseline, optimized, grid, both, all");
       const std::string value{argv[++i]};
       if (value == "baseline") {
-        options.implementation = Options::Implementation::Baseline;
+        options.run_baseline = true;
+        options.run_optimized = false;
+        options.run_grid_l1 = false;
       } else if (value == "optimized") {
-        options.implementation = Options::Implementation::Optimized;
+        options.run_baseline = false;
+        options.run_optimized = true;
+        options.run_grid_l1 = false;
+      } else if (value == "grid" || value == "grid_l1") {
+        options.run_baseline = false;
+        options.run_optimized = false;
+        options.run_grid_l1 = true;
       } else if (value == "both") {
-        options.implementation = Options::Implementation::Both;
+        options.run_baseline = true;
+        options.run_optimized = true;
+        options.run_grid_l1 = false;
+      } else if (value == "all") {
+        options.run_baseline = true;
+        options.run_optimized = true;
+        options.run_grid_l1 = true;
       } else {
-        throw std::invalid_argument("--impl expects one of: baseline, optimized, both");
+        throw std::invalid_argument("--impl expects one of: baseline, optimized, grid, both, all");
       }
     } else if (arg == "--dump-mismatches") {
       if (i + 1 >= argc)
@@ -89,7 +107,8 @@ Options parse_arguments(int argc, char **argv) {
   return options;
 }
 
-std::vector<dbscan::Point<double>> load_points(const std::filesystem::path &path) {
+std::vector<dbscan::Point<double>> load_points(const std::filesystem::path &path, std::vector<uint32_t> *x_out,
+                                               std::vector<uint32_t> *y_out) {
   std::ifstream stream(path, std::ios::binary);
   if (!stream)
     throw std::runtime_error("Failed to open data file: " + path.string());
@@ -110,10 +129,19 @@ std::vector<dbscan::Point<double>> load_points(const std::filesystem::path &path
     throw std::runtime_error("Failed to read data file: " + path.string());
 
   std::vector<dbscan::Point<double>> points(num_points);
+  if (x_out)
+    x_out->assign(num_points, 0U);
+  if (y_out)
+    y_out->assign(num_points, 0U);
+
   for (std::size_t i = 0; i < num_points; ++i) {
     const uint32_t y = raw[2 * i];
     const uint32_t x = raw[2 * i + 1];
     points[i] = dbscan::Point<double>{static_cast<double>(x), static_cast<double>(y)};
+    if (x_out)
+      (*x_out)[i] = x;
+    if (y_out)
+      (*y_out)[i] = y;
   }
 
   return points;
@@ -306,7 +334,9 @@ int main(int argc, char **argv) {
   try {
     const Options options = parse_arguments(argc, argv);
 
-    const auto points = load_points(options.data_path);
+    std::vector<uint32_t> x_coords;
+    std::vector<uint32_t> y_coords;
+    const auto points = load_points(options.data_path, &x_coords, &y_coords);
     const auto truth_labels = load_labels(options.truth_path);
 
     if (points.size() != truth_labels.size())
@@ -320,10 +350,9 @@ int main(int argc, char **argv) {
     std::cout << "Ground truth clusters: " << truth_cluster_count << "; noise points: " << truth_noise_count << "\n";
 
     std::vector<RunResult> results;
-    results.reserve(2);
+    results.reserve(3);
 
-    if (options.implementation == Options::Implementation::Baseline ||
-        options.implementation == Options::Implementation::Both) {
+    if (options.run_baseline) {
       std::cout << "\n[baseline] Running clustering..." << std::flush;
       const auto start = std::chrono::steady_clock::now();
       dbscan::DBSCAN<double> baseline(options.eps, options.min_samples);
@@ -348,8 +377,7 @@ int main(int argc, char **argv) {
       }
     }
 
-    if (options.implementation == Options::Implementation::Optimized ||
-        options.implementation == Options::Implementation::Both) {
+    if (options.run_optimized) {
       std::cout << "\n[optimized] Running clustering..." << std::flush;
       const auto start = std::chrono::steady_clock::now();
       dbscan::DBSCANOptimized<double> optimized(options.eps, options.min_samples);
@@ -374,6 +402,37 @@ int main(int argc, char **argv) {
       }
     }
 
+    if (options.run_grid_l1) {
+      const auto eps_int = static_cast<uint32_t>(std::llround(options.eps));
+      if (std::fabs(options.eps - static_cast<double>(eps_int)) > 1e-6) {
+        throw std::invalid_argument("grid_l1 implementation requires integer eps value");
+      }
+      if (x_coords.size() != y_coords.size())
+        throw std::runtime_error("Mismatch between x and y coordinate counts");
+
+      std::cout << "\n[grid_l1] Running clustering..." << std::flush;
+      const auto start = std::chrono::steady_clock::now();
+      dbscan::DBSCANGrid2D_L1 grid_algo(eps_int, static_cast<uint32_t>(options.min_samples));
+      const auto labels = grid_algo.fit_predict(x_coords.data(), y_coords.data(), x_coords.size());
+      std::vector<std::size_t> mismatches;
+      const auto metrics = evaluate(labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr);
+      const auto end = std::chrono::steady_clock::now();
+      const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+      std::cout << " done in " << elapsed_ms << " ms" << std::endl;
+      results.push_back({"grid_l1", metrics});
+
+      if (options.mismatch_output_dir && !mismatches.empty()) {
+        std::filesystem::create_directories(*options.mismatch_output_dir);
+        auto file_path = *options.mismatch_output_dir / "grid_l1_mismatches.txt";
+        std::ofstream out(file_path);
+        if (!out)
+          throw std::runtime_error("Failed to open mismatch output file: " + file_path.string());
+        for (std::size_t index : mismatches)
+          out << index << '\n';
+        std::cout << "[grid_l1] Wrote " << mismatches.size() << " mismatches to " << file_path << "\n";
+      }
+    }
+
     std::cout << std::fixed << std::setprecision(6);
 
     bool all_passed = true;

From b2f46dc70828dc62dcf1384dca799d82f5f75480 Mon Sep 17 00:00:00 2001
From: Bo Lu <bo.lu@utexas.edu>
Date: Tue, 30 Sep 2025 22:23:32 -0700
Subject: [PATCH 03/12] commit some test files

---
 tests/data/dbscan_static_data.bin  | Bin 0 -> 80 bytes
 tests/data/dbscan_static_truth.bin | Bin 0 -> 40 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tests/data/dbscan_static_data.bin
 create mode 100644 tests/data/dbscan_static_truth.bin

diff --git a/tests/data/dbscan_static_data.bin b/tests/data/dbscan_static_data.bin
new file mode 100644
index 0000000000000000000000000000000000000000..0fe411357874f5466ac30d7899c6a3450886634f
GIT binary patch
literal 80
zcmYdcU|>iA;#45c1mZLxP6FZtDEkDE2H{gcd<=+>1MyiXJ`JQNFhcMlAZ-NX0|2}%
B3y1&!

literal 0
HcmV?d00001

diff --git a/tests/data/dbscan_static_truth.bin b/tests/data/dbscan_static_truth.bin
new file mode 100644
index 0000000000000000000000000000000000000000..64aa8ea48cbbe80e618c97cf8308a34cdb51e386
GIT binary patch
literal 40
RcmZQzKn09IE;9ZP1pp*W2mJs5

literal 0
HcmV?d00001


From 619a6764cc894bb7caf76c5213ad03c1c82402d7 Mon Sep 17 00:00:00 2001
From: Bo Lu <bo.lu@utexas.edu>
Date: Tue, 30 Sep 2025 22:29:03 -0700
Subject: [PATCH 04/12] support l1/l2 metric

---
 scripts/generate_test_data.py | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/scripts/generate_test_data.py b/scripts/generate_test_data.py
index 16a893b..fbb362c 100755
--- a/scripts/generate_test_data.py
+++ b/scripts/generate_test_data.py
@@ -50,6 +50,12 @@ def parse_args() -> argparse.Namespace:
         default=60.0,
         help="DBSCAN epsilon radius in pixels for scikit-learn clustering (default: 60).",
     )
+    parser.add_argument(
+        "--metric",
+        type=str,
+        default="l2",
+        help="Distance metric: 'l2' (Euclidean) or 'l1' (Manhattan).",
+    )
     parser.add_argument(
         "--min-samples",
         type=int,
@@ -143,7 +149,15 @@ def main() -> None:
     # Prepare float coordinates for clustering (x, y order for scikit-learn).
     coords_xy_float = all_coords_yx[:, [1, 0]].astype(np.float64)
 
-    dbscan = DBSCAN(eps=args.eps, min_samples=args.min_samples, n_jobs=-1)
+    metric = args.metric.lower()
+    if metric == "l1":
+        sklearn_metric = "cityblock"
+    elif metric == "l2":
+        sklearn_metric = "euclidean"
+    else:
+        raise ValueError("Unsupported metric. Choose 'l1' or 'l2'.")
+
+    dbscan = DBSCAN(eps=args.eps, min_samples=args.min_samples, metric=sklearn_metric, n_jobs=-1)
     labels = dbscan.fit_predict(coords_xy_float)
 
     # Persist data in requested layout: (y, x) pairs as uint32.
@@ -156,7 +170,9 @@ def main() -> None:
 
     print(f"Generated {all_coords_yx.shape[0]} total points.")
     print(f"Uniform points: {uniform_coords.shape[0]}, clustered points: {cluster_coords.shape[0]}.")
-    print(f"DBSCAN discovered {cluster_labels.size} clusters and {noise_count} noise points.")
+    print(
+        f"DBSCAN ({metric.upper()}) discovered {cluster_labels.size} clusters and {noise_count} noise points."
+    )
     print(f"Data written to {args.data_file.resolve()} and labels to {args.truth_file.resolve()}.")
 
 
From 9fc31ffe0540cdee89bbfa35b63ac1a0cdd10574 Mon Sep 17 00:00:00 2001
From: Bo Lu <bo.lu@utexas.edu>
Date: Tue, 30 Sep 2025 22:33:35 -0700
Subject: [PATCH 05/12] update benchmark to focus on l1 gri dimplementation

---
 benchmark/benchmark_dbscan.cpp | 203 +++++++++++++--------------------
 1 file changed, 81 insertions(+), 122 deletions(-)

diff --git a/benchmark/benchmark_dbscan.cpp b/benchmark/benchmark_dbscan.cpp
index 7fb065c..b7cec7c 100644
--- a/benchmark/benchmark_dbscan.cpp
+++ b/benchmark/benchmark_dbscan.cpp
@@ -1,145 +1,104 @@
-#include "dbscan.h"
-#include "dbscan_optimized.h"
-#include <chrono>
-#include <cstdlib>
-#include <ctime>
+#include "dbscan_grid2d_l1.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
 #include <iostream>
-#include <memory>
+#include <limits>
 #include <nanobench.h>
+#include <random>
 #include <string>
 #include <vector>
 
-// Generate clustered 2D data for benchmarking
-std::vector<dbscan::Point<double>> generate_benchmark_data(size_t n_points, int n_clusters = 8) {
-  std::vector<dbscan::Point<double>> points;
-  points.reserve(n_points);
-
-  // Create clusters
-  for (int c = 0; c < n_clusters; ++c) {
-    double center_x = c * 5.0;
-    double center_y = c * 5.0;
-    size_t points_per_cluster = n_points / n_clusters;
-
-    for (size_t i = 0; i < points_per_cluster; ++i) {
-      double x = center_x + (static_cast<double>(rand()) / RAND_MAX - 0.5) * 2.0;
-      double y = center_y + (static_cast<double>(rand()) / RAND_MAX - 0.5) * 2.0;
-      points.push_back({x, y});
-    }
-  }
+namespace {
 
-  // Add some noise points
-  size_t noise_points = n_points / 10;
-  for (size_t i = 0; i < noise_points; ++i) {
-    double x = 50.0 + (static_cast<double>(rand()) / RAND_MAX - 0.5) * 20.0;
-    double y = 50.0 + (static_cast<double>(rand()) / RAND_MAX - 0.5) * 20.0;
-    points.push_back({x, y});
-  }
+struct Uint32Dataset {
+  std::vector<uint32_t> x;
+  std::vector<uint32_t> y;
+};
 
-  return points;
-}
+Uint32Dataset generate_uint32_dataset(std::size_t cluster_count, std::size_t points_per_cluster,
+                                      std::size_t noise_points, uint32_t area_width, uint32_t cluster_sigma,
+                                      std::mt19937 &rng) {
+  std::uniform_real_distribution<double> uniform_dist(0.0, static_cast<double>(area_width));
+  std::normal_distribution<double> normal_dist(0.0, static_cast<double>(cluster_sigma));
 
-int main() {
-  // Seed random number generator
-  srand(static_cast<unsigned int>(time(nullptr)));
+  Uint32Dataset dataset;
+  dataset.x.reserve(cluster_count * points_per_cluster + noise_points);
+  dataset.y.reserve(cluster_count * points_per_cluster + noise_points);
 
-  ankerl::nanobench::Bench bench;
+  for (std::size_t c = 0; c < cluster_count; ++c) {
+    const double center_x = uniform_dist(rng);
+    const double center_y = uniform_dist(rng);
 
-  // Benchmark different data sizes
-  std::vector<size_t> data_sizes = {1000, 10000, 50000, 100000};
-
-  for (size_t n_points : data_sizes) {
-    std::cout << "\n=== Benchmarking with " << n_points << " points ===" << std::endl;
-
-    // Generate test data
-    auto points = generate_benchmark_data(n_points);
-
-    // Benchmark original DBSCAN
-    bench.title("Original DBSCAN").run("Original DBSCAN " + std::to_string(n_points) + " points", [&]() {
-      dbscan::DBSCAN<double> dbscan(0.8, 5);
-      auto result = dbscan.cluster(points);
-      ankerl::nanobench::doNotOptimizeAway(result);
-    });
-
-    // Benchmark optimized DBSCAN
-    bench.title("Optimized DBSCAN").run("Optimized DBSCAN " + std::to_string(n_points) + " points", [&]() {
-      dbscan::DBSCANOptimized<double> dbscan(0.8, 5);
-      auto result = dbscan.cluster(points);
-      ankerl::nanobench::doNotOptimizeAway(result);
-    });
+    for (std::size_t i = 0; i < points_per_cluster; ++i) {
+      const double sample_x = center_x + normal_dist(rng);
+      const double sample_y = center_y + normal_dist(rng);
 
-    // Memory usage comparison
-    {
-      dbscan::DBSCAN<double> original_dbscan(0.8, 5);
-      auto original_result = original_dbscan.cluster(points);
+      const uint32_t clamped_x =
+          static_cast<uint32_t>(std::min(static_cast<double>(area_width - 1), std::max(0.0, std::round(sample_x))));
+      const uint32_t clamped_y =
+          static_cast<uint32_t>(std::min(static_cast<double>(area_width - 1), std::max(0.0, std::round(sample_y))));
 
-      dbscan::DBSCANOptimized<double> optimized_dbscan(0.8, 5);
-      auto optimized_result = optimized_dbscan.cluster(points);
-
-      std::cout << "Original DBSCAN found " << original_result.num_clusters << " clusters" << std::endl;
-      std::cout << "Optimized DBSCAN found " << optimized_result.num_clusters << " clusters" << std::endl;
+      dataset.x.push_back(clamped_x);
+      dataset.y.push_back(clamped_y);
     }
   }
 
-  // Performance comparison with different parameters
-  std::cout << "\n=== Parameter Sensitivity Benchmark ===" << std::endl;
-
-  auto test_points = generate_benchmark_data(10000);
+  std::uniform_int_distribution<uint32_t> uniform_int(0, area_width - 1);
+  for (std::size_t i = 0; i < noise_points; ++i) {
+    dataset.x.push_back(uniform_int(rng));
+    dataset.y.push_back(uniform_int(rng));
+  }
 
-  // Different eps values
-  std::vector<double> eps_values = {0.3, 0.5, 0.8, 1.2};
+  return dataset;
+}
 
-  for (double eps : eps_values) {
-    bench.title("EPS Parameter").run("Optimized DBSCAN eps=" + std::to_string(eps), [&]() {
-      dbscan::DBSCANOptimized<double> dbscan(eps, 5);
-      auto result = dbscan.cluster(test_points);
-      ankerl::nanobench::doNotOptimizeAway(result);
-    });
-  }
+} // namespace
 
-  // Different min_pts values
-  std::vector<int> min_pts_values = {3, 5, 10, 15};
+int main() {
+  constexpr uint32_t area_width = 1'000'000;
+  constexpr uint32_t cluster_sigma = 50; // Approximately 3 sigma ~ 150 px footprint
+  constexpr uint32_t eps = 60;
+  constexpr uint32_t min_samples = 16;
 
-  for (int min_pts : min_pts_values) {
-    bench.title("MinPts Parameter").run("Optimized DBSCAN min_pts=" + std::to_string(min_pts), [&]() {
-      dbscan::DBSCANOptimized<double> dbscan(0.8, min_pts);
-      auto result = dbscan.cluster(test_points);
-      ankerl::nanobench::doNotOptimizeAway(result);
+  std::mt19937 rng(1337u);
+  ankerl::nanobench::Bench bench;
+  bench.title("DBSCANGrid2D_L1");
+  bench.relative(true);
+
+  struct Scenario {
+    std::size_t clusters;
+    std::size_t points_per_cluster;
+  };
+
+  const std::vector<Scenario> scenarios = {
+      {32, 256},
+      {64, 256},
+      {100, 256},
+      {128, 256},
+  };
+
+  std::cout << "Benchmarking DBSCANGrid2D_L1 with Manhattan distance" << std::endl;
+  std::cout << "eps=" << eps << ", min_samples=" << min_samples << std::endl;
+
+  for (const auto &scenario : scenarios) {
+    const std::size_t cluster_points = scenario.clusters * scenario.points_per_cluster;
+    const std::size_t noise_points = cluster_points * 2; // 2x noise compared to clustered points
+
+    auto dataset = generate_uint32_dataset(scenario.clusters, scenario.points_per_cluster, noise_points, area_width,
+                                           cluster_sigma, rng);
+
+    const std::size_t total_points = dataset.x.size();
+    std::cout << "\nScenario: " << scenario.clusters << " clusters, " << scenario.points_per_cluster
+              << " points/cluster, total points=" << total_points << std::endl;
+
+    bench.run("grid-l1 " + std::to_string(total_points) + " pts", [&]() {
+      dbscan::DBSCANGrid2D_L1 algo(eps, min_samples);
+      auto labels = algo.fit_predict(dataset.x.data(), dataset.y.data(), total_points);
+      ankerl::nanobench::doNotOptimizeAway(labels);
     });
   }
 
-  // Detailed performance analysis
-  std::cout << "\n=== Detailed Performance Analysis ===" << std::endl;
-
-  auto large_dataset = generate_benchmark_data(50000);
-
-  // Time both implementations on larger dataset
-  {
-    std::cout << "Running performance comparison on 50k points..." << std::endl;
-
-    // Original DBSCAN timing
-    auto start_time = std::chrono::high_resolution_clock::now();
-    dbscan::DBSCAN<double> original_dbscan(0.8, 5);
-    auto original_result = original_dbscan.cluster(large_dataset);
-    auto end_time = std::chrono::high_resolution_clock::now();
-    auto original_duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
-
-    // Optimized DBSCAN timing
-    start_time = std::chrono::high_resolution_clock::now();
-    dbscan::DBSCANOptimized<double> optimized_dbscan(0.8, 5);
-    auto optimized_result = optimized_dbscan.cluster(large_dataset);
-    end_time = std::chrono::high_resolution_clock::now();
-    auto optimized_duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
-
-    std::cout << "Original DBSCAN: " << original_duration.count() << "ms, " << original_result.num_clusters
-              << " clusters" << std::endl;
-    std::cout << "Optimized DBSCAN: " << optimized_duration.count() << "ms, " << optimized_result.num_clusters
-              << " clusters" << std::endl;
-
-    if (original_duration.count() > 0) {
-      double speedup = static_cast<double>(original_duration.count()) / optimized_duration.count();
-      std::cout << "Speedup: " << speedup << "x" << std::endl;
-    }
-  }
-
   return 0;
-}
\ No newline at end of file
+}

From 0507e24a966f85369ca234c69835fcb51b8aa767 Mon Sep 17 00:00:00 2001
From: Bo Lu <bo.lu@utexas.edu>
Date: Tue, 30 Sep 2025 22:39:40 -0700
Subject: [PATCH 06/12] add a test for parallelization

---
 CMakeLists.txt             |  1 +
 include/parallel.hpp       | 36 +++++++++++++++++++++++-
 tests/test_parallelize.cpp | 56 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 92 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_parallelize.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2bcc52b..bcc6c50 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,6 +67,7 @@ add_executable(dbscan_tests
     tests/test_dbscan_optimized.cpp
     tests/test_dbscan_grid2d_l1.cpp
     tests/test_parallel_for.cpp
+    tests/test_parallelize.cpp
     tests/test_union_find.cpp
 )
 
diff --git a/include/parallel.hpp b/include/parallel.hpp
index dc07709..5487241 100644
--- a/include/parallel.hpp
+++ b/include/parallel.hpp
@@ -33,4 +33,38 @@ inline void parallel_for(size_t begin, size_t end, size_t n_threads, const std::
     th.join();
 }
 
-} // namespace utils
\ No newline at end of file
+inline void parallelize(size_t begin, size_t end, size_t num_threads, size_t chunk_size,
+                        std::function<void(size_t, size_t)> &&chunk_processor) {
+  if (begin >= end)
+    return;
+
+  if (num_threads == 0)
+    num_threads = std::thread::hardware_concurrency();
+  if (num_threads == 0)
+    num_threads = 1;
+
+  if (chunk_size == 0)
+    chunk_size = std::max<std::size_t>(1, (end - begin + num_threads - 1) / num_threads);
+
+  std::atomic<size_t> next_begin{begin};
+  std::vector<std::thread> workers;
+  workers.reserve(num_threads);
+
+  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
+    workers.emplace_back([&, chunk_size]() {
+      while (true) {
+        size_t start = next_begin.fetch_add(chunk_size, std::memory_order_relaxed);
+        if (start >= end)
+          break;
+
+        size_t stop = std::min(end, start + chunk_size);
+        chunk_processor(start, stop);
+      }
+    });
+  }
+
+  for (auto &worker : workers)
+    worker.join();
+}
+
+} // namespace utils
diff --git a/tests/test_parallelize.cpp b/tests/test_parallelize.cpp
new file mode 100644
index 0000000..13c2650
--- /dev/null
+++ b/tests/test_parallelize.cpp
@@ -0,0 +1,56 @@
+#include "parallel.hpp"
+
+#include <atomic>
+#include <catch2/catch_test_macros.hpp>
+#include <cstddef>
+#include <numeric>
+#include <vector>
+
+TEST_CASE("parallelize processes full range", "[parallel]") {
+  constexpr std::size_t N = 10'000;
+  std::vector<std::atomic<bool>> visited(N);
+  for (auto &flag : visited)
+    flag.store(false, std::memory_order_relaxed);
+
+  utils::parallelize(0, N, 4, 128, [&](std::size_t begin, std::size_t end) {
+    for (std::size_t i = begin; i < end; ++i) {
+      bool expected = false;
+      REQUIRE(visited[i].compare_exchange_strong(expected, true, std::memory_order_relaxed));
+    }
+  });
+
+  for (const auto &flag : visited) {
+    REQUIRE(flag.load(std::memory_order_relaxed));
+  }
+}
+
+TEST_CASE("parallelize handles uneven chunks", "[parallel]") {
+  constexpr std::size_t N = 1'023;
+  std::vector<std::atomic<int>> counts(N);
+  for (auto &value : counts)
+    value.store(0, std::memory_order_relaxed);
+
+  utils::parallelize(0, N, 3, 100, [&](std::size_t begin, std::size_t end) {
+    for (std::size_t i = begin; i < end; ++i) {
+      counts[i].fetch_add(1, std::memory_order_relaxed);
+    }
+  });
+
+  for (const auto &value : counts) {
+    REQUIRE(value.load(std::memory_order_relaxed) == 1);
+  }
+}
+
+TEST_CASE("parallelize default chunk size", "[parallel]") {
+  constexpr std::size_t N = 5'000;
+  std::vector<int> out(N, 0);
+
+  utils::parallelize(0, N, 8, 0, [&](std::size_t begin, std::size_t end) {
+    for (std::size_t i = begin; i < end; ++i) {
+      out[i] = static_cast<int>(i);
+    }
+  });
+
+  for (std::size_t i = 0; i < N; ++i)
+    REQUIRE(out[i] == static_cast<int>(i));
+}

From 8884c1fcf31240043a0e382e21c2e0d73dd09b5e Mon Sep 17 00:00:00 2001
From: Bo Lu <bo.lu@utexas.edu>
Date: Tue, 30 Sep 2025 22:48:41 -0700
Subject: [PATCH 07/12] update bench

---
 benchmark/benchmark_dbscan.cpp | 31 ++++++++++++++------
 include/dbscan_grid2d_l1.h     |  5 +++-
 src/dbscan_grid2d_l1.cpp       | 52 ++++++++++++++++++++--------------
 3 files changed, 57 insertions(+), 31 deletions(-)

diff --git a/benchmark/benchmark_dbscan.cpp b/benchmark/benchmark_dbscan.cpp
index b7cec7c..95fc230 100644
--- a/benchmark/benchmark_dbscan.cpp
+++ b/benchmark/benchmark_dbscan.cpp
@@ -66,6 +66,9 @@ int main() {
   ankerl::nanobench::Bench bench;
   bench.title("DBSCANGrid2D_L1");
   bench.relative(true);
+  bench.warmup(2);
+  bench.minEpochIterations(10);
+  bench.unit("pt");
 
   struct Scenario {
     std::size_t clusters;
@@ -73,14 +76,16 @@ int main() {
   };
 
   const std::vector<Scenario> scenarios = {
-      {32, 256},
-      {64, 256},
-      {100, 256},
-      {128, 256},
+      {64, 256},  // ~16K cluster points + 32K noise => ~48K total
+      {128, 256}, // ~32K cluster points + 64K noise => ~96K total
+      {256, 256}, // ~65K cluster points + 131K noise => ~196K total
+      {512, 256}, // ~131K cluster points + 262K noise => ~393K total
+      {640, 256}, // ~163K cluster points + 327K noise => ~490K total
   };
 
   std::cout << "Benchmarking DBSCANGrid2D_L1 with Manhattan distance" << std::endl;
   std::cout << "eps=" << eps << ", min_samples=" << min_samples << std::endl;
+  std::cout << "Thread sweep: 0 (auto), 1, 2, 4, 8" << std::endl;
 
   for (const auto &scenario : scenarios) {
     const std::size_t cluster_points = scenario.clusters * scenario.points_per_cluster;
@@ -93,11 +98,19 @@ int main() {
     std::cout << "\nScenario: " << scenario.clusters << " clusters, " << scenario.points_per_cluster
               << " points/cluster, total points=" << total_points << std::endl;
 
-    bench.run("grid-l1 " + std::to_string(total_points) + " pts", [&]() {
-      dbscan::DBSCANGrid2D_L1 algo(eps, min_samples);
-      auto labels = algo.fit_predict(dataset.x.data(), dataset.y.data(), total_points);
-      ankerl::nanobench::doNotOptimizeAway(labels);
-    });
+    bench.batch(static_cast<double>(total_points));
+    bench.context("points", std::to_string(total_points));
+
+    const std::vector<std::size_t> thread_counts = {0, 1, 2, 4, 8};
+    for (std::size_t thread_count : thread_counts) {
+      const std::string label = "grid-l1 " + std::to_string(total_points) + " pts threads=" +
+                                (thread_count == 0 ? std::string("auto") : std::to_string(thread_count));
+      bench.run(label, [&]() {
+        dbscan::DBSCANGrid2D_L1 algo(eps, min_samples, thread_count);
+        auto labels = algo.fit_predict(dataset.x.data(), dataset.y.data(), total_points);
+        ankerl::nanobench::doNotOptimizeAway(labels);
+      });
+    }
   }
 
   return 0;
diff --git a/include/dbscan_grid2d_l1.h b/include/dbscan_grid2d_l1.h
index 7527c04..8f18b7d 100644
--- a/include/dbscan_grid2d_l1.h
+++ b/include/dbscan_grid2d_l1.h
@@ -9,8 +9,11 @@ namespace dbscan {
 struct DBSCANGrid2D_L1 {
   uint32_t eps;
   uint32_t min_samples;
+  std::size_t num_threads;
+  std::size_t chunk_size;
 
-  DBSCANGrid2D_L1(uint32_t eps_value, uint32_t min_samples_value);
+  DBSCANGrid2D_L1(uint32_t eps_value, uint32_t min_samples_value, std::size_t num_threads_value = 0,
+                  std::size_t chunk_size_value = 0);
 
   [[nodiscard]] std::vector<int32_t> fit_predict(const uint32_t *x, const uint32_t *y, std::size_t count) const;
 };
diff --git a/src/dbscan_grid2d_l1.cpp b/src/dbscan_grid2d_l1.cpp
index a0d8575..0489aae 100644
--- a/src/dbscan_grid2d_l1.cpp
+++ b/src/dbscan_grid2d_l1.cpp
@@ -3,12 +3,13 @@
 #include <algorithm>
 #include <cstddef>
 #include <cstdint>
-#include <limits>
 #include <numeric>
 #include <stdexcept>
 #include <utility>
 #include <vector>
 
+#include "parallel.hpp"
+
 namespace dbscan {
 
 namespace {
@@ -72,8 +73,9 @@ void for_each_neighbor(uint32_t point_index, uint32_t eps, const uint32_t *x, co
 
 } // namespace
 
-DBSCANGrid2D_L1::DBSCANGrid2D_L1(uint32_t eps_value, uint32_t min_samples_value)
-    : eps(eps_value), min_samples(min_samples_value) {
+DBSCANGrid2D_L1::DBSCANGrid2D_L1(uint32_t eps_value, uint32_t min_samples_value, std::size_t num_threads_value,
+                                 std::size_t chunk_size_value)
+    : eps(eps_value), min_samples(min_samples_value), num_threads(num_threads_value), chunk_size(chunk_size_value) {
   if (eps == 0)
     throw std::invalid_argument("eps must be greater than zero for DBSCANGrid2D_L1");
   if (min_samples == 0)
@@ -95,13 +97,17 @@ std::vector<int32_t> DBSCANGrid2D_L1::fit_predict(const uint32_t *x, const uint3
   std::vector<uint32_t> ordered_indices(count);
   std::iota(ordered_indices.begin(), ordered_indices.end(), 0U);
 
-  for (std::size_t i = 0; i < count; ++i) {
-    const uint32_t cx = cell_of(x[i], cell_size);
-    const uint32_t cy = cell_of(y[i], cell_size);
-    cell_x[i] = cx;
-    cell_y[i] = cy;
-    keys[i] = pack_cell(cx, cy);
-  }
+  const std::size_t index_chunk = chunk_size == 0 ? 1024 : chunk_size;
+  utils::parallelize(0, count, num_threads, index_chunk,
+                     [&](std::size_t begin, std::size_t end) {
+    for (std::size_t i = begin; i < end; ++i) {
+      const uint32_t cx = cell_of(x[i], cell_size);
+      const uint32_t cy = cell_of(y[i], cell_size);
+      cell_x[i] = cx;
+      cell_y[i] = cy;
+      keys[i] = pack_cell(cx, cy);
+    }
+  });
 
   std::sort(ordered_indices.begin(), ordered_indices.end(), [&](uint32_t lhs, uint32_t rhs) {
     const uint64_t key_lhs = keys[lhs];
@@ -134,17 +140,21 @@ std::vector<int32_t> DBSCANGrid2D_L1::fit_predict(const uint32_t *x, const uint3
   const uint32_t eps_value = eps;
   const uint32_t min_samples_value = min_samples;
 
-  for (std::size_t i = 0; i < count; ++i) {
-    uint32_t neighbor_count = 0;
-    for_each_neighbor(static_cast<uint32_t>(i), eps_value, x, y, cell_x, cell_y, ordered_indices, cell_offsets,
-                      unique_keys, [&](uint32_t) {
-                        ++neighbor_count;
-                        return neighbor_count < min_samples_value;
-                      });
-
-    if (neighbor_count >= min_samples_value)
-      is_core[i] = 1U;
-  }
+  const std::size_t core_chunk = chunk_size == 0 ? 512 : chunk_size;
+  utils::parallelize(0, count, num_threads, core_chunk,
+                     [&](std::size_t begin, std::size_t end) {
+    for (std::size_t idx = begin; idx < end; ++idx) {
+      uint32_t neighbor_count = 0;
+      for_each_neighbor(static_cast<uint32_t>(idx), eps_value, x, y, cell_x, cell_y, ordered_indices, cell_offsets,
+                        unique_keys, [&](uint32_t) {
+                          ++neighbor_count;
+                          return neighbor_count < min_samples_value;
+                        });
+
+      if (neighbor_count >= min_samples_value)
+        is_core[idx] = 1U;
+    }
+  });
 
   std::vector<uint32_t> stack;
   stack.reserve(count);

From 3959b5e17915b238db82747925c1fbfe5a174164 Mon Sep 17 00:00:00 2001
From: Bo Lu <bo.lu@utexas.edu>
Date: Wed, 1 Oct 2025 07:10:53 -0700
Subject: [PATCH 08/12] add performance timing instrumentation

---
 include/dbscan_grid2d_l1.h      |   6 +-
 include/perf_timer.h            |  51 ++++++++++
 src/dbscan_grid2d_l1.cpp        | 164 +++++++++++++++++++-------------
 tests/test_dbscan_grid2d_l1.cpp |   6 ++
 4 files changed, 159 insertions(+), 68 deletions(-)
 create mode 100644 include/perf_timer.h

diff --git a/include/dbscan_grid2d_l1.h b/include/dbscan_grid2d_l1.h
index 8f18b7d..8c51190 100644
--- a/include/dbscan_grid2d_l1.h
+++ b/include/dbscan_grid2d_l1.h
@@ -4,6 +4,8 @@
 #include <cstdint>
 #include <vector>
 
+#include "perf_timer.h"
+
 namespace dbscan {
 
 struct DBSCANGrid2D_L1 {
@@ -15,7 +17,9 @@ struct DBSCANGrid2D_L1 {
   DBSCANGrid2D_L1(uint32_t eps_value, uint32_t min_samples_value, std::size_t num_threads_value = 0,
                   std::size_t chunk_size_value = 0);
 
-  [[nodiscard]] std::vector<int32_t> fit_predict(const uint32_t *x, const uint32_t *y, std::size_t count) const;
+  [[nodiscard]] std::vector<int32_t> fit_predict(const uint32_t *x, const uint32_t *y, std::size_t count);
+
+  PerfTiming perf_timing_;
 };
 
 } // namespace dbscan
diff --git a/include/perf_timer.h b/include/perf_timer.h
new file mode 100644
index 0000000..61e9881
--- /dev/null
+++ b/include/perf_timer.h
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <chrono>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace dbscan {
+
+struct PerfTimingEntry {
+  std::string label;
+  double duration_ms;
+};
+
+struct PerfTiming {
+  void clear() { entries_.clear(); }
+
+  void add(std::string label, double duration_ms) {
+    entries_.push_back({std::move(label), duration_ms});
+  }
+
+  [[nodiscard]] const std::vector<PerfTimingEntry> &entries() const { return entries_; }
+
+private:
+  std::vector<PerfTimingEntry> entries_;
+
+  friend class ScopedTimer;
+};
+
+class ScopedTimer {
+public:
+  ScopedTimer(std::string label, PerfTiming &sink)
+      : sink_(sink), label_(std::move(label)), start_(std::chrono::steady_clock::now()) {}
+
+  ScopedTimer(const ScopedTimer &) = delete;
+  ScopedTimer &operator=(const ScopedTimer &) = delete;
+
+  ~ScopedTimer() {
+    const auto end = std::chrono::steady_clock::now();
+    const auto elapsed = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(end - start_);
+    sink_.add(std::move(label_), elapsed.count());
+  }
+
+private:
+  PerfTiming &sink_;
+  std::string label_;
+  std::chrono::steady_clock::time_point start_;
+};
+
+} // namespace dbscan
+
diff --git a/src/dbscan_grid2d_l1.cpp b/src/dbscan_grid2d_l1.cpp
index 0489aae..1b324f2 100644
--- a/src/dbscan_grid2d_l1.cpp
+++ b/src/dbscan_grid2d_l1.cpp
@@ -9,11 +9,13 @@
 #include <vector>
 
 #include "parallel.hpp"
+#include "perf_timer.h"
 
 namespace dbscan {
 
 namespace {
 
+// Compact 2D cell coordinates into a sortable key so we can reuse std::sort rather than bespoke grid maps.
 constexpr uint64_t pack_cell(uint32_t ix, uint32_t iy) noexcept {
   return (static_cast<uint64_t>(ix) << 32U) | static_cast<uint64_t>(iy);
 }
@@ -22,6 +24,8 @@ constexpr uint32_t cell_of(uint32_t value, uint32_t cell_size) noexcept {
   return cell_size == 0 ? value : static_cast<uint32_t>(value / cell_size);
 }
 
+// Neighbors are explored by scanning adjacent grid cells so that the expensive L1 radius test only touches
+// candidates that share the precomputed bucket, keeping the branch predictor in our favor.
 template <typename Fn>
 void for_each_neighbor(uint32_t point_index, uint32_t eps, const uint32_t *x, const uint32_t *y,
                        const std::vector<uint32_t> &cell_x, const std::vector<uint32_t> &cell_y,
@@ -82,13 +86,16 @@ DBSCANGrid2D_L1::DBSCANGrid2D_L1(uint32_t eps_value, uint32_t min_samples_value,
     throw std::invalid_argument("min_samples must be greater than zero for DBSCANGrid2D_L1");
 }
 
-std::vector<int32_t> DBSCANGrid2D_L1::fit_predict(const uint32_t *x, const uint32_t *y, std::size_t count) const {
+std::vector<int32_t> DBSCANGrid2D_L1::fit_predict(const uint32_t *x, const uint32_t *y, std::size_t count) {
   if (x == nullptr || y == nullptr)
     throw std::invalid_argument("Input coordinate arrays must be non-null");
 
   if (count == 0)
     return {};
 
+  perf_timing_.clear();
+  ScopedTimer total_timer("total", perf_timing_);
+
   const uint32_t cell_size = eps;
 
   std::vector<uint32_t> cell_x(count);
@@ -98,41 +105,52 @@ std::vector<int32_t> DBSCANGrid2D_L1::fit_predict(const uint32_t *x, const uint3
   std::iota(ordered_indices.begin(), ordered_indices.end(), 0U);
 
   const std::size_t index_chunk = chunk_size == 0 ? 1024 : chunk_size;
-  utils::parallelize(0, count, num_threads, index_chunk,
-                     [&](std::size_t begin, std::size_t end) {
-    for (std::size_t i = begin; i < end; ++i) {
-      const uint32_t cx = cell_of(x[i], cell_size);
-      const uint32_t cy = cell_of(y[i], cell_size);
-      cell_x[i] = cx;
-      cell_y[i] = cy;
-      keys[i] = pack_cell(cx, cy);
-    }
-  });
+  {
+    // Precompute grid placements in parallel so later stages can stay read-only and avoid rehashing coordinates.
+    ScopedTimer timer("precompute_cells", perf_timing_);
+    utils::parallelize(0, count, num_threads, index_chunk, [&](std::size_t begin, std::size_t end) {
+      for (std::size_t i = begin; i < end; ++i) {
+        const uint32_t cx = cell_of(x[i], cell_size);
+        const uint32_t cy = cell_of(y[i], cell_size);
+        cell_x[i] = cx;
+        cell_y[i] = cy;
+        keys[i] = pack_cell(cx, cy);
+      }
+    });
+  }
 
-  std::sort(ordered_indices.begin(), ordered_indices.end(), [&](uint32_t lhs, uint32_t rhs) {
-    const uint64_t key_lhs = keys[lhs];
-    const uint64_t key_rhs = keys[rhs];
-    if (key_lhs == key_rhs)
-      return lhs < rhs;
-    return key_lhs < key_rhs;
-  });
+  {
+    // Sorting indices by packed cell ensures neighbors form contiguous spans which we can scan without lookups.
+    ScopedTimer timer("sort_indices", perf_timing_);
+    std::sort(ordered_indices.begin(), ordered_indices.end(), [&](uint32_t lhs, uint32_t rhs) {
+      const uint64_t key_lhs = keys[lhs];
+      const uint64_t key_rhs = keys[rhs];
+      if (key_lhs == key_rhs)
+        return lhs < rhs;
+      return key_lhs < key_rhs;
+    });
+  }
 
   std::vector<std::size_t> cell_offsets;
   cell_offsets.reserve(count + 1);
   std::vector<uint64_t> unique_keys;
   unique_keys.reserve(count);
 
-  std::size_t pos = 0;
-  while (pos < count) {
-    const uint64_t key = keys[ordered_indices[pos]];
-    unique_keys.push_back(key);
-    cell_offsets.push_back(pos);
-
-    do {
-      ++pos;
-    } while (pos < count && keys[ordered_indices[pos]] == key);
+  {
+    // Build a CSR-style view of the sorted cells so we can jump directly to the occupants of any neighboring bucket.
+    ScopedTimer timer("build_cell_offsets", perf_timing_);
+    std::size_t pos = 0;
+    while (pos < count) {
+      const uint64_t key = keys[ordered_indices[pos]];
+      unique_keys.push_back(key);
+      cell_offsets.push_back(pos);
+
+      do {
+        ++pos;
+      } while (pos < count && keys[ordered_indices[pos]] == key);
+    }
+    cell_offsets.push_back(count);
   }
-  cell_offsets.push_back(count);
 
   std::vector<int32_t> labels(count, -1);
   std::vector<uint8_t> is_core(count, 0U);
@@ -141,56 +159,68 @@ std::vector<int32_t> DBSCANGrid2D_L1::fit_predict(const uint32_t *x, const uint3
   const uint32_t min_samples_value = min_samples;
 
   const std::size_t core_chunk = chunk_size == 0 ? 512 : chunk_size;
-  utils::parallelize(0, count, num_threads, core_chunk,
-                     [&](std::size_t begin, std::size_t end) {
-    for (std::size_t idx = begin; idx < end; ++idx) {
-      uint32_t neighbor_count = 0;
-      for_each_neighbor(static_cast<uint32_t>(idx), eps_value, x, y, cell_x, cell_y, ordered_indices, cell_offsets,
-                        unique_keys, [&](uint32_t) {
-                          ++neighbor_count;
-                          return neighbor_count < min_samples_value;
-                        });
-
-      if (neighbor_count >= min_samples_value)
-        is_core[idx] = 1U;
-    }
-  });
+  {
+    // Core detection runs as an isolated pass so expansion can treat label writes as the only mutation, simplifying
+    // synchronization even when the search function is invoked concurrently.
+    ScopedTimer timer("core_detection", perf_timing_);
+    utils::parallelize(0, count, num_threads, core_chunk, [&](std::size_t begin, std::size_t end) {
+      for (std::size_t idx = begin; idx < end; ++idx) {
+        uint32_t neighbor_count = 0;
+        for_each_neighbor(static_cast<uint32_t>(idx), eps_value, x, y, cell_x, cell_y, ordered_indices, cell_offsets,
+                          unique_keys, [&](uint32_t) {
+                            ++neighbor_count;
+                            return neighbor_count < min_samples_value;
+                          });
+
+        if (neighbor_count >= min_samples_value)
+          is_core[idx] = 1U;
+      }
+    });
+  }
 
+  // Stack-based expansion keeps the cluster growth iterative, avoiding recursion while allowing work reuse.
   std::vector<uint32_t> stack;
   stack.reserve(count);
+
+  // Recycle a neighbor buffer per cluster to amortize allocations across large components.
   std::vector<uint32_t> neighbor_buffer;
   neighbor_buffer.reserve(64);
 
   int32_t next_label = 0;
-  for (std::size_t i = 0; i < count; ++i) {
-    if (!is_core[i] || labels[i] != -1)
-      continue;
+  {
+    ScopedTimer timer("cluster_expansion", perf_timing_);
+    for (std::size_t i = 0; i < count; ++i) {
+      if (!is_core[i] || labels[i] != -1)
+        continue;
 
-    labels[i] = next_label;
-    stack.clear();
-    stack.push_back(static_cast<uint32_t>(i));
-
-    while (!stack.empty()) {
-      const uint32_t current = stack.back();
-      stack.pop_back();
-
-      neighbor_buffer.clear();
-      for_each_neighbor(current, eps_value, x, y, cell_x, cell_y, ordered_indices, cell_offsets, unique_keys,
-                        [&](uint32_t neighbor) {
-                          neighbor_buffer.push_back(neighbor);
-                          return true;
-                        });
-
-      for (uint32_t neighbor : neighbor_buffer) {
-        if (labels[neighbor] == -1) {
-          labels[neighbor] = next_label;
-          if (is_core[neighbor])
-            stack.push_back(neighbor);
+      labels[i] = next_label;
+      stack.clear();
+      stack.push_back(static_cast<uint32_t>(i));
+
+      while (!stack.empty()) {
+        const uint32_t current = stack.back();
+        stack.pop_back();
+
+        neighbor_buffer.clear();
+        // Capture neighbors into a buffer first so every touch of labels happens after the search, keeping the
+        // expansion phase deterministic regardless of how for_each_neighbor yields matches.
+        for_each_neighbor(current, eps_value, x, y, cell_x, cell_y, ordered_indices, cell_offsets, unique_keys,
+                          [&](uint32_t neighbor) {
+                            neighbor_buffer.push_back(neighbor);
+                            return true;
+                          });
+
+        for (uint32_t neighbor : neighbor_buffer) {
+          if (labels[neighbor] == -1) {
+            labels[neighbor] = next_label;
+            if (is_core[neighbor])
+              stack.push_back(neighbor);
+          }
         }
       }
-    }
 
-    ++next_label;
+      ++next_label;
+    }
   }
 
   return labels;
diff --git a/tests/test_dbscan_grid2d_l1.cpp b/tests/test_dbscan_grid2d_l1.cpp
index bf6100a..7f977fa 100644
--- a/tests/test_dbscan_grid2d_l1.cpp
+++ b/tests/test_dbscan_grid2d_l1.cpp
@@ -9,6 +9,8 @@
 
 namespace {
 
+// The binary fixtures store Y first then X to mirror the grid compaction used in production; this loader
+// preserves that ordering so the test exercises identical memory access patterns as the runtime path.
 void load_fixture(const std::filesystem::path &data_path, const std::filesystem::path &truth_path,
                   std::vector<uint32_t> &x_out, std::vector<uint32_t> &y_out, std::vector<int32_t> &truth_out) {
   std::ifstream data_stream(data_path, std::ios::binary);
@@ -54,6 +56,8 @@ void load_fixture(const std::filesystem::path &data_path, const std::filesystem:
 } // namespace
 
 TEST_CASE("DBSCANGrid2D_L1 clusters dense neighbors", "[dbscan][grid_l1]") {
+  // Points are arranged so the Manhattan frontier just connects the first three but leaves the outlier isolated,
+  // validating that the L1 grid expansion covers diagonals without absorbing distant noise.
   const std::vector<uint32_t> x = {0, 1, 2, 100};
   const std::vector<uint32_t> y = {0, 0, 1, 200};
 
@@ -68,6 +72,7 @@ TEST_CASE("DBSCANGrid2D_L1 clusters dense neighbors", "[dbscan][grid_l1]") {
 }
 
 TEST_CASE("DBSCANGrid2D_L1 respects min_samples threshold", "[dbscan][grid_l1]") {
+  // Every point is deliberately spaced just beyond eps so we confirm the min_samples guard suppresses tiny clusters.
   const std::vector<uint32_t> coords = {0, 2, 4};
   dbscan::DBSCANGrid2D_L1 algo(3, 4);
   auto labels = algo.fit_predict(coords.data(), coords.data(), coords.size());
@@ -79,6 +84,7 @@ TEST_CASE("DBSCANGrid2D_L1 respects min_samples threshold", "[dbscan][grid_l1]")
 }
 
 TEST_CASE("DBSCANGrid2D_L1 matches fixture truth", "[dbscan][grid_l1]") {
+  // Fixture run mirrors the end-to-end validator to ensure the optimized grid path stays aligned with reference data.
   const std::filesystem::path root = std::filesystem::path{"tests"} / "data";
   const auto data_path = root / "dbscan_static_data.bin";
   const auto truth_path = root / "dbscan_static_truth.bin";

From 1773a490c4cb2808ef523a888e76654c64648971 Mon Sep 17 00:00:00 2001
From: Bo Lu <bo.lu@utexas.edu>
Date: Wed, 1 Oct 2025 07:51:47 -0700
Subject: [PATCH 09/12] log perf timing

---
 include/perf_timer.h               | 7 ++++++-
 tools/dbscan_dataset_validator.cpp | 6 ++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/include/perf_timer.h b/include/perf_timer.h
index 61e9881..7bc5843 100644
--- a/include/perf_timer.h
+++ b/include/perf_timer.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <chrono>
+#include <ostream>
 #include <string>
 #include <utility>
 #include <vector>
@@ -47,5 +48,9 @@ class ScopedTimer {
   std::chrono::steady_clock::time_point start_;
 };
 
-} // namespace dbscan
+inline std::ostream &operator<<(std::ostream &out, const PerfTimingEntry &entry) {
+  out << entry.label << ": " << entry.duration_ms << " ms";
+  return out;
+}
 
+} // namespace dbscan
diff --git a/tools/dbscan_dataset_validator.cpp b/tools/dbscan_dataset_validator.cpp
index 2c5a26e..684a48c 100644
--- a/tools/dbscan_dataset_validator.cpp
+++ b/tools/dbscan_dataset_validator.cpp
@@ -419,6 +419,12 @@ int main(int argc, char **argv) {
       const auto end = std::chrono::steady_clock::now();
       const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
       std::cout << " done in " << elapsed_ms << " ms" << std::endl;
+      if (!grid_algo.perf_timing_.entries().empty()) {
+        // Emit step-level timings so dataset runs surface bottlenecks without separate profiling passes.
+        std::cout << "[grid_l1] component timings" << std::endl;
+        for (const auto &entry : grid_algo.perf_timing_.entries())
+          std::cout << "  " << entry << std::endl;
+      }
       results.push_back({"grid_l1", metrics});
 
       if (options.mismatch_output_dir && !mismatches.empty()) {

From a3a99b42b3ff87a126e94a2601ff51fc59d6b3d5 Mon Sep 17 00:00:00 2001
From: Bo Lu <bo.lu@utexas.edu>
Date: Wed, 1 Oct 2025 08:25:54 -0700
Subject: [PATCH 10/12] multithreaded expansion

---
 include/dbscan_grid2d_l1.h         |  20 +-
 src/dbscan_grid2d_l1.cpp           | 327 +++++++++++++++++++++++++----
 tests/test_dbscan_grid2d_l1.cpp    |  37 +++-
 tools/dbscan_dataset_validator.cpp | 107 +++++++---
 4 files changed, 418 insertions(+), 73 deletions(-)

diff --git a/include/dbscan_grid2d_l1.h b/include/dbscan_grid2d_l1.h
index 8c51190..0a402f6 100644
--- a/include/dbscan_grid2d_l1.h
+++ b/include/dbscan_grid2d_l1.h
@@ -8,18 +8,36 @@
 
 namespace dbscan {
 
+enum class GridExpansionMode {
+  Sequential,
+  FrontierParallel,
+  UnionFind
+};
+
 struct DBSCANGrid2D_L1 {
   uint32_t eps;
   uint32_t min_samples;
   std::size_t num_threads;
   std::size_t chunk_size;
+  GridExpansionMode expansion_mode;
 
   DBSCANGrid2D_L1(uint32_t eps_value, uint32_t min_samples_value, std::size_t num_threads_value = 0,
-                  std::size_t chunk_size_value = 0);
+                  std::size_t chunk_size_value = 0, GridExpansionMode mode_value = GridExpansionMode::Sequential);
 
   [[nodiscard]] std::vector<int32_t> fit_predict(const uint32_t *x, const uint32_t *y, std::size_t count);
 
   PerfTiming perf_timing_;
 };
 
+template <GridExpansionMode Mode>
+struct DBSCANGrid2D_L1T : DBSCANGrid2D_L1 {
+  DBSCANGrid2D_L1T(uint32_t eps_value, uint32_t min_samples_value, std::size_t num_threads_value = 0,
+                   std::size_t chunk_size_value = 0)
+      : DBSCANGrid2D_L1(eps_value, min_samples_value, num_threads_value, chunk_size_value, Mode) {}
+};
+
+using DBSCANGrid2D_L1Frontier = DBSCANGrid2D_L1T<GridExpansionMode::FrontierParallel>;
+using DBSCANGrid2D_L1UnionFind = DBSCANGrid2D_L1T<GridExpansionMode::UnionFind>;
+
 } // namespace dbscan
+
diff --git a/src/dbscan_grid2d_l1.cpp b/src/dbscan_grid2d_l1.cpp
index 1b324f2..f412207 100644
--- a/src/dbscan_grid2d_l1.cpp
+++ b/src/dbscan_grid2d_l1.cpp
@@ -1,10 +1,13 @@
 #include "dbscan_grid2d_l1.h"
 
 #include <algorithm>
+#include <atomic>
 #include <cstddef>
 #include <cstdint>
+#include <limits>
 #include <numeric>
 #include <stdexcept>
+#include <mutex>
 #include <utility>
 #include <vector>
 
@@ -75,11 +78,269 @@ void for_each_neighbor(uint32_t point_index, uint32_t eps, const uint32_t *x, co
   }
 }
 
+struct ExpansionContext {
+  const uint32_t *x;
+  const uint32_t *y;
+  std::size_t count;
+  uint32_t eps;
+  uint32_t min_samples;
+  const std::vector<uint32_t> &cell_x;
+  const std::vector<uint32_t> &cell_y;
+  const std::vector<uint32_t> &ordered_indices;
+  const std::vector<std::size_t> &cell_offsets;
+  const std::vector<uint64_t> &unique_keys;
+  const std::vector<uint8_t> &is_core;
+  std::size_t num_threads;
+  std::size_t chunk_size;
+};
+
+void sequential_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels) {
+  std::vector<uint32_t> stack;
+  stack.reserve(ctx.count);
+  std::vector<uint32_t> neighbor_buffer;
+  neighbor_buffer.reserve(64);
+
+  int32_t next_label = 0;
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (!ctx.is_core[i] || labels[i] != -1)
+      continue;
+
+    labels[i] = next_label;
+    stack.clear();
+    stack.push_back(static_cast<uint32_t>(i));
+
+    while (!stack.empty()) {
+      const uint32_t current = stack.back();
+      stack.pop_back();
+
+      neighbor_buffer.clear();
+      // Capture neighbors into a buffer first so every touch of labels happens after the search, keeping the
+      // expansion phase deterministic regardless of how for_each_neighbor yields matches.
+      for_each_neighbor(current, ctx.eps, ctx.x, ctx.y, ctx.cell_x, ctx.cell_y, ctx.ordered_indices, ctx.cell_offsets,
+                        ctx.unique_keys, [&](uint32_t neighbor) {
+                          neighbor_buffer.push_back(neighbor);
+                          return true;
+                        });
+
+      for (uint32_t neighbor : neighbor_buffer) {
+        if (labels[neighbor] == -1) {
+          labels[neighbor] = next_label;
+          if (ctx.is_core[neighbor])
+            stack.push_back(neighbor);
+        }
+      }
+    }
+
+    ++next_label;
+  }
+}
+
+void frontier_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels);
+
+void union_find_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels);
+
+void frontier_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels) {
+  std::vector<std::atomic<int32_t>> shared_labels(ctx.count);
+  for (std::size_t i = 0; i < ctx.count; ++i)
+    shared_labels[i].store(labels[i], std::memory_order_relaxed);
+
+  int32_t next_label = 0;
+  std::vector<uint32_t> frontier;
+  frontier.reserve(256);
+
+  const std::size_t frontier_chunk = ctx.chunk_size == 0 ? 64 : ctx.chunk_size;
+
+  for (std::size_t seed = 0; seed < ctx.count; ++seed) {
+    if (!ctx.is_core[seed] || shared_labels[seed].load(std::memory_order_acquire) != -1)
+      continue;
+
+    const int32_t label = next_label++;
+    shared_labels[seed].store(label, std::memory_order_release);
+    frontier.clear();
+    frontier.push_back(static_cast<uint32_t>(seed));
+
+    while (!frontier.empty()) {
+      std::vector<uint32_t> next_frontier;
+      std::mutex next_mutex;
+
+      utils::parallelize(0, frontier.size(), ctx.num_threads, frontier_chunk, [&](std::size_t begin, std::size_t end) {
+        std::vector<uint32_t> local_next;
+        local_next.reserve(32);
+        std::vector<uint32_t> neighbor_buffer;
+        neighbor_buffer.reserve(64);
+
+        for (std::size_t idx = begin; idx < end; ++idx) {
+          const uint32_t current = frontier[idx];
+
+          neighbor_buffer.clear();
+          for_each_neighbor(current, ctx.eps, ctx.x, ctx.y, ctx.cell_x, ctx.cell_y, ctx.ordered_indices,
+                            ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) {
+                              neighbor_buffer.push_back(neighbor);
+                              return true;
+                            });
+
+          for (uint32_t neighbor : neighbor_buffer) {
+            if (ctx.is_core[neighbor]) {
+              int32_t expected = -1;
+              if (shared_labels[neighbor].compare_exchange_strong(expected, label, std::memory_order_acq_rel))
+                local_next.push_back(neighbor);
+            } else {
+              int32_t expected = -1;
+              shared_labels[neighbor].compare_exchange_strong(expected, label, std::memory_order_acq_rel);
+            }
+          }
+        }
+
+        if (!local_next.empty()) {
+          std::sort(local_next.begin(), local_next.end());
+          local_next.erase(std::unique(local_next.begin(), local_next.end()), local_next.end());
+          std::lock_guard<std::mutex> lock(next_mutex);
+          next_frontier.insert(next_frontier.end(), local_next.begin(), local_next.end());
+        }
+      });
+
+      if (next_frontier.empty())
+        break;
+
+      std::sort(next_frontier.begin(), next_frontier.end());
+      next_frontier.erase(std::unique(next_frontier.begin(), next_frontier.end()), next_frontier.end());
+      frontier.swap(next_frontier);
+    }
+  }
+
+  for (std::size_t i = 0; i < ctx.count; ++i)
+    labels[i] = shared_labels[i].load(std::memory_order_relaxed);
+}
+
+void union_find_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels) {
+  constexpr uint32_t invalid = std::numeric_limits<uint32_t>::max();
+  std::vector<std::atomic<uint32_t>> parents(ctx.count);
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (ctx.is_core[i])
+      parents[i].store(static_cast<uint32_t>(i), std::memory_order_relaxed);
+    else
+      parents[i].store(invalid, std::memory_order_relaxed);
+  }
+
+  auto find_root = [&](uint32_t node) {
+    uint32_t parent = parents[node].load(std::memory_order_acquire);
+    if (parent == invalid)
+      return invalid;
+
+    while (true) {
+      uint32_t grandparent = parents[parent].load(std::memory_order_acquire);
+      if (grandparent == parent) {
+        if (parent != node)
+          parents[node].store(parent, std::memory_order_release);
+        return parent;
+      }
+      parents[node].compare_exchange_strong(parent, grandparent, std::memory_order_acq_rel);
+      node = parent;
+      parent = parents[node].load(std::memory_order_acquire);
+      if (parent == invalid)
+        return invalid;
+    }
+  };
+
+  auto unite = [&](uint32_t a, uint32_t b) {
+    while (true) {
+      a = find_root(a);
+      b = find_root(b);
+      if (a == invalid || b == invalid || a == b)
+        return;
+
+      if (a < b) {
+        uint32_t expected = b;
+        if (parents[b].compare_exchange_strong(expected, a, std::memory_order_acq_rel))
+          return;
+      } else {
+        uint32_t expected = a;
+        if (parents[a].compare_exchange_strong(expected, b, std::memory_order_acq_rel))
+          return;
+      }
+    }
+  };
+
+  const std::size_t union_chunk = ctx.chunk_size == 0 ? 512 : ctx.chunk_size;
+  utils::parallelize(0, ctx.count, ctx.num_threads, union_chunk, [&](std::size_t begin, std::size_t end) {
+    for (std::size_t idx = begin; idx < end; ++idx) {
+      if (!ctx.is_core[idx])
+        continue;
+
+      for_each_neighbor(static_cast<uint32_t>(idx), ctx.eps, ctx.x, ctx.y, ctx.cell_x, ctx.cell_y, ctx.ordered_indices,
+                        ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) {
+                          if (ctx.is_core[neighbor])
+                            unite(static_cast<uint32_t>(idx), neighbor);
+                          return true;
+                        });
+    }
+  });
+
+  std::vector<uint32_t> root_for_point(ctx.count, invalid);
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (ctx.is_core[i])
+      root_for_point[i] = find_root(static_cast<uint32_t>(i));
+  }
+
+  std::vector<uint32_t> component_min(ctx.count, invalid);
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (!ctx.is_core[i])
+      continue;
+    const uint32_t root = root_for_point[i];
+    if (root == invalid)
+      continue;
+    if (component_min[root] > i)
+      component_min[root] = static_cast<uint32_t>(i);
+  }
+
+  std::vector<std::pair<uint32_t, uint32_t>> components;
+  components.reserve(ctx.count);
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (component_min[i] != invalid)
+      components.emplace_back(component_min[i], static_cast<uint32_t>(i));
+  }
+
+  std::sort(components.begin(), components.end());
+
+  std::vector<int32_t> root_label(ctx.count, -1);
+  int32_t next_label = 0;
+  for (const auto &[min_index, root] : components) {
+    (void)min_index;
+    root_label[root] = next_label++;
+  }
+
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (!ctx.is_core[i])
+      continue;
+    const uint32_t root = root_for_point[i];
+    if (root == invalid)
+      continue;
+    labels[i] = root_label[root];
+  }
+
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (ctx.is_core[i])
+      continue;
+
+    int32_t best_label = -1;
+    for_each_neighbor(static_cast<uint32_t>(i), ctx.eps, ctx.x, ctx.y, ctx.cell_x, ctx.cell_y, ctx.ordered_indices,
+                      ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) {
+                        if (!ctx.is_core[neighbor])
+                          return true;
+                        const int32_t candidate = labels[neighbor];
+                        if (candidate != -1 && (best_label == -1 || candidate < best_label))
+                          best_label = candidate;
+                        return true;
+                      });
+    labels[i] = best_label;
+  }
+}
 } // namespace
 
 DBSCANGrid2D_L1::DBSCANGrid2D_L1(uint32_t eps_value, uint32_t min_samples_value, std::size_t num_threads_value,
-                                 std::size_t chunk_size_value)
-    : eps(eps_value), min_samples(min_samples_value), num_threads(num_threads_value), chunk_size(chunk_size_value) {
+                                 std::size_t chunk_size_value, GridExpansionMode mode_value)
+    : eps(eps_value), min_samples(min_samples_value), num_threads(num_threads_value), chunk_size(chunk_size_value),
+      expansion_mode(mode_value) {
   if (eps == 0)
     throw std::invalid_argument("eps must be greater than zero for DBSCANGrid2D_L1");
   if (min_samples == 0)
@@ -178,48 +439,32 @@ std::vector<int32_t> DBSCANGrid2D_L1::fit_predict(const uint32_t *x, const uint3
     });
   }
 
-  // Stack-based expansion keeps the cluster growth iterative, avoiding recursion while allowing work reuse.
-  std::vector<uint32_t> stack;
-  stack.reserve(count);
-
-  // Recycle a neighbor buffer per cluster to amortize allocations across large components.
-  std::vector<uint32_t> neighbor_buffer;
-  neighbor_buffer.reserve(64);
+  const ExpansionContext context{x,
+                                 y,
+                                 count,
+                                 eps_value,
+                                 min_samples_value,
+                                 cell_x,
+                                 cell_y,
+                                 ordered_indices,
+                                 cell_offsets,
+                                 unique_keys,
+                                 is_core,
+                                 num_threads,
+                                 chunk_size};
 
-  int32_t next_label = 0;
   {
     ScopedTimer timer("cluster_expansion", perf_timing_);
-    for (std::size_t i = 0; i < count; ++i) {
-      if (!is_core[i] || labels[i] != -1)
-        continue;
-
-      labels[i] = next_label;
-      stack.clear();
-      stack.push_back(static_cast<uint32_t>(i));
-
-      while (!stack.empty()) {
-        const uint32_t current = stack.back();
-        stack.pop_back();
-
-        neighbor_buffer.clear();
-        // Capture neighbors into a buffer first so every touch of labels happens after the search, keeping the
-        // expansion phase deterministic regardless of how for_each_neighbor yields matches.
-        for_each_neighbor(current, eps_value, x, y, cell_x, cell_y, ordered_indices, cell_offsets, unique_keys,
-                          [&](uint32_t neighbor) {
-                            neighbor_buffer.push_back(neighbor);
-                            return true;
-                          });
-
-        for (uint32_t neighbor : neighbor_buffer) {
-          if (labels[neighbor] == -1) {
-            labels[neighbor] = next_label;
-            if (is_core[neighbor])
-              stack.push_back(neighbor);
-          }
-        }
-      }
-
-      ++next_label;
+    switch (expansion_mode) {
+    case GridExpansionMode::Sequential:
+      sequential_expand(context, labels);
+      break;
+    case GridExpansionMode::FrontierParallel:
+      frontier_expand(context, labels);
+      break;
+    case GridExpansionMode::UnionFind:
+      union_find_expand(context, labels);
+      break;
     }
   }
 
diff --git a/tests/test_dbscan_grid2d_l1.cpp b/tests/test_dbscan_grid2d_l1.cpp
index 7f977fa..b08b0d0 100644
--- a/tests/test_dbscan_grid2d_l1.cpp
+++ b/tests/test_dbscan_grid2d_l1.cpp
@@ -9,6 +9,17 @@
 
 namespace {
 
+std::filesystem::path fixture_root() {
+  const std::filesystem::path candidates[] = {std::filesystem::path{"tests"} / "data",
+                                              std::filesystem::path{".."} / "tests" / "data",
+                                              std::filesystem::path{".."} / ".." / "tests" / "data"};
+  for (const auto &candidate : candidates) {
+    if (std::filesystem::exists(candidate))
+      return candidate;
+  }
+  return candidates[0];
+}
+
 // The binary fixtures store Y first then X to mirror the grid compaction used in production; this loader
 // preserves that ordering so the test exercises identical memory access patterns as the runtime path.
 void load_fixture(const std::filesystem::path &data_path, const std::filesystem::path &truth_path,
@@ -85,7 +96,7 @@ TEST_CASE("DBSCANGrid2D_L1 respects min_samples threshold", "[dbscan][grid_l1]")
 
 TEST_CASE("DBSCANGrid2D_L1 matches fixture truth", "[dbscan][grid_l1]") {
   // Fixture run mirrors the end-to-end validator to ensure the optimized grid path stays aligned with reference data.
-  const std::filesystem::path root = std::filesystem::path{"tests"} / "data";
+  const std::filesystem::path root = fixture_root();
   const auto data_path = root / "dbscan_static_data.bin";
   const auto truth_path = root / "dbscan_static_truth.bin";
 
@@ -99,4 +110,28 @@ TEST_CASE("DBSCANGrid2D_L1 matches fixture truth", "[dbscan][grid_l1]") {
 
   REQUIRE(labels.size() == truth.size());
   REQUIRE(labels == truth);
+
+  dbscan::DBSCANGrid2D_L1Frontier frontier_algo(10, 3, 4);
+  auto frontier_labels = frontier_algo.fit_predict(x.data(), y.data(), x.size());
+  REQUIRE(frontier_labels == truth);
+
+  dbscan::DBSCANGrid2D_L1UnionFind union_algo(10, 3, 4);
+  auto union_labels = union_algo.fit_predict(x.data(), y.data(), x.size());
+  REQUIRE(union_labels == truth);
+}
+
+TEST_CASE("DBSCANGrid2D_L1 parallel variants align with sequential", "[dbscan][grid_l1][parallel]") {
+  const std::vector<uint32_t> x = {0, 1, 2, 5, 40};
+  const std::vector<uint32_t> y = {0, 0, 1, 4, 80};
+
+  dbscan::DBSCANGrid2D_L1 sequential(6, 3);
+  const auto expected = sequential.fit_predict(x.data(), y.data(), x.size());
+
+  dbscan::DBSCANGrid2D_L1Frontier frontier(6, 3, 4);
+  auto frontier_labels = frontier.fit_predict(x.data(), y.data(), x.size());
+  REQUIRE(frontier_labels == expected);
+
+  dbscan::DBSCANGrid2D_L1UnionFind union_algo(6, 3, 4);
+  auto union_labels = union_algo.fit_predict(x.data(), y.data(), x.size());
+  REQUIRE(union_labels == expected);
 }
diff --git a/tools/dbscan_dataset_validator.cpp b/tools/dbscan_dataset_validator.cpp
index 684a48c..88ad292 100644
--- a/tools/dbscan_dataset_validator.cpp
+++ b/tools/dbscan_dataset_validator.cpp
@@ -29,13 +29,15 @@ struct Options {
   bool run_baseline{true};
   bool run_optimized{true};
   bool run_grid_l1{false};
+  std::vector<dbscan::GridExpansionMode> grid_modes;
   std::optional<std::filesystem::path> mismatch_output_dir;
 };
 
 void print_usage(const char *program_name) {
   std::cout << "Usage: " << program_name
             << " [--data <data.bin>] [--truth <truth.bin>] [--eps <value>] [--min-samples <value>]"
-            << " [--impl baseline|optimized|grid|both|all] [--dump-mismatches <directory>]\n";
+            << " [--impl baseline|optimized|grid|grid_frontier|grid_union|grid_all|both|all]"
+            << " [--dump-mismatches <directory>]\n";
 }
 
 Options parse_arguments(int argc, char **argv) {
@@ -65,30 +67,54 @@ Options parse_arguments(int argc, char **argv) {
       options.min_samples = static_cast<int32_t>(std::stoi(argv[++i]));
     } else if (arg == "--impl") {
       if (i + 1 >= argc)
-        throw std::invalid_argument("--impl expects one of: baseline, optimized, grid, both, all");
+        throw std::invalid_argument(
+            "--impl expects one of: baseline, optimized, grid, grid_frontier, grid_union, grid_all, both, all");
       const std::string value{argv[++i]};
       if (value == "baseline") {
         options.run_baseline = true;
         options.run_optimized = false;
         options.run_grid_l1 = false;
+        options.grid_modes.clear();
       } else if (value == "optimized") {
         options.run_baseline = false;
         options.run_optimized = true;
         options.run_grid_l1 = false;
+        options.grid_modes.clear();
       } else if (value == "grid" || value == "grid_l1") {
         options.run_baseline = false;
         options.run_optimized = false;
         options.run_grid_l1 = true;
+        options.grid_modes = {dbscan::GridExpansionMode::Sequential};
+      } else if (value == "grid_frontier") {
+        options.run_baseline = false;
+        options.run_optimized = false;
+        options.run_grid_l1 = true;
+        options.grid_modes = {dbscan::GridExpansionMode::FrontierParallel};
+      } else if (value == "grid_union" || value == "grid_union_find") {
+        options.run_baseline = false;
+        options.run_optimized = false;
+        options.run_grid_l1 = true;
+        options.grid_modes = {dbscan::GridExpansionMode::UnionFind};
+      } else if (value == "grid_all") {
+        options.run_baseline = false;
+        options.run_optimized = false;
+        options.run_grid_l1 = true;
+        options.grid_modes = {dbscan::GridExpansionMode::Sequential, dbscan::GridExpansionMode::FrontierParallel,
+                              dbscan::GridExpansionMode::UnionFind};
       } else if (value == "both") {
         options.run_baseline = true;
         options.run_optimized = true;
         options.run_grid_l1 = false;
+        options.grid_modes.clear();
       } else if (value == "all") {
         options.run_baseline = true;
         options.run_optimized = true;
         options.run_grid_l1 = true;
+        if (options.grid_modes.empty())
+          options.grid_modes = {dbscan::GridExpansionMode::Sequential};
       } else {
-        throw std::invalid_argument("--impl expects one of: baseline, optimized, grid, both, all");
+        throw std::invalid_argument(
+            "--impl expects one of: baseline, optimized, grid, grid_frontier, grid_union, grid_all, both, all");
       }
     } else if (arg == "--dump-mismatches") {
       if (i + 1 >= argc)
@@ -103,6 +129,8 @@ Options parse_arguments(int argc, char **argv) {
     throw std::invalid_argument("--eps must be positive");
   if (options.min_samples <= 0)
     throw std::invalid_argument("--min-samples must be positive");
+  if (options.run_grid_l1 && options.grid_modes.empty())
+    options.grid_modes = {dbscan::GridExpansionMode::Sequential};
 
   return options;
 }
@@ -402,6 +430,22 @@ int main(int argc, char **argv) {
       }
     }
 
+    auto grid_variant_info = [](dbscan::GridExpansionMode mode) {
+      struct VariantInfo {
+        std::string label;
+        std::string slug;
+      };
+      switch (mode) {
+      case dbscan::GridExpansionMode::Sequential:
+        return VariantInfo{"grid_l1", "grid_l1"};
+      case dbscan::GridExpansionMode::FrontierParallel:
+        return VariantInfo{"grid_l1_frontier", "grid_l1_frontier"};
+      case dbscan::GridExpansionMode::UnionFind:
+        return VariantInfo{"grid_l1_union", "grid_l1_union"};
+      }
+      return VariantInfo{"grid_l1", "grid_l1"};
+    };
+
     if (options.run_grid_l1) {
       const auto eps_int = static_cast<uint32_t>(std::llround(options.eps));
       if (std::fabs(options.eps - static_cast<double>(eps_int)) > 1e-6) {
@@ -409,33 +453,36 @@ int main(int argc, char **argv) {
       }
       if (x_coords.size() != y_coords.size())
         throw std::runtime_error("Mismatch between x and y coordinate counts");
-
-      std::cout << "\n[grid_l1] Running clustering..." << std::flush;
-      const auto start = std::chrono::steady_clock::now();
-      dbscan::DBSCANGrid2D_L1 grid_algo(eps_int, static_cast<uint32_t>(options.min_samples));
-      const auto labels = grid_algo.fit_predict(x_coords.data(), y_coords.data(), x_coords.size());
-      std::vector<std::size_t> mismatches;
-      const auto metrics = evaluate(labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr);
-      const auto end = std::chrono::steady_clock::now();
-      const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
-      std::cout << " done in " << elapsed_ms << " ms" << std::endl;
-      if (!grid_algo.perf_timing_.entries().empty()) {
-        // Emit step-level timings so dataset runs surface bottlenecks without separate profiling passes.
-        std::cout << "[grid_l1] component timings" << std::endl;
-        for (const auto &entry : grid_algo.perf_timing_.entries())
-          std::cout << "  " << entry << std::endl;
-      }
-      results.push_back({"grid_l1", metrics});
-
-      if (options.mismatch_output_dir && !mismatches.empty()) {
-        std::filesystem::create_directories(*options.mismatch_output_dir);
-        auto file_path = *options.mismatch_output_dir / "grid_l1_mismatches.txt";
-        std::ofstream out(file_path);
-        if (!out)
-          throw std::runtime_error("Failed to open mismatch output file: " + file_path.string());
-        for (std::size_t index : mismatches)
-          out << index << '\n';
-        std::cout << "[grid_l1] Wrote " << mismatches.size() << " mismatches to " << file_path << "\n";
+      for (auto mode : options.grid_modes) {
+        const auto info = grid_variant_info(mode);
+        std::cout << "\n[" << info.label << "] Running clustering..." << std::flush;
+        const auto start = std::chrono::steady_clock::now();
+        dbscan::DBSCANGrid2D_L1 grid_algo(eps_int, static_cast<uint32_t>(options.min_samples), 0, 0, mode);
+        const auto labels = grid_algo.fit_predict(x_coords.data(), y_coords.data(), x_coords.size());
+        std::vector<std::size_t> mismatches;
+        const auto metrics = evaluate(labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr);
+        const auto end = std::chrono::steady_clock::now();
+        const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+        std::cout << " done in " << elapsed_ms << " ms" << std::endl;
+        if (!grid_algo.perf_timing_.entries().empty()) {
+          // Emit step-level timings so dataset runs surface bottlenecks without separate profiling passes.
+          std::cout << "[" << info.label << "] component timings" << std::endl;
+          for (const auto &entry : grid_algo.perf_timing_.entries())
+            std::cout << "  " << entry << std::endl;
+        }
+        results.push_back({info.label, metrics});
+
+        if (options.mismatch_output_dir && !mismatches.empty()) {
+          std::filesystem::create_directories(*options.mismatch_output_dir);
+          auto file_path = *options.mismatch_output_dir / (info.slug + "_mismatches.txt");
+          std::ofstream out(file_path);
+          if (!out)
+            throw std::runtime_error("Failed to open mismatch output file: " + file_path.string());
+          for (std::size_t index : mismatches)
+            out << index << '\n';
+          std::cout << "[" << info.label << "] Wrote " << mismatches.size() << " mismatches to " << file_path
+                    << "\n";
+        }
       }
     }
 

From 82d7fef1954d009455be15e3f57f9ce24a7ba93a Mon Sep 17 00:00:00 2001
From: Bo Lu <bo.lu@utexas.edu>
Date: Thu, 2 Oct 2025 08:15:05 -0700
Subject: [PATCH 11/12] commit

---
 CMakeLists.txt                     |   1 +
 benchmark/benchmark_dbscan.cpp     |   8 +-
 include/dbscan_grid2d_l1.h         |  36 +--
 src/dbscan_grid2d_l1.cpp           | 477 +----------------------------
 src/dbscan_grid2d_l1_aos.cpp       |  25 ++
 src/dbscan_grid2d_l1_impl.hpp      | 459 +++++++++++++++++++++++++++
 tests/test_dbscan_grid2d_l1.cpp    |  67 ++--
 tools/dbscan_dataset_validator.cpp |  16 +-
 8 files changed, 568 insertions(+), 521 deletions(-)
 create mode 100644 src/dbscan_grid2d_l1_aos.cpp
 create mode 100644 src/dbscan_grid2d_l1_impl.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bcc6c50..0ed6314 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,6 +20,7 @@ add_library(dbscan STATIC
     src/dbscan.cpp
     src/dbscan_optimized.cpp
     src/dbscan_grid2d_l1.cpp
+    src/dbscan_grid2d_l1_aos.cpp
 )
 
 target_include_directories(dbscan
diff --git a/benchmark/benchmark_dbscan.cpp b/benchmark/benchmark_dbscan.cpp
index 95fc230..55b0584 100644
--- a/benchmark/benchmark_dbscan.cpp
+++ b/benchmark/benchmark_dbscan.cpp
@@ -106,9 +106,11 @@ int main() {
       const std::string label = "grid-l1 " + std::to_string(total_points) + " pts threads=" +
                                 (thread_count == 0 ? std::string("auto") : std::to_string(thread_count));
       bench.run(label, [&]() {
-        dbscan::DBSCANGrid2D_L1 algo(eps, min_samples, thread_count);
-        auto labels = algo.fit_predict(dataset.x.data(), dataset.y.data(), total_points);
-        ankerl::nanobench::doNotOptimizeAway(labels);
+        dbscan::DBSCANGrid2DL1Params params{eps, min_samples};
+        params.num_threads = thread_count;
+        auto result =
+            dbscan::dbscan_grid2d_l1(dataset.x.data(), 1, dataset.y.data(), 1, total_points, params);
+        ankerl::nanobench::doNotOptimizeAway(result.labels);
       });
     }
   }
diff --git a/include/dbscan_grid2d_l1.h b/include/dbscan_grid2d_l1.h
index 0a402f6..c3e91c1 100644
--- a/include/dbscan_grid2d_l1.h
+++ b/include/dbscan_grid2d_l1.h
@@ -14,30 +14,30 @@ enum class GridExpansionMode {
   UnionFind
 };
 
-struct DBSCANGrid2D_L1 {
+struct DBSCANGrid2DL1Params {
   uint32_t eps;
   uint32_t min_samples;
-  std::size_t num_threads;
-  std::size_t chunk_size;
-  GridExpansionMode expansion_mode;
-
-  DBSCANGrid2D_L1(uint32_t eps_value, uint32_t min_samples_value, std::size_t num_threads_value = 0,
-                  std::size_t chunk_size_value = 0, GridExpansionMode mode_value = GridExpansionMode::Sequential);
-
-  [[nodiscard]] std::vector<int32_t> fit_predict(const uint32_t *x, const uint32_t *y, std::size_t count);
+  std::size_t num_threads = 0;
+  std::size_t chunk_size = 0;
+};
 
-  PerfTiming perf_timing_;
+struct Grid2DPoint {
+  uint32_t x;
+  uint32_t y;
 };
 
-template <GridExpansionMode Mode>
-struct DBSCANGrid2D_L1T : DBSCANGrid2D_L1 {
-  DBSCANGrid2D_L1T(uint32_t eps_value, uint32_t min_samples_value, std::size_t num_threads_value = 0,
-                   std::size_t chunk_size_value = 0)
-      : DBSCANGrid2D_L1(eps_value, min_samples_value, num_threads_value, chunk_size_value, Mode) {}
+struct DBSCANGrid2DL1Result {
+  std::vector<int32_t> labels;
+  PerfTiming perf_timing;
 };
 
-using DBSCANGrid2D_L1Frontier = DBSCANGrid2D_L1T<GridExpansionMode::FrontierParallel>;
-using DBSCANGrid2D_L1UnionFind = DBSCANGrid2D_L1T<GridExpansionMode::UnionFind>;
+[[nodiscard]] DBSCANGrid2DL1Result dbscan_grid2d_l1(const uint32_t *x, std::size_t x_stride, const uint32_t *y,
+                                                    std::size_t y_stride, std::size_t count,
+                                                    const DBSCANGrid2DL1Params &params,
+                                                    GridExpansionMode mode = GridExpansionMode::Sequential);
 
-} // namespace dbscan
+[[nodiscard]] DBSCANGrid2DL1Result dbscan_grid2d_l1_aos(const Grid2DPoint *points, std::size_t count,
+                                                        const DBSCANGrid2DL1Params &params,
+                                                        GridExpansionMode mode = GridExpansionMode::Sequential);
 
+} // namespace dbscan
diff --git a/src/dbscan_grid2d_l1.cpp b/src/dbscan_grid2d_l1.cpp
index f412207..8e3cdc6 100644
--- a/src/dbscan_grid2d_l1.cpp
+++ b/src/dbscan_grid2d_l1.cpp
@@ -1,474 +1,23 @@
 #include "dbscan_grid2d_l1.h"
 
-#include <algorithm>
-#include <atomic>
-#include <cstddef>
-#include <cstdint>
-#include <limits>
-#include <numeric>
 #include <stdexcept>
-#include <mutex>
-#include <utility>
-#include <vector>
 
-#include "parallel.hpp"
-#include "perf_timer.h"
+#include "dbscan_grid2d_l1_impl.hpp"
 
 namespace dbscan {
 
-namespace {
-
-// Compact 2D cell coordinates into a sortable key so we can reuse std::sort rather than bespoke grid maps.
-constexpr uint64_t pack_cell(uint32_t ix, uint32_t iy) noexcept {
-  return (static_cast<uint64_t>(ix) << 32U) | static_cast<uint64_t>(iy);
-}
-
-constexpr uint32_t cell_of(uint32_t value, uint32_t cell_size) noexcept {
-  return cell_size == 0 ? value : static_cast<uint32_t>(value / cell_size);
-}
-
-// Neighbors are explored by scanning adjacent grid cells so that the expensive L1 radius test only touches
-// candidates that share the precomputed bucket, keeping the branch predictor in our favor.
-template <typename Fn>
-void for_each_neighbor(uint32_t point_index, uint32_t eps, const uint32_t *x, const uint32_t *y,
-                       const std::vector<uint32_t> &cell_x, const std::vector<uint32_t> &cell_y,
-                       const std::vector<uint32_t> &ordered_indices, const std::vector<size_t> &cell_offsets,
-                       const std::vector<uint64_t> &unique_keys, Fn &&fn) {
-
-  const uint32_t base_cx = cell_x[point_index];
-  const uint32_t base_cy = cell_y[point_index];
-
-  for (int dx = -1; dx <= 1; ++dx) {
-    const int64_t nx = static_cast<int64_t>(base_cx) + dx;
-    if (nx < 0)
-      continue;
-
-    for (int dy = -1; dy <= 1; ++dy) {
-      const int64_t ny = static_cast<int64_t>(base_cy) + dy;
-      if (ny < 0)
-        continue;
-
-      const uint64_t key = pack_cell(static_cast<uint32_t>(nx), static_cast<uint32_t>(ny));
-      const auto it = std::lower_bound(unique_keys.begin(), unique_keys.end(), key);
-      if (it == unique_keys.end() || *it != key)
-        continue;
-
-      const std::size_t cell_idx = static_cast<std::size_t>(std::distance(unique_keys.begin(), it));
-      const std::size_t begin = cell_offsets[cell_idx];
-      const std::size_t end = cell_offsets[cell_idx + 1];
-
-      for (std::size_t pos = begin; pos < end; ++pos) {
-        const uint32_t neighbor_idx = ordered_indices[pos];
-
-        const uint32_t x_a = x[point_index];
-        const uint32_t y_a = y[point_index];
-        const uint32_t x_b = x[neighbor_idx];
-        const uint32_t y_b = y[neighbor_idx];
-
-        const uint32_t dx_abs = x_a > x_b ? x_a - x_b : x_b - x_a;
-        const uint32_t dy_abs = y_a > y_b ? y_a - y_b : y_b - y_a;
-        const uint64_t manhattan = static_cast<uint64_t>(dx_abs) + static_cast<uint64_t>(dy_abs);
-
-        if (manhattan <= static_cast<uint64_t>(eps)) {
-          if (!fn(neighbor_idx))
-            return;
-        }
-      }
-    }
-  }
-}
-
-struct ExpansionContext {
-  const uint32_t *x;
-  const uint32_t *y;
-  std::size_t count;
-  uint32_t eps;
-  uint32_t min_samples;
-  const std::vector<uint32_t> &cell_x;
-  const std::vector<uint32_t> &cell_y;
-  const std::vector<uint32_t> &ordered_indices;
-  const std::vector<std::size_t> &cell_offsets;
-  const std::vector<uint64_t> &unique_keys;
-  const std::vector<uint8_t> &is_core;
-  std::size_t num_threads;
-  std::size_t chunk_size;
-};
-
-void sequential_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels) {
-  std::vector<uint32_t> stack;
-  stack.reserve(ctx.count);
-  std::vector<uint32_t> neighbor_buffer;
-  neighbor_buffer.reserve(64);
-
-  int32_t next_label = 0;
-  for (std::size_t i = 0; i < ctx.count; ++i) {
-    if (!ctx.is_core[i] || labels[i] != -1)
-      continue;
-
-    labels[i] = next_label;
-    stack.clear();
-    stack.push_back(static_cast<uint32_t>(i));
-
-    while (!stack.empty()) {
-      const uint32_t current = stack.back();
-      stack.pop_back();
-
-      neighbor_buffer.clear();
-      // Capture neighbors into a buffer first so every touch of labels happens after the search, keeping the
-      // expansion phase deterministic regardless of how for_each_neighbor yields matches.
-      for_each_neighbor(current, ctx.eps, ctx.x, ctx.y, ctx.cell_x, ctx.cell_y, ctx.ordered_indices, ctx.cell_offsets,
-                        ctx.unique_keys, [&](uint32_t neighbor) {
-                          neighbor_buffer.push_back(neighbor);
-                          return true;
-                        });
-
-      for (uint32_t neighbor : neighbor_buffer) {
-        if (labels[neighbor] == -1) {
-          labels[neighbor] = next_label;
-          if (ctx.is_core[neighbor])
-            stack.push_back(neighbor);
-        }
-      }
-    }
-
-    ++next_label;
-  }
-}
-
-void frontier_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels);
-
-void union_find_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels);
-
-void frontier_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels) {
-  std::vector<std::atomic<int32_t>> shared_labels(ctx.count);
-  for (std::size_t i = 0; i < ctx.count; ++i)
-    shared_labels[i].store(labels[i], std::memory_order_relaxed);
-
-  int32_t next_label = 0;
-  std::vector<uint32_t> frontier;
-  frontier.reserve(256);
-
-  const std::size_t frontier_chunk = ctx.chunk_size == 0 ? 64 : ctx.chunk_size;
-
-  for (std::size_t seed = 0; seed < ctx.count; ++seed) {
-    if (!ctx.is_core[seed] || shared_labels[seed].load(std::memory_order_acquire) != -1)
-      continue;
-
-    const int32_t label = next_label++;
-    shared_labels[seed].store(label, std::memory_order_release);
-    frontier.clear();
-    frontier.push_back(static_cast<uint32_t>(seed));
-
-    while (!frontier.empty()) {
-      std::vector<uint32_t> next_frontier;
-      std::mutex next_mutex;
-
-      utils::parallelize(0, frontier.size(), ctx.num_threads, frontier_chunk, [&](std::size_t begin, std::size_t end) {
-        std::vector<uint32_t> local_next;
-        local_next.reserve(32);
-        std::vector<uint32_t> neighbor_buffer;
-        neighbor_buffer.reserve(64);
-
-        for (std::size_t idx = begin; idx < end; ++idx) {
-          const uint32_t current = frontier[idx];
-
-          neighbor_buffer.clear();
-          for_each_neighbor(current, ctx.eps, ctx.x, ctx.y, ctx.cell_x, ctx.cell_y, ctx.ordered_indices,
-                            ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) {
-                              neighbor_buffer.push_back(neighbor);
-                              return true;
-                            });
-
-          for (uint32_t neighbor : neighbor_buffer) {
-            if (ctx.is_core[neighbor]) {
-              int32_t expected = -1;
-              if (shared_labels[neighbor].compare_exchange_strong(expected, label, std::memory_order_acq_rel))
-                local_next.push_back(neighbor);
-            } else {
-              int32_t expected = -1;
-              shared_labels[neighbor].compare_exchange_strong(expected, label, std::memory_order_acq_rel);
-            }
-          }
-        }
-
-        if (!local_next.empty()) {
-          std::sort(local_next.begin(), local_next.end());
-          local_next.erase(std::unique(local_next.begin(), local_next.end()), local_next.end());
-          std::lock_guard<std::mutex> lock(next_mutex);
-          next_frontier.insert(next_frontier.end(), local_next.begin(), local_next.end());
-        }
-      });
-
-      if (next_frontier.empty())
-        break;
-
-      std::sort(next_frontier.begin(), next_frontier.end());
-      next_frontier.erase(std::unique(next_frontier.begin(), next_frontier.end()), next_frontier.end());
-      frontier.swap(next_frontier);
-    }
-  }
-
-  for (std::size_t i = 0; i < ctx.count; ++i)
-    labels[i] = shared_labels[i].load(std::memory_order_relaxed);
-}
-
-void union_find_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels) {
-  constexpr uint32_t invalid = std::numeric_limits<uint32_t>::max();
-  std::vector<std::atomic<uint32_t>> parents(ctx.count);
-  for (std::size_t i = 0; i < ctx.count; ++i) {
-    if (ctx.is_core[i])
-      parents[i].store(static_cast<uint32_t>(i), std::memory_order_relaxed);
-    else
-      parents[i].store(invalid, std::memory_order_relaxed);
-  }
-
-  auto find_root = [&](uint32_t node) {
-    uint32_t parent = parents[node].load(std::memory_order_acquire);
-    if (parent == invalid)
-      return invalid;
-
-    while (true) {
-      uint32_t grandparent = parents[parent].load(std::memory_order_acquire);
-      if (grandparent == parent) {
-        if (parent != node)
-          parents[node].store(parent, std::memory_order_release);
-        return parent;
-      }
-      parents[node].compare_exchange_strong(parent, grandparent, std::memory_order_acq_rel);
-      node = parent;
-      parent = parents[node].load(std::memory_order_acquire);
-      if (parent == invalid)
-        return invalid;
-    }
-  };
-
-  auto unite = [&](uint32_t a, uint32_t b) {
-    while (true) {
-      a = find_root(a);
-      b = find_root(b);
-      if (a == invalid || b == invalid || a == b)
-        return;
-
-      if (a < b) {
-        uint32_t expected = b;
-        if (parents[b].compare_exchange_strong(expected, a, std::memory_order_acq_rel))
-          return;
-      } else {
-        uint32_t expected = a;
-        if (parents[a].compare_exchange_strong(expected, b, std::memory_order_acq_rel))
-          return;
-      }
-    }
-  };
-
-  const std::size_t union_chunk = ctx.chunk_size == 0 ? 512 : ctx.chunk_size;
-  utils::parallelize(0, ctx.count, ctx.num_threads, union_chunk, [&](std::size_t begin, std::size_t end) {
-    for (std::size_t idx = begin; idx < end; ++idx) {
-      if (!ctx.is_core[idx])
-        continue;
-
-      for_each_neighbor(static_cast<uint32_t>(idx), ctx.eps, ctx.x, ctx.y, ctx.cell_x, ctx.cell_y, ctx.ordered_indices,
-                        ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) {
-                          if (ctx.is_core[neighbor])
-                            unite(static_cast<uint32_t>(idx), neighbor);
-                          return true;
-                        });
-    }
-  });
-
-  std::vector<uint32_t> root_for_point(ctx.count, invalid);
-  for (std::size_t i = 0; i < ctx.count; ++i) {
-    if (ctx.is_core[i])
-      root_for_point[i] = find_root(static_cast<uint32_t>(i));
-  }
-
-  std::vector<uint32_t> component_min(ctx.count, invalid);
-  for (std::size_t i = 0; i < ctx.count; ++i) {
-    if (!ctx.is_core[i])
-      continue;
-    const uint32_t root = root_for_point[i];
-    if (root == invalid)
-      continue;
-    if (component_min[root] > i)
-      component_min[root] = static_cast<uint32_t>(i);
-  }
-
-  std::vector<std::pair<uint32_t, uint32_t>> components;
-  components.reserve(ctx.count);
-  for (std::size_t i = 0; i < ctx.count; ++i) {
-    if (component_min[i] != invalid)
-      components.emplace_back(component_min[i], static_cast<uint32_t>(i));
-  }
-
-  std::sort(components.begin(), components.end());
-
-  std::vector<int32_t> root_label(ctx.count, -1);
-  int32_t next_label = 0;
-  for (const auto &[min_index, root] : components) {
-    (void)min_index;
-    root_label[root] = next_label++;
-  }
-
-  for (std::size_t i = 0; i < ctx.count; ++i) {
-    if (!ctx.is_core[i])
-      continue;
-    const uint32_t root = root_for_point[i];
-    if (root == invalid)
-      continue;
-    labels[i] = root_label[root];
-  }
-
-  for (std::size_t i = 0; i < ctx.count; ++i) {
-    if (ctx.is_core[i])
-      continue;
-
-    int32_t best_label = -1;
-    for_each_neighbor(static_cast<uint32_t>(i), ctx.eps, ctx.x, ctx.y, ctx.cell_x, ctx.cell_y, ctx.ordered_indices,
-                      ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) {
-                        if (!ctx.is_core[neighbor])
-                          return true;
-                        const int32_t candidate = labels[neighbor];
-                        if (candidate != -1 && (best_label == -1 || candidate < best_label))
-                          best_label = candidate;
-                        return true;
-                      });
-    labels[i] = best_label;
-  }
-}
-} // namespace
-
-DBSCANGrid2D_L1::DBSCANGrid2D_L1(uint32_t eps_value, uint32_t min_samples_value, std::size_t num_threads_value,
-                                 std::size_t chunk_size_value, GridExpansionMode mode_value)
-    : eps(eps_value), min_samples(min_samples_value), num_threads(num_threads_value), chunk_size(chunk_size_value),
-      expansion_mode(mode_value) {
-  if (eps == 0)
-    throw std::invalid_argument("eps must be greater than zero for DBSCANGrid2D_L1");
-  if (min_samples == 0)
-    throw std::invalid_argument("min_samples must be greater than zero for DBSCANGrid2D_L1");
-}
-
-std::vector<int32_t> DBSCANGrid2D_L1::fit_predict(const uint32_t *x, const uint32_t *y, std::size_t count) {
-  if (x == nullptr || y == nullptr)
-    throw std::invalid_argument("Input coordinate arrays must be non-null");
-
-  if (count == 0)
-    return {};
-
-  perf_timing_.clear();
-  ScopedTimer total_timer("total", perf_timing_);
-
-  const uint32_t cell_size = eps;
-
-  std::vector<uint32_t> cell_x(count);
-  std::vector<uint32_t> cell_y(count);
-  std::vector<uint64_t> keys(count);
-  std::vector<uint32_t> ordered_indices(count);
-  std::iota(ordered_indices.begin(), ordered_indices.end(), 0U);
-
-  const std::size_t index_chunk = chunk_size == 0 ? 1024 : chunk_size;
-  {
-    // Precompute grid placements in parallel so later stages can stay read-only and avoid rehashing coordinates.
-    ScopedTimer timer("precompute_cells", perf_timing_);
-    utils::parallelize(0, count, num_threads, index_chunk, [&](std::size_t begin, std::size_t end) {
-      for (std::size_t i = begin; i < end; ++i) {
-        const uint32_t cx = cell_of(x[i], cell_size);
-        const uint32_t cy = cell_of(y[i], cell_size);
-        cell_x[i] = cx;
-        cell_y[i] = cy;
-        keys[i] = pack_cell(cx, cy);
-      }
-    });
-  }
-
-  {
-    // Sorting indices by packed cell ensures neighbors form contiguous spans which we can scan without lookups.
-    ScopedTimer timer("sort_indices", perf_timing_);
-    std::sort(ordered_indices.begin(), ordered_indices.end(), [&](uint32_t lhs, uint32_t rhs) {
-      const uint64_t key_lhs = keys[lhs];
-      const uint64_t key_rhs = keys[rhs];
-      if (key_lhs == key_rhs)
-        return lhs < rhs;
-      return key_lhs < key_rhs;
-    });
-  }
-
-  std::vector<std::size_t> cell_offsets;
-  cell_offsets.reserve(count + 1);
-  std::vector<uint64_t> unique_keys;
-  unique_keys.reserve(count);
-
-  {
-    // Build a CSR-style view of the sorted cells so we can jump directly to the occupants of any neighboring bucket.
-    ScopedTimer timer("build_cell_offsets", perf_timing_);
-    std::size_t pos = 0;
-    while (pos < count) {
-      const uint64_t key = keys[ordered_indices[pos]];
-      unique_keys.push_back(key);
-      cell_offsets.push_back(pos);
-
-      do {
-        ++pos;
-      } while (pos < count && keys[ordered_indices[pos]] == key);
-    }
-    cell_offsets.push_back(count);
-  }
-
-  std::vector<int32_t> labels(count, -1);
-  std::vector<uint8_t> is_core(count, 0U);
-
-  const uint32_t eps_value = eps;
-  const uint32_t min_samples_value = min_samples;
-
-  const std::size_t core_chunk = chunk_size == 0 ? 512 : chunk_size;
-  {
-    // Core detection runs as an isolated pass so expansion can treat label writes as the only mutation, simplifying
-    // synchronization even when the search function is invoked concurrently.
-    ScopedTimer timer("core_detection", perf_timing_);
-    utils::parallelize(0, count, num_threads, core_chunk, [&](std::size_t begin, std::size_t end) {
-      for (std::size_t idx = begin; idx < end; ++idx) {
-        uint32_t neighbor_count = 0;
-        for_each_neighbor(static_cast<uint32_t>(idx), eps_value, x, y, cell_x, cell_y, ordered_indices, cell_offsets,
-                          unique_keys, [&](uint32_t) {
-                            ++neighbor_count;
-                            return neighbor_count < min_samples_value;
-                          });
-
-        if (neighbor_count >= min_samples_value)
-          is_core[idx] = 1U;
-      }
-    });
-  }
-
-  const ExpansionContext context{x,
-                                 y,
-                                 count,
-                                 eps_value,
-                                 min_samples_value,
-                                 cell_x,
-                                 cell_y,
-                                 ordered_indices,
-                                 cell_offsets,
-                                 unique_keys,
-                                 is_core,
-                                 num_threads,
-                                 chunk_size};
-
-  {
-    ScopedTimer timer("cluster_expansion", perf_timing_);
-    switch (expansion_mode) {
-    case GridExpansionMode::Sequential:
-      sequential_expand(context, labels);
-      break;
-    case GridExpansionMode::FrontierParallel:
-      frontier_expand(context, labels);
-      break;
-    case GridExpansionMode::UnionFind:
-      union_find_expand(context, labels);
-      break;
-    }
-  }
-
-  return labels;
+DBSCANGrid2DL1Result dbscan_grid2d_l1(const uint32_t *x, std::size_t x_stride, const uint32_t *y, std::size_t y_stride,
+                                      std::size_t count, const DBSCANGrid2DL1Params &params, GridExpansionMode mode) {
+  if (params.eps == 0)
+    throw std::invalid_argument("eps must be greater than zero for dbscan_grid2d_l1");
+  if (params.min_samples == 0)
+    throw std::invalid_argument("min_samples must be greater than zero for dbscan_grid2d_l1");
+  if (count != 0 && (x == nullptr || y == nullptr))
+    throw std::invalid_argument("Input coordinate arrays must be non-null when count > 0");
+  if (count != 0 && (x_stride == 0 || y_stride == 0))
+    throw std::invalid_argument("Stride must be positive when count > 0");
+
+  return detail::dbscan_grid2d_l1_impl(x, x_stride, y, y_stride, count, params, mode);
 }
 
 } // namespace dbscan
diff --git a/src/dbscan_grid2d_l1_aos.cpp b/src/dbscan_grid2d_l1_aos.cpp
new file mode 100644
index 0000000..15cc900
--- /dev/null
+++ b/src/dbscan_grid2d_l1_aos.cpp
@@ -0,0 +1,25 @@
+#include "dbscan_grid2d_l1.h"
+
+#include <cstddef>
+#include <stdexcept>
+
+namespace dbscan {
+
+DBSCANGrid2DL1Result dbscan_grid2d_l1_aos(const Grid2DPoint *points, std::size_t count,
+                                          const DBSCANGrid2DL1Params &params, GridExpansionMode mode) {
+  if (params.eps == 0)
+    throw std::invalid_argument("eps must be greater than zero for dbscan_grid2d_l1_aos");
+  if (params.min_samples == 0)
+    throw std::invalid_argument("min_samples must be greater than zero for dbscan_grid2d_l1_aos");
+  if (count != 0 && points == nullptr)
+    throw std::invalid_argument("Input point array must be non-null when count > 0");
+
+  static_assert(sizeof(Grid2DPoint) == sizeof(uint32_t) * 2, "Grid2DPoint is expected to be tightly packed");
+  const std::size_t stride = sizeof(Grid2DPoint) / sizeof(uint32_t);
+
+  const uint32_t *x = count == 0 ? nullptr : &points[0].x;
+  const uint32_t *y = count == 0 ? nullptr : &points[0].y;
+  return dbscan_grid2d_l1(x, stride, y, stride, count, params, mode);
+}
+
+} // namespace dbscan
diff --git a/src/dbscan_grid2d_l1_impl.hpp b/src/dbscan_grid2d_l1_impl.hpp
new file mode 100644
index 0000000..afa8722
--- /dev/null
+++ b/src/dbscan_grid2d_l1_impl.hpp
@@ -0,0 +1,459 @@
+#pragma once
+
+#include "dbscan_grid2d_l1.h"
+
+#include <algorithm>
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <mutex>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "parallel.hpp"
+#include "perf_timer.h"
+
+namespace dbscan::detail {
+
+constexpr uint64_t pack_cell(uint32_t ix, uint32_t iy) noexcept {
+  return (static_cast<uint64_t>(ix) << 32U) | static_cast<uint64_t>(iy);
+}
+
+constexpr uint32_t cell_of(uint32_t value, uint32_t cell_size) noexcept {
+  return cell_size == 0 ? value : static_cast<uint32_t>(value / cell_size);
+}
+
+inline uint32_t load_coord(const uint32_t *ptr, std::size_t stride, std::size_t index) noexcept {
+  return *(ptr + index * stride);
+}
+
+template <typename Fn>
+void for_each_neighbor(uint32_t point_index, uint32_t eps, const uint32_t *x, std::size_t x_stride,
+                       const uint32_t *y, std::size_t y_stride, const std::vector<uint32_t> &cell_x,
+                       const std::vector<uint32_t> &cell_y, const std::vector<uint32_t> &ordered_indices,
+                       const std::vector<std::size_t> &cell_offsets, const std::vector<uint64_t> &unique_keys,
+                       Fn &&fn) {
+
+  const uint32_t base_cx = cell_x[point_index];
+  const uint32_t base_cy = cell_y[point_index];
+  const uint32_t x_a = load_coord(x, x_stride, point_index);
+  const uint32_t y_a = load_coord(y, y_stride, point_index);
+
+  for (int dx = -1; dx <= 1; ++dx) {
+    const int64_t nx = static_cast<int64_t>(base_cx) + dx;
+    if (nx < 0)
+      continue;
+
+    for (int dy = -1; dy <= 1; ++dy) {
+      const int64_t ny = static_cast<int64_t>(base_cy) + dy;
+      if (ny < 0)
+        continue;
+
+      const uint64_t key = pack_cell(static_cast<uint32_t>(nx), static_cast<uint32_t>(ny));
+      const auto it = std::lower_bound(unique_keys.begin(), unique_keys.end(), key);
+      if (it == unique_keys.end() || *it != key)
+        continue;
+
+      const std::size_t cell_idx = static_cast<std::size_t>(std::distance(unique_keys.begin(), it));
+      const std::size_t begin = cell_offsets[cell_idx];
+      const std::size_t end = cell_offsets[cell_idx + 1];
+
+      for (std::size_t pos = begin; pos < end; ++pos) {
+        const uint32_t neighbor_idx = ordered_indices[pos];
+
+        const uint32_t x_b = load_coord(x, x_stride, neighbor_idx);
+        const uint32_t y_b = load_coord(y, y_stride, neighbor_idx);
+
+        const uint32_t dx_abs = x_a > x_b ? x_a - x_b : x_b - x_a;
+        const uint32_t dy_abs = y_a > y_b ? y_a - y_b : y_b - y_a;
+        const uint64_t manhattan = static_cast<uint64_t>(dx_abs) + static_cast<uint64_t>(dy_abs);
+
+        if (manhattan <= static_cast<uint64_t>(eps)) {
+          if (!fn(neighbor_idx))
+            return;
+        }
+      }
+    }
+  }
+}
+
+struct ExpansionContext {
+  const uint32_t *x;
+  std::size_t x_stride;
+  const uint32_t *y;
+  std::size_t y_stride;
+  std::size_t count;
+  uint32_t eps;
+  uint32_t min_samples;
+  const std::vector<uint32_t> &cell_x;
+  const std::vector<uint32_t> &cell_y;
+  const std::vector<uint32_t> &ordered_indices;
+  const std::vector<std::size_t> &cell_offsets;
+  const std::vector<uint64_t> &unique_keys;
+  const std::vector<uint8_t> &is_core;
+  std::size_t num_threads;
+  std::size_t chunk_size;
+};
+
+void sequential_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels) {
+  std::vector<uint32_t> stack;
+  stack.reserve(ctx.count);
+  std::vector<uint32_t> neighbor_buffer;
+  neighbor_buffer.reserve(64);
+
+  int32_t next_label = 0;
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (!ctx.is_core[i] || labels[i] != -1)
+      continue;
+
+    labels[i] = next_label;
+    stack.clear();
+    stack.push_back(static_cast<uint32_t>(i));
+
+    while (!stack.empty()) {
+      const uint32_t current = stack.back();
+      stack.pop_back();
+
+      neighbor_buffer.clear();
+      for_each_neighbor(current, ctx.eps, ctx.x, ctx.x_stride, ctx.y, ctx.y_stride, ctx.cell_x, ctx.cell_y,
+                        ctx.ordered_indices, ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) {
+                          neighbor_buffer.push_back(neighbor);
+                          return true;
+                        });
+
+      for (uint32_t neighbor : neighbor_buffer) {
+        if (labels[neighbor] == -1) {
+          labels[neighbor] = next_label;
+          if (ctx.is_core[neighbor])
+            stack.push_back(neighbor);
+        }
+      }
+    }
+
+    ++next_label;
+  }
+}
+
+void frontier_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels) {
+  std::vector<std::atomic<int32_t>> shared_labels(ctx.count);
+  for (std::size_t i = 0; i < ctx.count; ++i)
+    shared_labels[i].store(labels[i], std::memory_order_relaxed);
+
+  int32_t next_label = 0;
+  std::vector<uint32_t> frontier;
+  frontier.reserve(256);
+
+  const std::size_t frontier_chunk = ctx.chunk_size == 0 ? 64 : ctx.chunk_size;
+
+  for (std::size_t seed = 0; seed < ctx.count; ++seed) {
+    if (!ctx.is_core[seed] || shared_labels[seed].load(std::memory_order_acquire) != -1)
+      continue;
+
+    const int32_t label = next_label++;
+    shared_labels[seed].store(label, std::memory_order_release);
+    frontier.clear();
+    frontier.push_back(static_cast<uint32_t>(seed));
+
+    while (!frontier.empty()) {
+      std::vector<uint32_t> next_frontier;
+      std::mutex next_mutex;
+
+      utils::parallelize(0, frontier.size(), ctx.num_threads, frontier_chunk, [&](std::size_t begin, std::size_t end) {
+        std::vector<uint32_t> local_next;
+        local_next.reserve(32);
+        std::vector<uint32_t> neighbor_buffer;
+        neighbor_buffer.reserve(64);
+
+        for (std::size_t idx = begin; idx < end; ++idx) {
+          const uint32_t current = frontier[idx];
+
+          neighbor_buffer.clear();
+          for_each_neighbor(current, ctx.eps, ctx.x, ctx.x_stride, ctx.y, ctx.y_stride, ctx.cell_x, ctx.cell_y,
+                            ctx.ordered_indices, ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) {
+                              neighbor_buffer.push_back(neighbor);
+                              return true;
+                            });
+
+          for (uint32_t neighbor : neighbor_buffer) {
+            if (ctx.is_core[neighbor]) {
+              int32_t expected = -1;
+              if (shared_labels[neighbor].compare_exchange_strong(expected, label, std::memory_order_acq_rel))
+                local_next.push_back(neighbor);
+            } else {
+              int32_t expected = -1;
+              shared_labels[neighbor].compare_exchange_strong(expected, label, std::memory_order_acq_rel);
+            }
+          }
+        }
+
+        if (!local_next.empty()) {
+          std::sort(local_next.begin(), local_next.end());
+          local_next.erase(std::unique(local_next.begin(), local_next.end()), local_next.end());
+          std::lock_guard<std::mutex> lock(next_mutex);
+          next_frontier.insert(next_frontier.end(), local_next.begin(), local_next.end());
+        }
+      });
+
+      if (next_frontier.empty())
+        break;
+
+      std::sort(next_frontier.begin(), next_frontier.end());
+      next_frontier.erase(std::unique(next_frontier.begin(), next_frontier.end()), next_frontier.end());
+      frontier.swap(next_frontier);
+    }
+  }
+
+  for (std::size_t i = 0; i < ctx.count; ++i)
+    labels[i] = shared_labels[i].load(std::memory_order_relaxed);
+}
+
+void union_find_expand(const ExpansionContext &ctx, std::vector<int32_t> &labels) {
+  constexpr uint32_t invalid = std::numeric_limits<uint32_t>::max();
+  std::vector<std::atomic<uint32_t>> parents(ctx.count);
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (ctx.is_core[i])
+      parents[i].store(static_cast<uint32_t>(i), std::memory_order_relaxed);
+    else
+      parents[i].store(invalid, std::memory_order_relaxed);
+  }
+
+  auto find_root = [&](uint32_t node) {
+    uint32_t parent = parents[node].load(std::memory_order_acquire);
+    if (parent == invalid)
+      return invalid;
+
+    while (true) {
+      uint32_t grandparent = parents[parent].load(std::memory_order_acquire);
+      if (grandparent == parent) {
+        if (parent != node)
+          parents[node].store(parent, std::memory_order_release);
+        return parent;
+      }
+      parents[node].compare_exchange_strong(parent, grandparent, std::memory_order_acq_rel);
+      node = parent;
+      parent = parents[node].load(std::memory_order_acquire);
+      if (parent == invalid)
+        return invalid;
+    }
+  };
+
+  auto unite = [&](uint32_t a, uint32_t b) {
+    while (true) {
+      a = find_root(a);
+      b = find_root(b);
+      if (a == invalid || b == invalid || a == b)
+        return;
+
+      if (a < b) {
+        uint32_t expected = b;
+        if (parents[b].compare_exchange_strong(expected, a, std::memory_order_acq_rel))
+          return;
+      } else {
+        uint32_t expected = a;
+        if (parents[a].compare_exchange_strong(expected, b, std::memory_order_acq_rel))
+          return;
+      }
+    }
+  };
+
+  const std::size_t union_chunk = ctx.chunk_size == 0 ? 512 : ctx.chunk_size;
+  utils::parallelize(0, ctx.count, ctx.num_threads, union_chunk, [&](std::size_t begin, std::size_t end) {
+    for (std::size_t idx = begin; idx < end; ++idx) {
+      if (!ctx.is_core[idx])
+        continue;
+
+      for_each_neighbor(static_cast<uint32_t>(idx), ctx.eps, ctx.x, ctx.x_stride, ctx.y, ctx.y_stride, ctx.cell_x,
+                        ctx.cell_y, ctx.ordered_indices, ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) {
+                          if (ctx.is_core[neighbor])
+                            unite(static_cast<uint32_t>(idx), neighbor);
+                          return true;
+                        });
+    }
+  });
+
+  std::vector<uint32_t> root_for_point(ctx.count, invalid);
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (ctx.is_core[i])
+      root_for_point[i] = find_root(static_cast<uint32_t>(i));
+  }
+
+  std::vector<uint32_t> component_min(ctx.count, invalid);
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (!ctx.is_core[i])
+      continue;
+    const uint32_t root = root_for_point[i];
+    if (root == invalid)
+      continue;
+    if (component_min[root] > i)
+      component_min[root] = static_cast<uint32_t>(i);
+  }
+
+  std::vector<std::pair<uint32_t, uint32_t>> components;
+  components.reserve(ctx.count);
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (component_min[i] != invalid)
+      components.emplace_back(component_min[i], static_cast<uint32_t>(i));
+  }
+
+  std::sort(components.begin(), components.end());
+
+  std::vector<int32_t> root_label(ctx.count, -1);
+  int32_t next_label = 0;
+  for (const auto &[min_index, root] : components) {
+    (void)min_index;
+    root_label[root] = next_label++;
+  }
+
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (!ctx.is_core[i])
+      continue;
+    const uint32_t root = root_for_point[i];
+    if (root == invalid)
+      continue;
+    labels[i] = root_label[root];
+  }
+
+  for (std::size_t i = 0; i < ctx.count; ++i) {
+    if (ctx.is_core[i])
+      continue;
+
+    int32_t best_label = -1;
+    for_each_neighbor(static_cast<uint32_t>(i), ctx.eps, ctx.x, ctx.x_stride, ctx.y, ctx.y_stride, ctx.cell_x,
+                      ctx.cell_y, ctx.ordered_indices, ctx.cell_offsets, ctx.unique_keys, [&](uint32_t neighbor) {
+                        if (!ctx.is_core[neighbor])
+                          return true;
+                        const int32_t candidate = labels[neighbor];
+                        if (candidate != -1 && (best_label == -1 || candidate < best_label))
+                          best_label = candidate;
+                        return true;
+                      });
+    labels[i] = best_label;
+  }
+}
+
+inline DBSCANGrid2DL1Result dbscan_grid2d_l1_impl(const uint32_t *x, std::size_t x_stride, const uint32_t *y,
+                                                  std::size_t y_stride, std::size_t count,
+                                                  const DBSCANGrid2DL1Params &params,
+                                                  GridExpansionMode expansion_mode) {
+  DBSCANGrid2DL1Result result;
+  result.perf_timing.clear();
+  if (count == 0)
+    return result;
+
+  ScopedTimer total_timer("total", result.perf_timing);
+
+  const uint32_t cell_size = params.eps;
+
+  std::vector<uint32_t> cell_x(count);
+  std::vector<uint32_t> cell_y(count);
+  std::vector<uint64_t> keys(count);
+  std::vector<uint32_t> ordered_indices(count);
+  std::iota(ordered_indices.begin(), ordered_indices.end(), 0U);
+
+  const std::size_t index_chunk = params.chunk_size == 0 ? 1024 : params.chunk_size;
+  {
+    ScopedTimer timer("precompute_cells", result.perf_timing);
+    utils::parallelize(0, count, params.num_threads, index_chunk, [&](std::size_t begin, std::size_t end) {
+      for (std::size_t i = begin; i < end; ++i) {
+        const uint32_t cx = cell_of(load_coord(x, x_stride, i), cell_size);
+        const uint32_t cy = cell_of(load_coord(y, y_stride, i), cell_size);
+        cell_x[i] = cx;
+        cell_y[i] = cy;
+        keys[i] = pack_cell(cx, cy);
+      }
+    });
+  }
+
+  {
+    ScopedTimer timer("sort_indices", result.perf_timing);
+    std::sort(ordered_indices.begin(), ordered_indices.end(), [&](uint32_t lhs, uint32_t rhs) {
+      const uint64_t key_lhs = keys[lhs];
+      const uint64_t key_rhs = keys[rhs];
+      if (key_lhs == key_rhs)
+        return lhs < rhs;
+      return key_lhs < key_rhs;
+    });
+  }
+
+  std::vector<std::size_t> cell_offsets;
+  cell_offsets.reserve(count + 1);
+  std::vector<uint64_t> unique_keys;
+  unique_keys.reserve(count);
+
+  {
+    ScopedTimer timer("build_cell_offsets", result.perf_timing);
+    std::size_t pos = 0;
+    while (pos < count) {
+      const uint64_t key = keys[ordered_indices[pos]];
+      unique_keys.push_back(key);
+      cell_offsets.push_back(pos);
+
+      do {
+        ++pos;
+      } while (pos < count && keys[ordered_indices[pos]] == key);
+    }
+    cell_offsets.push_back(count);
+  }
+
+  result.labels.assign(count, -1);
+  std::vector<uint8_t> is_core(count, 0U);
+
+  const uint32_t eps_value = params.eps;
+  const uint32_t min_samples_value = params.min_samples;
+
+  const std::size_t core_chunk = params.chunk_size == 0 ? 512 : params.chunk_size;
+  {
+    ScopedTimer timer("core_detection", result.perf_timing);
+    utils::parallelize(0, count, params.num_threads, core_chunk, [&](std::size_t begin, std::size_t end) {
+      for (std::size_t idx = begin; idx < end; ++idx) {
+        uint32_t neighbor_count = 0;
+        for_each_neighbor(static_cast<uint32_t>(idx), eps_value, x, x_stride, y, y_stride, cell_x, cell_y,
+                          ordered_indices, cell_offsets, unique_keys, [&](uint32_t) {
+                            ++neighbor_count;
+                            return neighbor_count < min_samples_value;
+                          });
+
+        if (neighbor_count >= min_samples_value)
+          is_core[idx] = 1U;
+      }
+    });
+  }
+
+  const ExpansionContext context{x,
+                                 x_stride,
+                                 y,
+                                 y_stride,
+                                 count,
+                                 eps_value,
+                                 min_samples_value,
+                                 cell_x,
+                                 cell_y,
+                                 ordered_indices,
+                                 cell_offsets,
+                                 unique_keys,
+                                 is_core,
+                                 params.num_threads,
+                                 params.chunk_size};
+
+  {
+    ScopedTimer timer("cluster_expansion", result.perf_timing);
+    switch (expansion_mode) {
+    case GridExpansionMode::Sequential:
+      sequential_expand(context, result.labels);
+      break;
+    case GridExpansionMode::FrontierParallel:
+      frontier_expand(context, result.labels);
+      break;
+    case GridExpansionMode::UnionFind:
+      union_find_expand(context, result.labels);
+      break;
+    }
+  }
+
+  return result;
+}
+
+} // namespace dbscan::detail
+
diff --git a/tests/test_dbscan_grid2d_l1.cpp b/tests/test_dbscan_grid2d_l1.cpp
index b08b0d0..034240f 100644
--- a/tests/test_dbscan_grid2d_l1.cpp
+++ b/tests/test_dbscan_grid2d_l1.cpp
@@ -64,38 +64,45 @@ void load_fixture(const std::filesystem::path &data_path, const std::filesystem:
   REQUIRE(truth_stream.good());
 }
 
+std::vector<dbscan::Grid2DPoint> make_aos(const std::vector<uint32_t> &x, const std::vector<uint32_t> &y) {
+  std::vector<dbscan::Grid2DPoint> points(x.size());
+  for (std::size_t i = 0; i < points.size(); ++i)
+    points[i] = dbscan::Grid2DPoint{x[i], y[i]};
+  return points;
+}
+
 } // namespace
 
 TEST_CASE("DBSCANGrid2D_L1 clusters dense neighbors", "[dbscan][grid_l1]") {
-  // Points are arranged so the Manhattan frontier just connects the first three but leaves the outlier isolated,
-  // validating that the L1 grid expansion covers diagonals without absorbing distant noise.
   const std::vector<uint32_t> x = {0, 1, 2, 100};
   const std::vector<uint32_t> y = {0, 0, 1, 200};
 
-  dbscan::DBSCANGrid2D_L1 algo(4, 3);
-  auto labels = algo.fit_predict(x.data(), y.data(), x.size());
+  const dbscan::DBSCANGrid2DL1Params params{4, 3};
+  const auto soa_result = dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params);
+  const auto &labels = soa_result.labels;
 
   REQUIRE(labels.size() == x.size());
   REQUIRE(labels[0] == labels[1]);
   REQUIRE(labels[1] == labels[2]);
   REQUIRE(labels[0] != -1);
   REQUIRE(labels[3] == -1);
+
+  const auto aos_points = make_aos(x, y);
+  const auto aos_result = dbscan::dbscan_grid2d_l1_aos(aos_points.data(), aos_points.size(), params);
+  REQUIRE(aos_result.labels == labels);
 }
 
 TEST_CASE("DBSCANGrid2D_L1 respects min_samples threshold", "[dbscan][grid_l1]") {
-  // Every point is deliberately spaced just beyond eps so we confirm the min_samples guard suppresses tiny clusters.
   const std::vector<uint32_t> coords = {0, 2, 4};
-  dbscan::DBSCANGrid2D_L1 algo(3, 4);
-  auto labels = algo.fit_predict(coords.data(), coords.data(), coords.size());
-
-  REQUIRE(labels.size() == coords.size());
-  for (int32_t label : labels) {
+  const dbscan::DBSCANGrid2DL1Params params{3, 4};
+  const auto result = dbscan::dbscan_grid2d_l1(coords.data(), 1, coords.data(), 1, coords.size(), params);
+  REQUIRE(result.labels.size() == coords.size());
+  for (int32_t label : result.labels) {
     REQUIRE(label == -1);
   }
 }
 
 TEST_CASE("DBSCANGrid2D_L1 matches fixture truth", "[dbscan][grid_l1]") {
-  // Fixture run mirrors the end-to-end validator to ensure the optimized grid path stays aligned with reference data.
   const std::filesystem::path root = fixture_root();
   const auto data_path = root / "dbscan_static_data.bin";
   const auto truth_path = root / "dbscan_static_truth.bin";
@@ -105,33 +112,33 @@ TEST_CASE("DBSCANGrid2D_L1 matches fixture truth", "[dbscan][grid_l1]") {
   std::vector<int32_t> truth;
   load_fixture(data_path, truth_path, x, y, truth);
 
-  dbscan::DBSCANGrid2D_L1 algo(10, 3);
-  auto labels = algo.fit_predict(x.data(), y.data(), x.size());
-
-  REQUIRE(labels.size() == truth.size());
-  REQUIRE(labels == truth);
+  const dbscan::DBSCANGrid2DL1Params params{10, 3};
+  const auto sequential = dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params);
+  REQUIRE(sequential.labels == truth);
 
-  dbscan::DBSCANGrid2D_L1Frontier frontier_algo(10, 3, 4);
-  auto frontier_labels = frontier_algo.fit_predict(x.data(), y.data(), x.size());
-  REQUIRE(frontier_labels == truth);
+  const auto frontier =
+      dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params, dbscan::GridExpansionMode::FrontierParallel);
+  REQUIRE(frontier.labels == truth);
 
-  dbscan::DBSCANGrid2D_L1UnionFind union_algo(10, 3, 4);
-  auto union_labels = union_algo.fit_predict(x.data(), y.data(), x.size());
-  REQUIRE(union_labels == truth);
+  const auto union_find =
+      dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params, dbscan::GridExpansionMode::UnionFind);
+  REQUIRE(union_find.labels == truth);
 }
 
 TEST_CASE("DBSCANGrid2D_L1 parallel variants align with sequential", "[dbscan][grid_l1][parallel]") {
   const std::vector<uint32_t> x = {0, 1, 2, 5, 40};
   const std::vector<uint32_t> y = {0, 0, 1, 4, 80};
 
-  dbscan::DBSCANGrid2D_L1 sequential(6, 3);
-  const auto expected = sequential.fit_predict(x.data(), y.data(), x.size());
+  dbscan::DBSCANGrid2DL1Params params{6, 3};
+  params.num_threads = 4;
+
+  const auto sequential = dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params);
 
-  dbscan::DBSCANGrid2D_L1Frontier frontier(6, 3, 4);
-  auto frontier_labels = frontier.fit_predict(x.data(), y.data(), x.size());
-  REQUIRE(frontier_labels == expected);
+  const auto frontier =
+      dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params, dbscan::GridExpansionMode::FrontierParallel);
+  REQUIRE(frontier.labels == sequential.labels);
 
-  dbscan::DBSCANGrid2D_L1UnionFind union_algo(6, 3, 4);
-  auto union_labels = union_algo.fit_predict(x.data(), y.data(), x.size());
-  REQUIRE(union_labels == expected);
+  const auto union_find =
+      dbscan::dbscan_grid2d_l1(x.data(), 1, y.data(), 1, x.size(), params, dbscan::GridExpansionMode::UnionFind);
+  REQUIRE(union_find.labels == sequential.labels);
 }
diff --git a/tools/dbscan_dataset_validator.cpp b/tools/dbscan_dataset_validator.cpp
index 88ad292..6afc06b 100644
--- a/tools/dbscan_dataset_validator.cpp
+++ b/tools/dbscan_dataset_validator.cpp
@@ -453,21 +453,25 @@ int main(int argc, char **argv) {
       }
       if (x_coords.size() != y_coords.size())
         throw std::runtime_error("Mismatch between x and y coordinate counts");
+      dbscan::DBSCANGrid2DL1Params params{eps_int, static_cast<uint32_t>(options.min_samples)};
       for (auto mode : options.grid_modes) {
         const auto info = grid_variant_info(mode);
         std::cout << "\n[" << info.label << "] Running clustering..." << std::flush;
         const auto start = std::chrono::steady_clock::now();
-        dbscan::DBSCANGrid2D_L1 grid_algo(eps_int, static_cast<uint32_t>(options.min_samples), 0, 0, mode);
-        const auto labels = grid_algo.fit_predict(x_coords.data(), y_coords.data(), x_coords.size());
-        std::vector<std::size_t> mismatches;
-        const auto metrics = evaluate(labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr);
+        const auto result = dbscan::dbscan_grid2d_l1(x_coords.data(), 1, y_coords.data(), 1, x_coords.size(), params,
+                                                     mode);
         const auto end = std::chrono::steady_clock::now();
         const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
         std::cout << " done in " << elapsed_ms << " ms" << std::endl;
-        if (!grid_algo.perf_timing_.entries().empty()) {
+
+        std::vector<std::size_t> mismatches;
+        const auto metrics =
+            evaluate(result.labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr);
+
+        if (!result.perf_timing.entries().empty()) {
           // Emit step-level timings so dataset runs surface bottlenecks without separate profiling passes.
           std::cout << "[" << info.label << "] component timings" << std::endl;
-          for (const auto &entry : grid_algo.perf_timing_.entries())
+          for (const auto &entry : result.perf_timing.entries())
             std::cout << "  " << entry << std::endl;
         }
         results.push_back({info.label, metrics});

From 671914aa9cf502addc3a008089e65f543faaadff Mon Sep 17 00:00:00 2001
From: Bo Lu <bo.lu@utexas.edu>
Date: Thu, 2 Oct 2025 08:17:59 -0700
Subject: [PATCH 12/12] refactor

---
 tools/dbscan_dataset_validator.cpp | 75 ++++++++++++++++++------------
 1 file changed, 44 insertions(+), 31 deletions(-)

diff --git a/tools/dbscan_dataset_validator.cpp b/tools/dbscan_dataset_validator.cpp
index 6afc06b..c132b4a 100644
--- a/tools/dbscan_dataset_validator.cpp
+++ b/tools/dbscan_dataset_validator.cpp
@@ -454,39 +454,52 @@ int main(int argc, char **argv) {
       if (x_coords.size() != y_coords.size())
         throw std::runtime_error("Mismatch between x and y coordinate counts");
       dbscan::DBSCANGrid2DL1Params params{eps_int, static_cast<uint32_t>(options.min_samples)};
+      std::vector<dbscan::Grid2DPoint> aos_points;
+      aos_points.reserve(x_coords.size());
+      for (std::size_t i = 0; i < x_coords.size(); ++i)
+        aos_points.push_back(dbscan::Grid2DPoint{x_coords[i], y_coords[i]});
       for (auto mode : options.grid_modes) {
         const auto info = grid_variant_info(mode);
-        std::cout << "\n[" << info.label << "] Running clustering..." << std::flush;
-        const auto start = std::chrono::steady_clock::now();
-        const auto result = dbscan::dbscan_grid2d_l1(x_coords.data(), 1, y_coords.data(), 1, x_coords.size(), params,
-                                                     mode);
-        const auto end = std::chrono::steady_clock::now();
-        const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
-        std::cout << " done in " << elapsed_ms << " ms" << std::endl;
-
-        std::vector<std::size_t> mismatches;
-        const auto metrics =
-            evaluate(result.labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr);
-
-        if (!result.perf_timing.entries().empty()) {
-          // Emit step-level timings so dataset runs surface bottlenecks without separate profiling passes.
-          std::cout << "[" << info.label << "] component timings" << std::endl;
-          for (const auto &entry : result.perf_timing.entries())
-            std::cout << "  " << entry << std::endl;
-        }
-        results.push_back({info.label, metrics});
-
-        if (options.mismatch_output_dir && !mismatches.empty()) {
-          std::filesystem::create_directories(*options.mismatch_output_dir);
-          auto file_path = *options.mismatch_output_dir / (info.slug + "_mismatches.txt");
-          std::ofstream out(file_path);
-          if (!out)
-            throw std::runtime_error("Failed to open mismatch output file: " + file_path.string());
-          for (std::size_t index : mismatches)
-            out << index << '\n';
-          std::cout << "[" << info.label << "] Wrote " << mismatches.size() << " mismatches to " << file_path
-                    << "\n";
-        }
+        const auto run_layout = [&](const std::string &layout, auto &&invoke) {
+          const std::string display_label = info.label + "/" + layout;
+          std::cout << "\n[" << display_label << "] Running clustering..." << std::flush;
+          const auto start = std::chrono::steady_clock::now();
+          const auto result = invoke();
+          const auto end = std::chrono::steady_clock::now();
+          const auto elapsed_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
+          std::cout << " done in " << elapsed_ms << " ms" << std::endl;
+
+          std::vector<std::size_t> mismatches;
+          const auto metrics =
+              evaluate(result.labels, truth_labels, options.mismatch_output_dir ? &mismatches : nullptr);
+
+          if (!result.perf_timing.entries().empty()) {
+            std::cout << "[" << display_label << "] component timings" << std::endl;
+            for (const auto &entry : result.perf_timing.entries())
+              std::cout << "  " << entry << std::endl;
+          }
+          results.push_back({display_label, metrics});
+
+          if (options.mismatch_output_dir && !mismatches.empty()) {
+            std::filesystem::create_directories(*options.mismatch_output_dir);
+            auto file_path = *options.mismatch_output_dir / (info.slug + "_" + layout + "_mismatches.txt");
+            std::ofstream out(file_path);
+            if (!out)
+              throw std::runtime_error("Failed to open mismatch output file: " + file_path.string());
+            for (std::size_t index : mismatches)
+              out << index << '\n';
+            std::cout << "[" << display_label << "] Wrote " << mismatches.size() << " mismatches to " << file_path
+                      << "\n";
+          }
+        };
+
+        run_layout("soa", [&]() {
+          return dbscan::dbscan_grid2d_l1(x_coords.data(), 1, y_coords.data(), 1, x_coords.size(), params, mode);
+        });
+
+        run_layout("aos", [&]() {
+          return dbscan::dbscan_grid2d_l1_aos(aos_points.data(), aos_points.size(), params, mode);
+        });
       }
     }