From 11dc1aef1f36c83be6b567c7de75720a9e4370ef Mon Sep 17 00:00:00 2001
From: Justin Maier <Zipp425@gmail.com>
Date: Mon, 13 Apr 2026 12:20:57 -0600
Subject: [PATCH] perf: offset-sorted mmap reads + disk read concurrency
 limiter

Two changes to fix 5-10s doc read tail latency:

1. Sort keys by data.bin offset before reading (datasilo get_many_timed).
   Turns random mmap page faults into sequential access, enabling kernel
   readahead and I/O request merging. At 109M records across multi-GB
   data.bin, random access causes page fault storms under concurrency.

2. Add 16-permit semaphore on the spawn_blocking doc read path (server.rs).
   Caps concurrent disk readers from 78 (observed in prod) to 16. Excess
   readers queue in async context (no thread consumed) instead of all
   doing simultaneous page faults.

Prod data: 0.35% of disk fetches took 5-10s, 2.2% took 2.5-5s.
Expected: sequential reads reduce per-batch latency; semaphore eliminates
the I/O contention that amplifies individual read times.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 crates/datasilo/src/lib.rs | 26 ++++++++++++++++++++++----
 src/server.rs              | 12 ++++++++++++
 2 files changed, 34 insertions(+), 4 deletions(-)
diff --git a/crates/datasilo/src/lib.rs b/crates/datasilo/src/lib.rs
index 3c4c538..9328a48 100644
--- a/crates/datasilo/src/lib.rs
+++ b/crates/datasilo/src/lib.rs
@@ -536,14 +536,32 @@ impl<S: SnapshotCodec, O: OpCodec<Snapshot = S::Snapshot>> DataSilo<S, O> {
         }
 
         // Phase 1: seed snapshots from data.bin (no locks).
+        // Sort keys by their offset in data.bin so mmap reads are sequential.
+        // This turns random page faults into sequential readahead — the kernel
+        // prefetches adjacent pages, and the I/O scheduler merges requests.
         let mmap_start = Instant::now();
-        let mut out: Vec<Option<S::Snapshot>> = keys
+        let mut out: Vec<Option<S::Snapshot>> = vec![None; keys.len()];
+
+        // Build (original_index, offset) pairs, sort by offset, read in order.
+        let mut order: Vec<(usize, u64)> = keys
             .iter()
-            .map(|&key| match self.data_bytes_for(key) {
-                Some(bytes) => S::decode(bytes).ok(),
-                None => None,
+            .enumerate()
+            .map(|(i, &key)| {
+                let offset = self.index.as_ref()
+                    .and_then(|idx| idx.get(key))
+                    .map_or(u64::MAX, |e| e.offset);
+                (i, offset)
             })
             .collect();
+        order.sort_unstable_by_key(|&(_, offset)| offset);
+
+        for &(orig_idx, _) in &order {
+            let key = keys[orig_idx];
+            out[orig_idx] = match self.data_bytes_for(key) {
+                Some(bytes) => S::decode(bytes).ok(),
+                None => None,
+            };
+        }
         let mmap_nanos = mmap_start.elapsed().as_nanos() as u64;
 
         // Phase 2: scan each ops log once, applying every op to any key that
diff --git a/src/server.rs b/src/server.rs
index 368c9bc..ce92f59 100644
--- a/src/server.rs
+++ b/src/server.rs
@@ -28,6 +28,13 @@ use crate::executor::{CaseSensitiveFields, StringMaps};
 use crate::loader;
 use crate::metrics::Metrics;
 use crate::mutation::FieldValue;
+
+/// Limit concurrent disk doc reads to prevent mmap page-fault storms.
+/// At 109M records with multi-GB data.bin, 78 concurrent readers doing
+/// random page faults thrash the I/O scheduler — P99 doc reads hit 5-10s.
+/// 16 permits keeps disk utilization high without contention collapse.
+static DOC_DISK_READ_SEMAPHORE: std::sync::LazyLock<tokio::sync::Semaphore> =
+    std::sync::LazyLock::new(|| tokio::sync::Semaphore::new(16));
 use crate::query::{BitdexQuery, Value};
 
 // ---------------------------------------------------------------------------
@@ -2775,6 +2782,11 @@ async fn handle_query(
                 m.query_docs_path_total
                     .with_label_values(&[&name_docs, "spawn_blocking"])
                     .inc();
+                // Acquire disk read permit before spawn_blocking. This queues
+                // excess readers in async context (no thread consumed) rather
+                // than spawning 78 blocking threads that all page-fault at once.
+                let _disk_permit = DOC_DISK_READ_SEMAPHORE.acquire().await
+                    .expect("doc disk read semaphore closed");
                 // C1 isolation: capture wall time immediately before the
                 // spawn_blocking call so we can measure dispatch gap.
                 let dispatch_start = Instant::now();