From 74af0e90bf8f7df27e664b65adbf3ac8e674abec Mon Sep 17 00:00:00 2001
From: Stef Kariotidis <stef.kariotidis@gmail.com>
Date: Thu, 26 Mar 2026 19:43:47 +0200
Subject: [PATCH 1/3] WIP: Fix cascading 503 on cold-cache tile loading
 (checkpoint)

Two-part fix for tiles graying out instead of progressively loading:

1. Client-side: Add concurrency pool (6 slots) to retryTileLayer.js.
   Prevents thundering herd where ~35 tiles blast the server simultaneously,
   overwhelming both per-IP and global outbound budgets.

2. Server-side: Two-phase per-IP outbound budget in SendTileRequestCoreAsync.
   Previously, IsRateLimitExceeded incremented the per-IP counter on every
   request, even those rejected by the global budget. This caused retries to
   find the counter already past the limit and fail immediately (cascading 503).
   Now uses WouldExceedRateLimit (peek, no increment) for fast-fail, then
   RecordRateLimitHit only after global budget is acquired.
---
 Services/RateLimitHelper.cs  |  73 ++++++++++++++
 Services/TileCacheService.cs |  28 ++++--
 wwwroot/js/retryTileLayer.js | 190 ++++++++++++++++++++++++-----------
 3 files changed, 224 insertions(+), 67 deletions(-)
diff --git a/Services/RateLimitHelper.cs b/Services/RateLimitHelper.cs
index 49d3ba3b..c9874247 100644
--- a/Services/RateLimitHelper.cs
+++ b/Services/RateLimitHelper.cs
@@ -111,6 +111,37 @@ public int IncrementAndGet(long currentTicks, long newExpirationTicks)
             return weighted;
         }
 
+        /// <summary>
+        /// Returns the current weighted sliding-window count WITHOUT incrementing.
+        /// Used for speculative checks (e.g., per-IP outbound budget) where the
+        /// increment should be deferred until the request actually proceeds upstream.
+        /// </summary>
+        /// <param name="currentTicks">The current tick count.</param>
+        /// <returns>The weighted sliding-window request count (read-only).</returns>
+        public int PeekCount(long currentTicks)
+        {
+            var currentExpiration = Interlocked.Read(ref _expirationTicks);
+            // Don't rotate the window — this is a read-only peek.
+            // If the window has expired, the count will be stale (0 + prevWeight * prev),
+            // which is conservative: it underestimates if a rotation is pending,
+            // so the caller may allow a borderline request that IncrementAndGet would block.
+            // Acceptable for the two-phase pattern where the actual increment follows shortly.
+
+            var currentCount = Volatile.Read(ref _count);
+
+            var windowStart = Interlocked.Read(ref _windowStartTicks);
+            var windowEnd = Interlocked.Read(ref _expirationTicks);
+            var windowSize = windowEnd - windowStart;
+            if (windowSize <= 0) windowSize = WindowTicks;
+            var elapsed = currentTicks - windowStart;
+            if (elapsed < 0) elapsed = 0;
+            if (elapsed > windowSize) elapsed = windowSize;
+
+            var prevWeight = 1.0 - ((double)elapsed / windowSize);
+            var prev = Volatile.Read(ref _prevCount);
+            return (int)(prev * prevWeight) + currentCount;
+        }
+
         /// <summary>
         /// Returns true if this entry's window has expired.
         /// Uses a 2-window horizon: an entry is considered expired only after 2 full windows
@@ -178,6 +209,48 @@ public static bool IsRateLimitExceeded(
         return count > maxRequestsPerMinute;
     }
 
+    /// <summary>
+    /// Checks if the given client key would exceed the rate limit WITHOUT incrementing the counter.
+    /// Used for speculative fast-fail checks (e.g., per-IP outbound budget) where the actual
+    /// increment is deferred until the request succeeds. This prevents budget-exhausted requests
+    /// from inflating the per-IP counter, which would cause cascading 503 rejections on retries.
+    /// </summary>
+    /// <param name="cache">The concurrent dictionary tracking rate limit entries per key.</param>
+    /// <param name="clientKey">The client key (IP address, user ID, etc.).</param>
+    /// <param name="maxRequestsPerMinute">Maximum allowed requests per minute.</param>
+    /// <returns>True if the rate limit would be exceeded, false otherwise.</returns>
+    public static bool WouldExceedRateLimit(
+        ConcurrentDictionary<string, RateLimitEntry> cache,
+        string clientKey,
+        int maxRequestsPerMinute)
+    {
+        var currentTicks = DateTime.UtcNow.Ticks;
+        var expirationTicks = currentTicks + WindowTicks;
+
+        var entry = cache.GetOrAdd(clientKey, _ => new RateLimitEntry(expirationTicks));
+        var count = entry.PeekCount(currentTicks);
+        return count >= maxRequestsPerMinute;
+    }
+
+    /// <summary>
+    /// Records a successful request against the rate limit counter. Call this AFTER the request
+    /// has actually been fulfilled (e.g., after acquiring the global outbound budget token).
+    /// Pairs with <see cref="WouldExceedRateLimit"/> for two-phase rate limiting where the
+    /// check and increment are separated to avoid counting rejected requests.
+    /// </summary>
+    /// <param name="cache">The concurrent dictionary tracking rate limit entries per key.</param>
+    /// <param name="clientKey">The client key (IP address, user ID, etc.).</param>
+    public static void RecordRateLimitHit(
+        ConcurrentDictionary<string, RateLimitEntry> cache,
+        string clientKey)
+    {
+        var currentTicks = DateTime.UtcNow.Ticks;
+        var expirationTicks = currentTicks + WindowTicks;
+
+        var entry = cache.GetOrAdd(clientKey, _ => new RateLimitEntry(expirationTicks));
+        entry.IncrementAndGet(currentTicks, expirationTicks);
+    }
+
     /// <summary>
     /// Removes expired entries from the rate limit cache to prevent memory growth.
     /// Called periodically when cache exceeds size threshold.
diff --git a/Services/TileCacheService.cs b/Services/TileCacheService.cs
index f8d45238..0c42e159 100644
--- a/Services/TileCacheService.cs
+++ b/Services/TileCacheService.cs
@@ -461,32 +461,37 @@ public string GetCacheDirectory()
         Action<HttpRequestMessage>? configureRequest = null, bool skipBudget = false,
         string? clientIp = null, CancellationToken cancellationToken = default)
     {
-        // Per-IP outbound budget check: prevent a single client from monopolizing global tokens.
-        // Checked before the global budget to fail fast without consuming a global token.
+        // Two-phase per-IP outbound budget: peek first (fast-fail without incrementing),
+        // then record the hit only after the global budget is acquired. This prevents
+        // budget-exhausted requests from inflating the per-IP counter, which previously
+        // caused cascading 503 rejections: on cold-cache loads with ~35 tiles, every
+        // request (including those rejected by the global budget) incremented the per-IP
+        // counter, so retries found the counter already past the limit and failed immediately.
         // skipBudget is true on retries — the per-IP check was already passed on the first attempt.
         // clientIp may be passed explicitly by callers (e.g., coalesced revalidation) where
         // HttpContext is no longer available; falls back to HttpContext if not provided.
+        string? resolvedIpForBudget = null;
         if (!skipBudget)
         {
             var perIpLimit = _applicationSettings.GetSettings().TileOutboundBudgetPerIpPerMinute;
             if (perIpLimit > 0)
             {
-                var resolvedIp = clientIp;
-                if (resolvedIp == null)
+                resolvedIpForBudget = clientIp;
+                if (resolvedIpForBudget == null)
                 {
                     var ctx = _httpContextAccessor.HttpContext;
                     if (ctx != null)
                     {
-                        resolvedIp = RateLimitHelper.GetClientIpAddress(ctx);
+                        resolvedIpForBudget = RateLimitHelper.GetClientIpAddress(ctx);
                     }
                 }
 
-                if (resolvedIp != null && RateLimitHelper.IsRateLimitExceeded(
-                        TilesController.OutboundBudgetCache, resolvedIp, perIpLimit))
+                if (resolvedIpForBudget != null && RateLimitHelper.WouldExceedRateLimit(
+                        TilesController.OutboundBudgetCache, resolvedIpForBudget, perIpLimit))
                 {
                     _logger.LogWarning(
                         "Per-IP outbound budget exceeded for {ClientIp} — throttling upstream request for {TileUrl}",
-                        resolvedIp, TileProviderCatalog.RedactApiKey(tileUrl));
+                        resolvedIpForBudget, TileProviderCatalog.RedactApiKey(tileUrl));
                     return null;
                 }
             }
@@ -504,6 +509,13 @@ public string GetCacheDirectory()
             return null;
         }
 
+        // Both budgets passed — record the per-IP hit now that an upstream fetch will proceed.
+        // This ensures only actual upstream fetches count against the per-IP limit.
+        if (resolvedIpForBudget != null)
+        {
+            RateLimitHelper.RecordRateLimitHit(TilesController.OutboundBudgetCache, resolvedIpForBudget);
+        }
+
         const int maxRedirects = 3;
         var initialUri = new Uri(tileUrl);
         var currentUri = initialUri;
diff --git a/wwwroot/js/retryTileLayer.js b/wwwroot/js/retryTileLayer.js
index 9ffb9bbc..2ca280d3 100644
--- a/wwwroot/js/retryTileLayer.js
+++ b/wwwroot/js/retryTileLayer.js
@@ -3,6 +3,14 @@
  * This enables HTTP status code inspection so we can retry on 503 (budget exhaustion)
  * while treating 404 as permanent failure.
  *
+ * Concurrency control:
+ * - Global pool limits concurrent tile fetches (default 6) to prevent overwhelming
+ *   the server's outbound budget (10 burst, 2/sec) and per-IP budget (default 120/min).
+ *   Without this, a cold-cache load at zoom 17 (~35 tiles) sends all requests
+ *   simultaneously, exhausting both budgets and causing cascading 503 failures where
+ *   retries also get rejected (the per-IP counter increments on every request, even
+ *   rejected ones, so the count quickly snowballs past the limit).
+ *
  * Retry strategy:
  * - Only retries on HTTP 503 or network errors
  * - Reads Retry-After header from server (falls back to exponential backoff)
@@ -14,6 +22,58 @@
  * on our proxy. If upstream OSM is down, retrying would not help and would only pile
  * up stale retry timers. Users will see gray tiles until upstream recovers.
  */
+
+// ---------- Global concurrency pool ----------
+// Limits concurrent tile fetches to prevent overwhelming the server's per-IP outbound
+// budget (default 120/min) and global token budget (10 burst, 2/sec). Tiles beyond the
+// limit queue client-side and proceed as slots free up, producing the progressive
+// "stream-in" effect on cold-cache loads instead of a wall of 503s.
+const _poolSize = 6;
+let _inFlight = 0;
+const _waiting = [];
+
+/**
+ * Acquires a concurrency slot. Resolves with true when a slot is available,
+ * or false if the signal was aborted while queued (tile panned/zoomed away).
+ * @param {AbortSignal} signal - Abort signal from the tile's AbortController.
+ * @returns {Promise<boolean>} True if a slot was acquired (caller must call _releaseSlot).
+ */
+const _acquireSlot = (signal) => {
+    if (signal.aborted) return Promise.resolve(false);
+    if (_inFlight < _poolSize) {
+        _inFlight++;
+        return Promise.resolve(true);
+    }
+    return new Promise((resolve) => {
+        const entry = { resolve };
+        _waiting.push(entry);
+        // When the tile is removed (panned/zoomed away), dequeue so the slot isn't
+        // wasted on a fetch that will be immediately aborted.
+        signal.addEventListener('abort', () => {
+            const i = _waiting.indexOf(entry);
+            if (i !== -1) {
+                _waiting.splice(i, 1);
+                resolve(false);
+            }
+            // If entry was already shifted by _releaseSlot, the slot was transferred
+            // to us — caller checks signal.aborted and releases.
+        }, { once: true });
+    });
+};
+
+/**
+ * Releases a concurrency slot, allowing the next queued fetch to proceed.
+ * Must be called exactly once for every _acquireSlot that resolved with true.
+ */
+const _releaseSlot = () => {
+    if (_waiting.length > 0) {
+        // Transfer slot directly to next waiter (don't decrement _inFlight).
+        _waiting.shift().resolve(true);
+    } else {
+        _inFlight--;
+    }
+};
+
 const RetryTileLayer = L.TileLayer.extend({
     options: {
         maxRetries: 5,
@@ -75,7 +135,8 @@ const RetryTileLayer = L.TileLayer.extend({
 
     /**
      * Fetches a tile via fetch(), retries on 503 or network error with backoff.
-     * Respects the AbortSignal so removed tiles stop retrying immediately.
+     * Acquires a concurrency slot before each fetch attempt to prevent overwhelming
+     * the server's budget. Respects AbortSignal so removed tiles stop immediately.
      * @param {string} url - The tile URL.
      * @param {HTMLImageElement} tile - The tile image element.
      * @param {Function} done - Leaflet callback to signal completion.
@@ -87,66 +148,77 @@ const RetryTileLayer = L.TileLayer.extend({
         const maxRetries = this.options.maxRetries;
         const baseDelay = this.options.retryDelayMs;
 
-        fetch(url, { signal: signal }).then(function (response) {
-            if (response.ok) {
-                return response.blob().then(function (blob) {
-                    // Guard: if the tile was removed (panned/zoomed away) while the blob
-                    // was being read, skip assigning the blob URL. Without this check,
-                    // Leaflet's _removeTile would have already replaced onload/onerror
-                    // with falseFn, so the revokeObjectURL callback would never fire,
-                    // leaking the blob URL.
-                    if (signal.aborted) return;
-                    tile.onload = function () {
-                        URL.revokeObjectURL(tile.src);
-                        done(null, tile);
-                    };
-                    tile.onerror = function (e) {
-                        URL.revokeObjectURL(tile.src);
-                        done(e, tile);
-                    };
-                    tile.src = URL.createObjectURL(blob);
-                });
-            }
+        _acquireSlot(signal).then(function (acquired) {
+            // No slot acquired — tile was removed while queued, nothing to do.
+            if (!acquired) return;
+            // Slot acquired but tile removed in the meantime — release immediately.
+            if (signal.aborted) { _releaseSlot(); return; }
 
-            // 503 = budget exhausted, transient — retry with Retry-After or backoff.
-            // Jitter (±25%) prevents thundering-herd retries when many tiles 503 simultaneously.
-            if (response.status === 503 && attempt < maxRetries) {
-                const retryAfter = response.headers.get('Retry-After');
-                const parsed = retryAfter ? parseInt(retryAfter, 10) : NaN;
-                let delayMs = !isNaN(parsed) && parsed > 0
-                    ? parsed * 1000
-                    : baseDelay * Math.pow(2, attempt);
-                delayMs = Math.max(delayMs, baseDelay); // floor: never below base delay
-                delayMs = Math.min(delayMs, 10000);     // cap: never above 10s
-                delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25%
-
-                setTimeout(function () {
-                    // Check if tile was removed while waiting — abort signal is set.
-                    if (!signal.aborted) {
-                        layer._fetchWithRetry(url, tile, done, attempt + 1, signal);
-                    }
-                }, delayMs);
-                return;
-            }
+            fetch(url, { signal: signal }).then(function (response) {
+                _releaseSlot();
 
-            // Non-retryable (404, 400, 500, etc.)
-            done(new Error('Tile fetch failed: ' + response.status), tile);
-        }).catch(function (err) {
-            // Tile was removed (panned/zoomed away) — silently stop.
-            if (err.name === 'AbortError') return;
-
-            // Network error (or body-read failure mid-transfer) — retry if attempts remain
-            if (attempt < maxRetries) {
-                let delayMs = Math.min(baseDelay * Math.pow(2, attempt), 10000);
-                delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25%
-                setTimeout(function () {
-                    if (!signal.aborted) {
-                        layer._fetchWithRetry(url, tile, done, attempt + 1, signal);
-                    }
-                }, delayMs);
-                return;
-            }
-            done(err, tile);
+                if (response.ok) {
+                    return response.blob().then(function (blob) {
+                        // Guard: if the tile was removed (panned/zoomed away) while the blob
+                        // was being read, skip assigning the blob URL. Without this check,
+                        // Leaflet's _removeTile would have already replaced onload/onerror
+                        // with falseFn, so the revokeObjectURL callback would never fire,
+                        // leaking the blob URL.
+                        if (signal.aborted) return;
+                        tile.onload = function () {
+                            URL.revokeObjectURL(tile.src);
+                            done(null, tile);
+                        };
+                        tile.onerror = function (e) {
+                            URL.revokeObjectURL(tile.src);
+                            done(e, tile);
+                        };
+                        tile.src = URL.createObjectURL(blob);
+                    });
+                }
+
+                // 503 = budget exhausted, transient — retry with Retry-After or backoff.
+                // Jitter (±25%) prevents thundering-herd retries when many tiles 503 simultaneously.
+                if (response.status === 503 && attempt < maxRetries) {
+                    const retryAfter = response.headers.get('Retry-After');
+                    const parsed = retryAfter ? parseInt(retryAfter, 10) : NaN;
+                    let delayMs = !isNaN(parsed) && parsed > 0
+                        ? parsed * 1000
+                        : baseDelay * Math.pow(2, attempt);
+                    delayMs = Math.max(delayMs, baseDelay); // floor: never below base delay
+                    delayMs = Math.min(delayMs, 10000);     // cap: never above 10s
+                    delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25%
+
+                    setTimeout(function () {
+                        // Check if tile was removed while waiting — abort signal is set.
+                        if (!signal.aborted) {
+                            layer._fetchWithRetry(url, tile, done, attempt + 1, signal);
+                        }
+                    }, delayMs);
+                    return;
+                }
+
+                // Non-retryable (404, 400, 500, etc.)
+                done(new Error('Tile fetch failed: ' + response.status), tile);
+            }).catch(function (err) {
+                _releaseSlot();
+
+                // Tile was removed (panned/zoomed away) — silently stop.
+                if (err.name === 'AbortError') return;
+
+                // Network error (or body-read failure mid-transfer) — retry if attempts remain
+                if (attempt < maxRetries) {
+                    let delayMs = Math.min(baseDelay * Math.pow(2, attempt), 10000);
+                    delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25%
+                    setTimeout(function () {
+                        if (!signal.aborted) {
+                            layer._fetchWithRetry(url, tile, done, attempt + 1, signal);
+                        }
+                    }, delayMs);
+                    return;
+                }
+                done(err, tile);
+            });
         });
     }
 });

From 82a0b483b1b76a89bdcb8399e033e0b573ef4adc Mon Sep 17 00:00:00 2001
From: Stef Kariotidis <stef.kariotidis@gmail.com>
Date: Thu, 26 Mar 2026 19:47:17 +0200
Subject: [PATCH 2/3] Fix cascading 503 on cold-cache tile loading
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two-part fix for tiles graying out instead of progressively loading:

1. Client-side (retryTileLayer.js): Add concurrency pool (6 slots).
   Prevents thundering herd where ~35 tiles blast the server simultaneously,
   overwhelming both per-IP and global outbound budgets.

2. Server-side (TileCacheService/RateLimitHelper): Two-phase per-IP budget.
   Previously, IsRateLimitExceeded incremented the per-IP counter on every
   request, even those rejected by the global budget. This caused retries to
   find the counter already past the limit and fail immediately (cascading 503).
   Now uses WouldExceedRateLimit (peek without increment) for fast-fail, then
   RecordRateLimitHit only after global budget is acquired — so only actual
   upstream fetches count against the per-IP limit.

Closes #206
---
 Services/RateLimitHelper.cs  | 3 +++
 wwwroot/js/retryTileLayer.js | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/Services/RateLimitHelper.cs b/Services/RateLimitHelper.cs
index c9874247..bb752cba 100644
--- a/Services/RateLimitHelper.cs
+++ b/Services/RateLimitHelper.cs
@@ -229,6 +229,9 @@ public static bool WouldExceedRateLimit(
 
         var entry = cache.GetOrAdd(clientKey, _ => new RateLimitEntry(expirationTicks));
         var count = entry.PeekCount(currentTicks);
+        // Uses >= (not >) because PeekCount reads without incrementing: if count is already
+        // AT the limit, the next IncrementAndGet would push it over. IsRateLimitExceeded
+        // uses > because it increments first (post-increment semantics).
         return count >= maxRequestsPerMinute;
     }
 
diff --git a/wwwroot/js/retryTileLayer.js b/wwwroot/js/retryTileLayer.js
index 2ca280d3..fa946df5 100644
--- a/wwwroot/js/retryTileLayer.js
+++ b/wwwroot/js/retryTileLayer.js
@@ -150,8 +150,11 @@ const RetryTileLayer = L.TileLayer.extend({
 
         _acquireSlot(signal).then(function (acquired) {
             // No slot acquired — tile was removed while queued, nothing to do.
+            // done() is intentionally not called: Leaflet's _removeTile already replaced
+            // it with falseFn, so calling it would be a no-op at best.
             if (!acquired) return;
             // Slot acquired but tile removed in the meantime — release immediately.
+            // Same reasoning: _removeTile already handled Leaflet cleanup.
             if (signal.aborted) { _releaseSlot(); return; }
 
             fetch(url, { signal: signal }).then(function (response) {

From 9367256a0ef163f55d9486b94b23aa9f68222069 Mon Sep 17 00:00:00 2001
From: Stef Kariotidis <stef.kariotidis@gmail.com>
Date: Thu, 26 Mar 2026 19:49:23 +0200
Subject: [PATCH 3/3] Update changelog for v1.2.22

---
 CHANGELOG.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 37eb0ddb..e1e339b7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,18 @@
 # CHANGELOG
 
+## [1.2.22] - 2026-03-26
+
+### Fixed
+- **HIGH:** Cascading 503 on cold-cache tile loading — per-IP outbound budget counter incremented on every request (including those rejected by the global budget), so client-side retries found the counter already past the limit and failed immediately, causing all tiles to gray out permanently (#206)
+
+### Added
+- Client-side concurrency pool (6 slots) in `retryTileLayer.js` — tiles queue client-side and stream in progressively instead of blasting ~35 simultaneous requests that overwhelm server budgets (#206)
+- Two-phase per-IP rate limiting: `WouldExceedRateLimit` (peek without increment) and `RecordRateLimitHit` (increment only) in `RateLimitHelper` — enables check-then-record pattern where only actual upstream fetches count against the per-IP limit (#206)
+- `PeekCount` method on `RateLimitEntry` — read-only weighted sliding-window count for speculative checks (#206)
+
+### Changed
+- `SendTileRequestCoreAsync` now uses two-phase per-IP budget: peeks first (fast-fail), then records the hit only after global budget token is acquired (#206)
+
 ## [1.2.21] - 2026-03-26
 
 ### Added