stef-k · stef-k · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026 · Mar 26, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,18 @@
 # CHANGELOG
 
+## [1.2.22] - 2026-03-26
+
+### Fixed
+- **HIGH:** Cascading 503 on cold-cache tile loading — per-IP outbound budget counter incremented on every request (including those rejected by the global budget), so client-side retries found the counter already past the limit and failed immediately, causing all tiles to gray out permanently (#206)
+
+### Added
+- Client-side concurrency pool (6 slots) in `retryTileLayer.js` — tiles queue client-side and stream in progressively instead of blasting ~35 simultaneous requests that overwhelm server budgets (#206)
+- Two-phase per-IP rate limiting: `WouldExceedRateLimit` (peek without increment) and `RecordRateLimitHit` (increment only) in `RateLimitHelper` — enables check-then-record pattern where only actual upstream fetches count against the per-IP limit (#206)
+- `PeekCount` method on `RateLimitEntry` — read-only weighted sliding-window count for speculative checks (#206)
+
+### Changed
+- `SendTileRequestCoreAsync` now uses two-phase per-IP budget: peeks first (fast-fail), then records the hit only after global budget token is acquired (#206)
+
 ## [1.2.21] - 2026-03-26
 
 ### Added

diff --git a/Services/RateLimitHelper.cs b/Services/RateLimitHelper.cs
@@ -111,6 +111,37 @@ public int IncrementAndGet(long currentTicks, long newExpirationTicks)
             return weighted;
         }
 
+        /// <summary>
+        /// Returns the current weighted sliding-window count WITHOUT incrementing.
+        /// Used for speculative checks (e.g., per-IP outbound budget) where the
+        /// increment should be deferred until the request actually proceeds upstream.
+        /// </summary>
+        /// <param name="currentTicks">The current tick count.</param>
+        /// <returns>The weighted sliding-window request count (read-only).</returns>
+        public int PeekCount(long currentTicks)
+        {
+            var currentExpiration = Interlocked.Read(ref _expirationTicks);
+            // Don't rotate the window — this is a read-only peek.
+            // If the window has expired, the count will be stale (0 + prevWeight * prev),
+            // which is conservative: it underestimates if a rotation is pending,
+            // so the caller may allow a borderline request that IncrementAndGet would block.
+            // Acceptable for the two-phase pattern where the actual increment follows shortly.
+
+            var currentCount = Volatile.Read(ref _count);
+
+            var windowStart = Interlocked.Read(ref _windowStartTicks);
+            var windowEnd = Interlocked.Read(ref _expirationTicks);
+            var windowSize = windowEnd - windowStart;
+            if (windowSize <= 0) windowSize = WindowTicks;
+            var elapsed = currentTicks - windowStart;
+            if (elapsed < 0) elapsed = 0;
+            if (elapsed > windowSize) elapsed = windowSize;
+
+            var prevWeight = 1.0 - ((double)elapsed / windowSize);
+            var prev = Volatile.Read(ref _prevCount);
+            return (int)(prev * prevWeight) + currentCount;
+        }
+
         /// <summary>
         /// Returns true if this entry's window has expired.
         /// Uses a 2-window horizon: an entry is considered expired only after 2 full windows
@@ -178,6 +209,51 @@ public static bool IsRateLimitExceeded(
         return count > maxRequestsPerMinute;
     }
 
+    /// <summary>
+    /// Checks if the given client key would exceed the rate limit WITHOUT incrementing the counter.
+    /// Used for speculative fast-fail checks (e.g., per-IP outbound budget) where the actual
+    /// increment is deferred until the request succeeds. This prevents budget-exhausted requests
+    /// from inflating the per-IP counter, which would cause cascading 503 rejections on retries.
+    /// </summary>
+    /// <param name="cache">The concurrent dictionary tracking rate limit entries per key.</param>
+    /// <param name="clientKey">The client key (IP address, user ID, etc.).</param>
+    /// <param name="maxRequestsPerMinute">Maximum allowed requests per minute.</param>
+    /// <returns>True if the rate limit would be exceeded, false otherwise.</returns>
+    public static bool WouldExceedRateLimit(
+        ConcurrentDictionary<string, RateLimitEntry> cache,
+        string clientKey,
+        int maxRequestsPerMinute)
+    {
+        var currentTicks = DateTime.UtcNow.Ticks;
+        var expirationTicks = currentTicks + WindowTicks;
+
+        var entry = cache.GetOrAdd(clientKey, _ => new RateLimitEntry(expirationTicks));
+        var count = entry.PeekCount(currentTicks);
+        // Uses >= (not >) because PeekCount reads without incrementing: if count is already
+        // AT the limit, the next IncrementAndGet would push it over. IsRateLimitExceeded
+        // uses > because it increments first (post-increment semantics).
+        return count >= maxRequestsPerMinute;
+    }
+
+    /// <summary>
+    /// Records a successful request against the rate limit counter. Call this AFTER the request
+    /// has actually been fulfilled (e.g., after acquiring the global outbound budget token).
+    /// Pairs with <see cref="WouldExceedRateLimit"/> for two-phase rate limiting where the
+    /// check and increment are separated to avoid counting rejected requests.
+    /// </summary>
+    /// <param name="cache">The concurrent dictionary tracking rate limit entries per key.</param>
+    /// <param name="clientKey">The client key (IP address, user ID, etc.).</param>
+    public static void RecordRateLimitHit(
+        ConcurrentDictionary<string, RateLimitEntry> cache,
+        string clientKey)
+    {
+        var currentTicks = DateTime.UtcNow.Ticks;
+        var expirationTicks = currentTicks + WindowTicks;
+
+        var entry = cache.GetOrAdd(clientKey, _ => new RateLimitEntry(expirationTicks));
+        entry.IncrementAndGet(currentTicks, expirationTicks);
+    }
+
     /// <summary>
     /// Removes expired entries from the rate limit cache to prevent memory growth.
     /// Called periodically when cache exceeds size threshold.

diff --git a/Services/TileCacheService.cs b/Services/TileCacheService.cs
@@ -461,32 +461,37 @@ public string GetCacheDirectory()
         Action<HttpRequestMessage>? configureRequest = null, bool skipBudget = false,
         string? clientIp = null, CancellationToken cancellationToken = default)
     {
-        // Per-IP outbound budget check: prevent a single client from monopolizing global tokens.
-        // Checked before the global budget to fail fast without consuming a global token.
+        // Two-phase per-IP outbound budget: peek first (fast-fail without incrementing),
+        // then record the hit only after the global budget is acquired. This prevents
+        // budget-exhausted requests from inflating the per-IP counter, which previously
+        // caused cascading 503 rejections: on cold-cache loads with ~35 tiles, every
+        // request (including those rejected by the global budget) incremented the per-IP
+        // counter, so retries found the counter already past the limit and failed immediately.
         // skipBudget is true on retries — the per-IP check was already passed on the first attempt.
         // clientIp may be passed explicitly by callers (e.g., coalesced revalidation) where
         // HttpContext is no longer available; falls back to HttpContext if not provided.
+        string? resolvedIpForBudget = null;
         if (!skipBudget)
         {
             var perIpLimit = _applicationSettings.GetSettings().TileOutboundBudgetPerIpPerMinute;
             if (perIpLimit > 0)
             {
-                var resolvedIp = clientIp;
-                if (resolvedIp == null)
+                resolvedIpForBudget = clientIp;
+                if (resolvedIpForBudget == null)
                 {
                     var ctx = _httpContextAccessor.HttpContext;
                     if (ctx != null)
                     {
-                        resolvedIp = RateLimitHelper.GetClientIpAddress(ctx);
+                        resolvedIpForBudget = RateLimitHelper.GetClientIpAddress(ctx);
                     }
                 }
 
-                if (resolvedIp != null && RateLimitHelper.IsRateLimitExceeded(
-                        TilesController.OutboundBudgetCache, resolvedIp, perIpLimit))
+                if (resolvedIpForBudget != null && RateLimitHelper.WouldExceedRateLimit(
+                        TilesController.OutboundBudgetCache, resolvedIpForBudget, perIpLimit))
                 {
                     _logger.LogWarning(
                         "Per-IP outbound budget exceeded for {ClientIp} — throttling upstream request for {TileUrl}",
-                        resolvedIp, TileProviderCatalog.RedactApiKey(tileUrl));
+                        resolvedIpForBudget, TileProviderCatalog.RedactApiKey(tileUrl));
                     return null;
                 }
             }
@@ -504,6 +509,13 @@ public string GetCacheDirectory()
             return null;
         }
 
+        // Both budgets passed — record the per-IP hit now that an upstream fetch will proceed.
+        // This ensures only actual upstream fetches count against the per-IP limit.
+        if (resolvedIpForBudget != null)
+        {
+            RateLimitHelper.RecordRateLimitHit(TilesController.OutboundBudgetCache, resolvedIpForBudget);
+        }
+
         const int maxRedirects = 3;
         var initialUri = new Uri(tileUrl);
         var currentUri = initialUri;

diff --git a/wwwroot/js/retryTileLayer.js b/wwwroot/js/retryTileLayer.js
@@ -3,6 +3,14 @@
  * This enables HTTP status code inspection so we can retry on 503 (budget exhaustion)
  * while treating 404 as permanent failure.
  *
+ * Concurrency control:
+ * - Global pool limits concurrent tile fetches (default 6) to prevent overwhelming
+ *   the server's outbound budget (10 burst, 2/sec) and per-IP budget (default 120/min).
+ *   Without this, a cold-cache load at zoom 17 (~35 tiles) sends all requests
+ *   simultaneously, exhausting both budgets and causing cascading 503 failures where
+ *   retries also get rejected (the per-IP counter increments on every request, even
+ *   rejected ones, so the count quickly snowballs past the limit).
+ *
  * Retry strategy:
  * - Only retries on HTTP 503 or network errors
  * - Reads Retry-After header from server (falls back to exponential backoff)
@@ -14,6 +22,58 @@
  * on our proxy. If upstream OSM is down, retrying would not help and would only pile
  * up stale retry timers. Users will see gray tiles until upstream recovers.
  */
+
+// ---------- Global concurrency pool ----------
+// Limits concurrent tile fetches to prevent overwhelming the server's per-IP outbound
+// budget (default 120/min) and global token budget (10 burst, 2/sec). Tiles beyond the
+// limit queue client-side and proceed as slots free up, producing the progressive
+// "stream-in" effect on cold-cache loads instead of a wall of 503s.
+const _poolSize = 6;
+let _inFlight = 0;
+const _waiting = [];
+
+/**
+ * Acquires a concurrency slot. Resolves with true when a slot is available,
+ * or false if the signal was aborted while queued (tile panned/zoomed away).
+ * @param {AbortSignal} signal - Abort signal from the tile's AbortController.
+ * @returns {Promise<boolean>} True if a slot was acquired (caller must call _releaseSlot).
+ */
+const _acquireSlot = (signal) => {
+    if (signal.aborted) return Promise.resolve(false);
+    if (_inFlight < _poolSize) {
+        _inFlight++;
+        return Promise.resolve(true);
+    }
+    return new Promise((resolve) => {
+        const entry = { resolve };
+        _waiting.push(entry);
+        // When the tile is removed (panned/zoomed away), dequeue so the slot isn't
+        // wasted on a fetch that will be immediately aborted.
+        signal.addEventListener('abort', () => {
+            const i = _waiting.indexOf(entry);
+            if (i !== -1) {
+                _waiting.splice(i, 1);
+                resolve(false);
+            }
+            // If entry was already shifted by _releaseSlot, the slot was transferred
+            // to us — caller checks signal.aborted and releases.
+        }, { once: true });
+    });
+};
+
+/**
+ * Releases a concurrency slot, allowing the next queued fetch to proceed.
+ * Must be called exactly once for every _acquireSlot that resolved with true.
+ */
+const _releaseSlot = () => {
+    if (_waiting.length > 0) {
+        // Transfer slot directly to next waiter (don't decrement _inFlight).
+        _waiting.shift().resolve(true);
+    } else {
+        _inFlight--;
+    }
+};
+
 const RetryTileLayer = L.TileLayer.extend({
     options: {
         maxRetries: 5,
@@ -75,7 +135,8 @@ const RetryTileLayer = L.TileLayer.extend({
 
     /**
      * Fetches a tile via fetch(), retries on 503 or network error with backoff.
-     * Respects the AbortSignal so removed tiles stop retrying immediately.
+     * Acquires a concurrency slot before each fetch attempt to prevent overwhelming
+     * the server's budget. Respects AbortSignal so removed tiles stop immediately.
      * @param {string} url - The tile URL.
      * @param {HTMLImageElement} tile - The tile image element.
      * @param {Function} done - Leaflet callback to signal completion.
@@ -87,66 +148,80 @@ const RetryTileLayer = L.TileLayer.extend({
         const maxRetries = this.options.maxRetries;
         const baseDelay = this.options.retryDelayMs;
 
-        fetch(url, { signal: signal }).then(function (response) {
-            if (response.ok) {
-                return response.blob().then(function (blob) {
-                    // Guard: if the tile was removed (panned/zoomed away) while the blob
-                    // was being read, skip assigning the blob URL. Without this check,
-                    // Leaflet's _removeTile would have already replaced onload/onerror
-                    // with falseFn, so the revokeObjectURL callback would never fire,
-                    // leaking the blob URL.
-                    if (signal.aborted) return;
-                    tile.onload = function () {
-                        URL.revokeObjectURL(tile.src);
-                        done(null, tile);
-                    };
-                    tile.onerror = function (e) {
-                        URL.revokeObjectURL(tile.src);
-                        done(e, tile);
-                    };
-                    tile.src = URL.createObjectURL(blob);
-                });
-            }
+        _acquireSlot(signal).then(function (acquired) {
+            // No slot acquired — tile was removed while queued, nothing to do.
+            // done() is intentionally not called: Leaflet's _removeTile already replaced
+            // it with falseFn, so calling it would be a no-op at best.
+            if (!acquired) return;
+            // Slot acquired but tile removed in the meantime — release immediately.
+            // Same reasoning: _removeTile already handled Leaflet cleanup.
+            if (signal.aborted) { _releaseSlot(); return; }
 
-            // 503 = budget exhausted, transient — retry with Retry-After or backoff.
-            // Jitter (±25%) prevents thundering-herd retries when many tiles 503 simultaneously.
-            if (response.status === 503 && attempt < maxRetries) {
-                const retryAfter = response.headers.get('Retry-After');
-                const parsed = retryAfter ? parseInt(retryAfter, 10) : NaN;
-                let delayMs = !isNaN(parsed) && parsed > 0
-                    ? parsed * 1000
-                    : baseDelay * Math.pow(2, attempt);
-                delayMs = Math.max(delayMs, baseDelay); // floor: never below base delay
-                delayMs = Math.min(delayMs, 10000);     // cap: never above 10s
-                delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25%
-
-                setTimeout(function () {
-                    // Check if tile was removed while waiting — abort signal is set.
-                    if (!signal.aborted) {
-                        layer._fetchWithRetry(url, tile, done, attempt + 1, signal);
-                    }
-                }, delayMs);
-                return;
-            }
+            fetch(url, { signal: signal }).then(function (response) {
+                _releaseSlot();
 
-            // Non-retryable (404, 400, 500, etc.)
-            done(new Error('Tile fetch failed: ' + response.status), tile);
-        }).catch(function (err) {
-            // Tile was removed (panned/zoomed away) — silently stop.
-            if (err.name === 'AbortError') return;
-
-            // Network error (or body-read failure mid-transfer) — retry if attempts remain
-            if (attempt < maxRetries) {
-                let delayMs = Math.min(baseDelay * Math.pow(2, attempt), 10000);
-                delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25%
-                setTimeout(function () {
-                    if (!signal.aborted) {
-                        layer._fetchWithRetry(url, tile, done, attempt + 1, signal);
-                    }
-                }, delayMs);
-                return;
-            }
-            done(err, tile);
+                if (response.ok) {
+                    return response.blob().then(function (blob) {
+                        // Guard: if the tile was removed (panned/zoomed away) while the blob
+                        // was being read, skip assigning the blob URL. Without this check,
+                        // Leaflet's _removeTile would have already replaced onload/onerror
+                        // with falseFn, so the revokeObjectURL callback would never fire,
+                        // leaking the blob URL.
+                        if (signal.aborted) return;
+                        tile.onload = function () {
+                            URL.revokeObjectURL(tile.src);
+                            done(null, tile);
+                        };
+                        tile.onerror = function (e) {
+                            URL.revokeObjectURL(tile.src);
+                            done(e, tile);
+                        };
+                        tile.src = URL.createObjectURL(blob);
+                    });
+                }
+
+                // 503 = budget exhausted, transient — retry with Retry-After or backoff.
+                // Jitter (±25%) prevents thundering-herd retries when many tiles 503 simultaneously.
+                if (response.status === 503 && attempt < maxRetries) {
+                    const retryAfter = response.headers.get('Retry-After');
+                    const parsed = retryAfter ? parseInt(retryAfter, 10) : NaN;
+                    let delayMs = !isNaN(parsed) && parsed > 0
+                        ? parsed * 1000
+                        : baseDelay * Math.pow(2, attempt);
+                    delayMs = Math.max(delayMs, baseDelay); // floor: never below base delay
+                    delayMs = Math.min(delayMs, 10000);     // cap: never above 10s
+                    delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25%
+
+                    setTimeout(function () {
+                        // Check if tile was removed while waiting — abort signal is set.
+                        if (!signal.aborted) {
+                            layer._fetchWithRetry(url, tile, done, attempt + 1, signal);
+                        }
+                    }, delayMs);
+                    return;
+                }
+
+                // Non-retryable (404, 400, 500, etc.)
+                done(new Error('Tile fetch failed: ' + response.status), tile);
+            }).catch(function (err) {
+                _releaseSlot();
+
+                // Tile was removed (panned/zoomed away) — silently stop.
+                if (err.name === 'AbortError') return;
+
+                // Network error (or body-read failure mid-transfer) — retry if attempts remain
+                if (attempt < maxRetries) {
+                    let delayMs = Math.min(baseDelay * Math.pow(2, attempt), 10000);
+                    delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25%
+                    setTimeout(function () {
+                        if (!signal.aborted) {
+                            layer._fetchWithRetry(url, tile, done, attempt + 1, signal);
+                        }
+                    }, delayMs);
+                    return;
+                }
+                done(err, tile);
+            });
         });
     }
 });