diff --git a/CHANGELOG.md b/CHANGELOG.md
index 37eb0dd..e1e339b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,18 @@
# CHANGELOG
+## [1.2.22] - 2026-03-26
+
+### Fixed
+- **HIGH:** Cascading 503 on cold-cache tile loading — per-IP outbound budget counter incremented on every request (including those rejected by the global budget), so client-side retries found the counter already past the limit and failed immediately, causing all tiles to gray out permanently (#206)
+
+### Added
+- Client-side concurrency pool (6 slots) in `retryTileLayer.js` — tiles queue client-side and stream in progressively instead of blasting ~35 simultaneous requests that overwhelm server budgets (#206)
+- Two-phase per-IP rate limiting: `WouldExceedRateLimit` (peek without increment) and `RecordRateLimitHit` (increment only) in `RateLimitHelper` — enables check-then-record pattern where only actual upstream fetches count against the per-IP limit (#206)
+- `PeekCount` method on `RateLimitEntry` — read-only weighted sliding-window count for speculative checks (#206)
+
+### Changed
+- `SendTileRequestCoreAsync` now uses two-phase per-IP budget: peeks first (fast-fail), then records the hit only after global budget token is acquired (#206)
+
## [1.2.21] - 2026-03-26
### Added
diff --git a/Services/RateLimitHelper.cs b/Services/RateLimitHelper.cs
index 49d3ba3..bb752cb 100644
--- a/Services/RateLimitHelper.cs
+++ b/Services/RateLimitHelper.cs
@@ -111,6 +111,37 @@ public int IncrementAndGet(long currentTicks, long newExpirationTicks)
return weighted;
}
+ /// <summary>
+ /// Returns the current weighted sliding-window count WITHOUT incrementing.
+ /// Used for speculative checks (e.g., per-IP outbound budget) where the
+ /// increment should be deferred until the request actually proceeds upstream.
+ /// </summary>
+ /// <param name="currentTicks">The current tick count.</param>
+ /// <returns>The weighted sliding-window request count (read-only).</returns>
+ public int PeekCount(long currentTicks)
+ {
+ var currentExpiration = Interlocked.Read(ref _expirationTicks);
+ // Don't rotate the window — this is a read-only peek.
+ // If the window has expired, the count will be stale (0 + prevWeight * prev),
+ // which is conservative: it underestimates if a rotation is pending,
+ // so the caller may allow a borderline request that IncrementAndGet would block.
+ // Acceptable for the two-phase pattern where the actual increment follows shortly.
+
+ var currentCount = Volatile.Read(ref _count);
+
+ var windowStart = Interlocked.Read(ref _windowStartTicks);
+ var windowEnd = Interlocked.Read(ref _expirationTicks);
+ var windowSize = windowEnd - windowStart;
+ if (windowSize <= 0) windowSize = WindowTicks;
+ var elapsed = currentTicks - windowStart;
+ if (elapsed < 0) elapsed = 0;
+ if (elapsed > windowSize) elapsed = windowSize;
+
+ var prevWeight = 1.0 - ((double)elapsed / windowSize);
+ var prev = Volatile.Read(ref _prevCount);
+ return (int)(prev * prevWeight) + currentCount;
+ }
+
///
/// Returns true if this entry's window has expired.
/// Uses a 2-window horizon: an entry is considered expired only after 2 full windows
@@ -178,6 +209,51 @@ public static bool IsRateLimitExceeded(
return count > maxRequestsPerMinute;
}
+ /// <summary>
+ /// Checks if the given client key would exceed the rate limit WITHOUT incrementing the counter.
+ /// Used for speculative fast-fail checks (e.g., per-IP outbound budget) where the actual
+ /// increment is deferred until the request succeeds. This prevents budget-exhausted requests
+ /// from inflating the per-IP counter, which would cause cascading 503 rejections on retries.
+ /// </summary>
+ /// <param name="cache">The concurrent dictionary tracking rate limit entries per key.</param>
+ /// <param name="clientKey">The client key (IP address, user ID, etc.).</param>
+ /// <param name="maxRequestsPerMinute">Maximum allowed requests per minute.</param>
+ /// <returns>True if the rate limit would be exceeded, false otherwise.</returns>
+ public static bool WouldExceedRateLimit(
+ ConcurrentDictionary<string, RateLimitEntry> cache,
+ string clientKey,
+ int maxRequestsPerMinute)
+ {
+ var currentTicks = DateTime.UtcNow.Ticks;
+ var expirationTicks = currentTicks + WindowTicks;
+
+ var entry = cache.GetOrAdd(clientKey, _ => new RateLimitEntry(expirationTicks));
+ var count = entry.PeekCount(currentTicks);
+ // Uses >= (not >) because PeekCount reads without incrementing: if count is already
+ // AT the limit, the next IncrementAndGet would push it over. IsRateLimitExceeded
+ // uses > because it increments first (post-increment semantics).
+ return count >= maxRequestsPerMinute;
+ }
+
+ /// <summary>
+ /// Records a successful request against the rate limit counter. Call this AFTER the request
+ /// has actually been fulfilled (e.g., after acquiring the global outbound budget token).
+ /// Pairs with <see cref="WouldExceedRateLimit"/> for two-phase rate limiting where the
+ /// check and increment are separated to avoid counting rejected requests.
+ /// </summary>
+ /// <param name="cache">The concurrent dictionary tracking rate limit entries per key.</param>
+ /// <param name="clientKey">The client key (IP address, user ID, etc.).</param>
+ public static void RecordRateLimitHit(
+ ConcurrentDictionary<string, RateLimitEntry> cache,
+ string clientKey)
+ {
+ var currentTicks = DateTime.UtcNow.Ticks;
+ var expirationTicks = currentTicks + WindowTicks;
+
+ var entry = cache.GetOrAdd(clientKey, _ => new RateLimitEntry(expirationTicks));
+ entry.IncrementAndGet(currentTicks, expirationTicks);
+ }
+
///
/// Removes expired entries from the rate limit cache to prevent memory growth.
/// Called periodically when cache exceeds size threshold.
diff --git a/Services/TileCacheService.cs b/Services/TileCacheService.cs
index f8d4523..0c42e15 100644
--- a/Services/TileCacheService.cs
+++ b/Services/TileCacheService.cs
@@ -461,32 +461,37 @@ public string GetCacheDirectory()
Action<HttpRequestMessage>? configureRequest = null, bool skipBudget = false,
string? clientIp = null, CancellationToken cancellationToken = default)
{
- // Per-IP outbound budget check: prevent a single client from monopolizing global tokens.
- // Checked before the global budget to fail fast without consuming a global token.
+ // Two-phase per-IP outbound budget: peek first (fast-fail without incrementing),
+ // then record the hit only after the global budget is acquired. This prevents
+ // budget-exhausted requests from inflating the per-IP counter, which previously
+ // caused cascading 503 rejections: on cold-cache loads with ~35 tiles, every
+ // request (including those rejected by the global budget) incremented the per-IP
+ // counter, so retries found the counter already past the limit and failed immediately.
// skipBudget is true on retries — the per-IP check was already passed on the first attempt.
// clientIp may be passed explicitly by callers (e.g., coalesced revalidation) where
// HttpContext is no longer available; falls back to HttpContext if not provided.
+ string? resolvedIpForBudget = null;
if (!skipBudget)
{
var perIpLimit = _applicationSettings.GetSettings().TileOutboundBudgetPerIpPerMinute;
if (perIpLimit > 0)
{
- var resolvedIp = clientIp;
- if (resolvedIp == null)
+ resolvedIpForBudget = clientIp;
+ if (resolvedIpForBudget == null)
{
var ctx = _httpContextAccessor.HttpContext;
if (ctx != null)
{
- resolvedIp = RateLimitHelper.GetClientIpAddress(ctx);
+ resolvedIpForBudget = RateLimitHelper.GetClientIpAddress(ctx);
}
}
- if (resolvedIp != null && RateLimitHelper.IsRateLimitExceeded(
- TilesController.OutboundBudgetCache, resolvedIp, perIpLimit))
+ if (resolvedIpForBudget != null && RateLimitHelper.WouldExceedRateLimit(
+ TilesController.OutboundBudgetCache, resolvedIpForBudget, perIpLimit))
{
_logger.LogWarning(
"Per-IP outbound budget exceeded for {ClientIp} — throttling upstream request for {TileUrl}",
- resolvedIp, TileProviderCatalog.RedactApiKey(tileUrl));
+ resolvedIpForBudget, TileProviderCatalog.RedactApiKey(tileUrl));
return null;
}
}
@@ -504,6 +509,13 @@ public string GetCacheDirectory()
return null;
}
+ // Both budgets passed — record the per-IP hit now that an upstream fetch will proceed.
+ // This ensures only actual upstream fetches count against the per-IP limit.
+ if (resolvedIpForBudget != null)
+ {
+ RateLimitHelper.RecordRateLimitHit(TilesController.OutboundBudgetCache, resolvedIpForBudget);
+ }
+
const int maxRedirects = 3;
var initialUri = new Uri(tileUrl);
var currentUri = initialUri;
diff --git a/wwwroot/js/retryTileLayer.js b/wwwroot/js/retryTileLayer.js
index 9ffb9bb..fa946df 100644
--- a/wwwroot/js/retryTileLayer.js
+++ b/wwwroot/js/retryTileLayer.js
@@ -3,6 +3,14 @@
* This enables HTTP status code inspection so we can retry on 503 (budget exhaustion)
* while treating 404 as permanent failure.
*
+ * Concurrency control:
+ * - Global pool limits concurrent tile fetches (default 6) to prevent overwhelming
+ * the server's outbound budget (10 burst, 2/sec) and per-IP budget (default 120/min).
+ * Without this, a cold-cache load at zoom 17 (~35 tiles) sends all requests
+ * simultaneously, exhausting both budgets and causing cascading 503 failures where
+ * retries also get rejected (the per-IP counter increments on every request, even
+ * rejected ones, so the count quickly snowballs past the limit).
+ *
* Retry strategy:
* - Only retries on HTTP 503 or network errors
* - Reads Retry-After header from server (falls back to exponential backoff)
@@ -14,6 +22,58 @@
* on our proxy. If upstream OSM is down, retrying would not help and would only pile
* up stale retry timers. Users will see gray tiles until upstream recovers.
*/
+
+// ---------- Global concurrency pool ----------
+// Limits concurrent tile fetches to prevent overwhelming the server's per-IP outbound
+// budget (default 120/min) and global token budget (10 burst, 2/sec). Tiles beyond the
+// limit queue client-side and proceed as slots free up, producing the progressive
+// "stream-in" effect on cold-cache loads instead of a wall of 503s.
+const _poolSize = 6;
+let _inFlight = 0;
+const _waiting = [];
+
+/**
+ * Acquires a concurrency slot. Resolves with true when a slot is available,
+ * or false if the signal was aborted while queued (tile panned/zoomed away).
+ * @param {AbortSignal} signal - Abort signal from the tile's AbortController.
+ * @returns {Promise<boolean>} True if a slot was acquired (caller must call _releaseSlot).
+ */
+const _acquireSlot = (signal) => {
+ if (signal.aborted) return Promise.resolve(false);
+ if (_inFlight < _poolSize) {
+ _inFlight++;
+ return Promise.resolve(true);
+ }
+ return new Promise((resolve) => {
+ const entry = { resolve };
+ _waiting.push(entry);
+ // When the tile is removed (panned/zoomed away), dequeue so the slot isn't
+ // wasted on a fetch that will be immediately aborted.
+ signal.addEventListener('abort', () => {
+ const i = _waiting.indexOf(entry);
+ if (i !== -1) {
+ _waiting.splice(i, 1);
+ resolve(false);
+ }
+ // If entry was already shifted by _releaseSlot, the slot was transferred
+ // to us — caller checks signal.aborted and releases.
+ }, { once: true });
+ });
+};
+
+/**
+ * Releases a concurrency slot, allowing the next queued fetch to proceed.
+ * Must be called exactly once for every _acquireSlot that resolved with true.
+ */
+const _releaseSlot = () => {
+ if (_waiting.length > 0) {
+ // Transfer slot directly to next waiter (don't decrement _inFlight).
+ _waiting.shift().resolve(true);
+ } else {
+ _inFlight--;
+ }
+};
+
const RetryTileLayer = L.TileLayer.extend({
options: {
maxRetries: 5,
@@ -75,7 +135,8 @@ const RetryTileLayer = L.TileLayer.extend({
/**
* Fetches a tile via fetch(), retries on 503 or network error with backoff.
- * Respects the AbortSignal so removed tiles stop retrying immediately.
+ * Acquires a concurrency slot before each fetch attempt to prevent overwhelming
+ * the server's budget. Respects AbortSignal so removed tiles stop immediately.
* @param {string} url - The tile URL.
* @param {HTMLImageElement} tile - The tile image element.
* @param {Function} done - Leaflet callback to signal completion.
@@ -87,66 +148,80 @@ const RetryTileLayer = L.TileLayer.extend({
const maxRetries = this.options.maxRetries;
const baseDelay = this.options.retryDelayMs;
- fetch(url, { signal: signal }).then(function (response) {
- if (response.ok) {
- return response.blob().then(function (blob) {
- // Guard: if the tile was removed (panned/zoomed away) while the blob
- // was being read, skip assigning the blob URL. Without this check,
- // Leaflet's _removeTile would have already replaced onload/onerror
- // with falseFn, so the revokeObjectURL callback would never fire,
- // leaking the blob URL.
- if (signal.aborted) return;
- tile.onload = function () {
- URL.revokeObjectURL(tile.src);
- done(null, tile);
- };
- tile.onerror = function (e) {
- URL.revokeObjectURL(tile.src);
- done(e, tile);
- };
- tile.src = URL.createObjectURL(blob);
- });
- }
+ _acquireSlot(signal).then(function (acquired) {
+ // No slot acquired — tile was removed while queued, nothing to do.
+ // done() is intentionally not called: Leaflet's _removeTile already replaced
+ // it with falseFn, so calling it would be a no-op at best.
+ if (!acquired) return;
+ // Slot acquired but tile removed in the meantime — release immediately.
+ // Same reasoning: _removeTile already handled Leaflet cleanup.
+ if (signal.aborted) { _releaseSlot(); return; }
- // 503 = budget exhausted, transient — retry with Retry-After or backoff.
- // Jitter (±25%) prevents thundering-herd retries when many tiles 503 simultaneously.
- if (response.status === 503 && attempt < maxRetries) {
- const retryAfter = response.headers.get('Retry-After');
- const parsed = retryAfter ? parseInt(retryAfter, 10) : NaN;
- let delayMs = !isNaN(parsed) && parsed > 0
- ? parsed * 1000
- : baseDelay * Math.pow(2, attempt);
- delayMs = Math.max(delayMs, baseDelay); // floor: never below base delay
- delayMs = Math.min(delayMs, 10000); // cap: never above 10s
- delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25%
-
- setTimeout(function () {
- // Check if tile was removed while waiting — abort signal is set.
- if (!signal.aborted) {
- layer._fetchWithRetry(url, tile, done, attempt + 1, signal);
- }
- }, delayMs);
- return;
- }
+ fetch(url, { signal: signal }).then(function (response) {
+ _releaseSlot();
- // Non-retryable (404, 400, 500, etc.)
- done(new Error('Tile fetch failed: ' + response.status), tile);
- }).catch(function (err) {
- // Tile was removed (panned/zoomed away) — silently stop.
- if (err.name === 'AbortError') return;
-
- // Network error (or body-read failure mid-transfer) — retry if attempts remain
- if (attempt < maxRetries) {
- let delayMs = Math.min(baseDelay * Math.pow(2, attempt), 10000);
- delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25%
- setTimeout(function () {
- if (!signal.aborted) {
- layer._fetchWithRetry(url, tile, done, attempt + 1, signal);
- }
- }, delayMs);
- return;
- }
- done(err, tile);
+ if (response.ok) {
+ return response.blob().then(function (blob) {
+ // Guard: if the tile was removed (panned/zoomed away) while the blob
+ // was being read, skip assigning the blob URL. Without this check,
+ // Leaflet's _removeTile would have already replaced onload/onerror
+ // with falseFn, so the revokeObjectURL callback would never fire,
+ // leaking the blob URL.
+ if (signal.aborted) return;
+ tile.onload = function () {
+ URL.revokeObjectURL(tile.src);
+ done(null, tile);
+ };
+ tile.onerror = function (e) {
+ URL.revokeObjectURL(tile.src);
+ done(e, tile);
+ };
+ tile.src = URL.createObjectURL(blob);
+ });
+ }
+
+ // 503 = budget exhausted, transient — retry with Retry-After or backoff.
+ // Jitter (±25%) prevents thundering-herd retries when many tiles 503 simultaneously.
+ if (response.status === 503 && attempt < maxRetries) {
+ const retryAfter = response.headers.get('Retry-After');
+ const parsed = retryAfter ? parseInt(retryAfter, 10) : NaN;
+ let delayMs = !isNaN(parsed) && parsed > 0
+ ? parsed * 1000
+ : baseDelay * Math.pow(2, attempt);
+ delayMs = Math.max(delayMs, baseDelay); // floor: never below base delay
+ delayMs = Math.min(delayMs, 10000); // cap: never above 10s
+ delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25%
+
+ setTimeout(function () {
+ // Check if tile was removed while waiting — abort signal is set.
+ if (!signal.aborted) {
+ layer._fetchWithRetry(url, tile, done, attempt + 1, signal);
+ }
+ }, delayMs);
+ return;
+ }
+
+ // Non-retryable (404, 400, 500, etc.)
+ done(new Error('Tile fetch failed: ' + response.status), tile);
+ }).catch(function (err) {
+ _releaseSlot();
+
+ // Tile was removed (panned/zoomed away) — silently stop.
+ if (err.name === 'AbortError') return;
+
+ // Network error (or body-read failure mid-transfer) — retry if attempts remain
+ if (attempt < maxRetries) {
+ let delayMs = Math.min(baseDelay * Math.pow(2, attempt), 10000);
+ delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25%
+ setTimeout(function () {
+ if (!signal.aborted) {
+ layer._fetchWithRetry(url, tile, done, attempt + 1, signal);
+ }
+ }, delayMs);
+ return;
+ }
+ done(err, tile);
+ });
});
}
});