diff --git a/CHANGELOG.md b/CHANGELOG.md index 37eb0dd..e1e339b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,18 @@ # CHANGELOG +## [1.2.22] - 2026-03-26 + +### Fixed +- **HIGH:** Cascading 503 on cold-cache tile loading — per-IP outbound budget counter incremented on every request (including those rejected by the global budget), so client-side retries found the counter already past the limit and failed immediately, causing all tiles to gray out permanently (#206) + +### Added +- Client-side concurrency pool (6 slots) in `retryTileLayer.js` — tiles queue client-side and stream in progressively instead of blasting ~35 simultaneous requests that overwhelm server budgets (#206) +- Two-phase per-IP rate limiting: `WouldExceedRateLimit` (peek without increment) and `RecordRateLimitHit` (increment only) in `RateLimitHelper` — enables check-then-record pattern where only actual upstream fetches count against the per-IP limit (#206) +- `PeekCount` method on `RateLimitEntry` — read-only weighted sliding-window count for speculative checks (#206) + +### Changed +- `SendTileRequestCoreAsync` now uses two-phase per-IP budget: peeks first (fast-fail), then records the hit only after global budget token is acquired (#206) + ## [1.2.21] - 2026-03-26 ### Added diff --git a/Services/RateLimitHelper.cs b/Services/RateLimitHelper.cs index 49d3ba3..bb752cb 100644 --- a/Services/RateLimitHelper.cs +++ b/Services/RateLimitHelper.cs @@ -111,6 +111,37 @@ public int IncrementAndGet(long currentTicks, long newExpirationTicks) return weighted; } + /// <summary> + /// Returns the current weighted sliding-window count WITHOUT incrementing. + /// Used for speculative checks (e.g., per-IP outbound budget) where the + /// increment should be deferred until the request actually proceeds upstream. + /// </summary> + /// <param name="currentTicks">The current tick count.</param> + /// <returns>The weighted sliding-window request count (read-only).</returns>
+ public int PeekCount(long currentTicks) + { + var currentExpiration = Interlocked.Read(ref _expirationTicks); + // Don't rotate the window — this is a read-only peek. + // If the window has expired, the count will be stale (0 + prevWeight * prev), + // which is conservative: it underestimates if a rotation is pending, + // so the caller may allow a borderline request that IncrementAndGet would block. + // Acceptable for the two-phase pattern where the actual increment follows shortly. + + var currentCount = Volatile.Read(ref _count); + + var windowStart = Interlocked.Read(ref _windowStartTicks); + var windowEnd = Interlocked.Read(ref _expirationTicks); + var windowSize = windowEnd - windowStart; + if (windowSize <= 0) windowSize = WindowTicks; + var elapsed = currentTicks - windowStart; + if (elapsed < 0) elapsed = 0; + if (elapsed > windowSize) elapsed = windowSize; + + var prevWeight = 1.0 - ((double)elapsed / windowSize); + var prev = Volatile.Read(ref _prevCount); + return (int)(prev * prevWeight) + currentCount; + } + /// <summary> /// Returns true if this entry's window has expired. /// Uses a 2-window horizon: an entry is considered expired only after 2 full windows @@ -178,6 +209,51 @@ public static bool IsRateLimitExceeded( return count > maxRequestsPerMinute; } + /// <summary> + /// Checks if the given client key would exceed the rate limit WITHOUT incrementing the counter. + /// Used for speculative fast-fail checks (e.g., per-IP outbound budget) where the actual + /// increment is deferred until the request succeeds. This prevents budget-exhausted requests + /// from inflating the per-IP counter, which would cause cascading 503 rejections on retries. + /// </summary> + /// <param name="cache">The concurrent dictionary tracking rate limit entries per key.</param> + /// <param name="clientKey">The client key (IP address, user ID, etc.).</param> + /// <param name="maxRequestsPerMinute">Maximum allowed requests per minute.</param> + /// <returns>True if the rate limit would be exceeded, false otherwise.</returns>
+ public static bool WouldExceedRateLimit( + ConcurrentDictionary<string, RateLimitEntry> cache, + string clientKey, + int maxRequestsPerMinute) + { + var currentTicks = DateTime.UtcNow.Ticks; + var expirationTicks = currentTicks + WindowTicks; + + var entry = cache.GetOrAdd(clientKey, _ => new RateLimitEntry(expirationTicks)); + var count = entry.PeekCount(currentTicks); + // Uses >= (not >) because PeekCount reads without incrementing: if count is already + // AT the limit, the next IncrementAndGet would push it over. IsRateLimitExceeded + // uses > because it increments first (post-increment semantics). + return count >= maxRequestsPerMinute; + } + + /// <summary> + /// Records a successful request against the rate limit counter. Call this AFTER the request + /// has actually been fulfilled (e.g., after acquiring the global outbound budget token). + /// Pairs with <see cref="WouldExceedRateLimit"/> for two-phase rate limiting where the + /// check and increment are separated to avoid counting rejected requests. + /// </summary> + /// <param name="cache">The concurrent dictionary tracking rate limit entries per key.</param> + /// <param name="clientKey">The client key (IP address, user ID, etc.).</param> + public static void RecordRateLimitHit( + ConcurrentDictionary<string, RateLimitEntry> cache, + string clientKey) + { + var currentTicks = DateTime.UtcNow.Ticks; + var expirationTicks = currentTicks + WindowTicks; + + var entry = cache.GetOrAdd(clientKey, _ => new RateLimitEntry(expirationTicks)); + entry.IncrementAndGet(currentTicks, expirationTicks); + } + /// <summary> /// Removes expired entries from the rate limit cache to prevent memory growth. /// Called periodically when cache exceeds size threshold. diff --git a/Services/TileCacheService.cs b/Services/TileCacheService.cs index f8d4523..0c42e15 100644 --- a/Services/TileCacheService.cs +++ b/Services/TileCacheService.cs @@ -461,32 +461,37 @@ public string GetCacheDirectory() Action<HttpRequestMessage>? configureRequest = null, bool skipBudget = false, string?
clientIp = null, CancellationToken cancellationToken = default) { - // Per-IP outbound budget check: prevent a single client from monopolizing global tokens. - // Checked before the global budget to fail fast without consuming a global token. + // Two-phase per-IP outbound budget: peek first (fast-fail without incrementing), + // then record the hit only after the global budget is acquired. This prevents + // budget-exhausted requests from inflating the per-IP counter, which previously + // caused cascading 503 rejections: on cold-cache loads with ~35 tiles, every + // request (including those rejected by the global budget) incremented the per-IP + // counter, so retries found the counter already past the limit and failed immediately. // skipBudget is true on retries — the per-IP check was already passed on the first attempt. // clientIp may be passed explicitly by callers (e.g., coalesced revalidation) where // HttpContext is no longer available; falls back to HttpContext if not provided. + string? 
resolvedIpForBudget = null; if (!skipBudget) { var perIpLimit = _applicationSettings.GetSettings().TileOutboundBudgetPerIpPerMinute; if (perIpLimit > 0) { - var resolvedIp = clientIp; - if (resolvedIp == null) + resolvedIpForBudget = clientIp; + if (resolvedIpForBudget == null) { var ctx = _httpContextAccessor.HttpContext; if (ctx != null) { - resolvedIp = RateLimitHelper.GetClientIpAddress(ctx); + resolvedIpForBudget = RateLimitHelper.GetClientIpAddress(ctx); } } - if (resolvedIp != null && RateLimitHelper.IsRateLimitExceeded( - TilesController.OutboundBudgetCache, resolvedIp, perIpLimit)) + if (resolvedIpForBudget != null && RateLimitHelper.WouldExceedRateLimit( + TilesController.OutboundBudgetCache, resolvedIpForBudget, perIpLimit)) { _logger.LogWarning( "Per-IP outbound budget exceeded for {ClientIp} — throttling upstream request for {TileUrl}", - resolvedIp, TileProviderCatalog.RedactApiKey(tileUrl)); + resolvedIpForBudget, TileProviderCatalog.RedactApiKey(tileUrl)); return null; } } @@ -504,6 +509,13 @@ public string GetCacheDirectory() return null; } + // Both budgets passed — record the per-IP hit now that an upstream fetch will proceed. + // This ensures only actual upstream fetches count against the per-IP limit. + if (resolvedIpForBudget != null) + { + RateLimitHelper.RecordRateLimitHit(TilesController.OutboundBudgetCache, resolvedIpForBudget); + } + const int maxRedirects = 3; var initialUri = new Uri(tileUrl); var currentUri = initialUri; diff --git a/wwwroot/js/retryTileLayer.js b/wwwroot/js/retryTileLayer.js index 9ffb9bb..fa946df 100644 --- a/wwwroot/js/retryTileLayer.js +++ b/wwwroot/js/retryTileLayer.js @@ -3,6 +3,14 @@ * This enables HTTP status code inspection so we can retry on 503 (budget exhaustion) * while treating 404 as permanent failure. * + * Concurrency control: + * - Global pool limits concurrent tile fetches (default 6) to prevent overwhelming + * the server's outbound budget (10 burst, 2/sec) and per-IP budget (default 120/min). 
+ * Without this, a cold-cache load at zoom 17 (~35 tiles) sends all requests + * simultaneously, exhausting both budgets and causing cascading 503 failures where + * retries also get rejected (the per-IP counter increments on every request, even + * rejected ones, so the count quickly snowballs past the limit). + * * Retry strategy: * - Only retries on HTTP 503 or network errors * - Reads Retry-After header from server (falls back to exponential backoff) @@ -14,6 +22,58 @@ * on our proxy. If upstream OSM is down, retrying would not help and would only pile * up stale retry timers. Users will see gray tiles until upstream recovers. */ + +// ---------- Global concurrency pool ---------- +// Limits concurrent tile fetches to prevent overwhelming the server's per-IP outbound +// budget (default 120/min) and global token budget (10 burst, 2/sec). Tiles beyond the +// limit queue client-side and proceed as slots free up, producing the progressive +// "stream-in" effect on cold-cache loads instead of a wall of 503s. +const _poolSize = 6; +let _inFlight = 0; +const _waiting = []; + +/** + * Acquires a concurrency slot. Resolves with true when a slot is available, + * or false if the signal was aborted while queued (tile panned/zoomed away). + * @param {AbortSignal} signal - Abort signal from the tile's AbortController. + * @returns {Promise<boolean>} True if a slot was acquired (caller must call _releaseSlot). + */ +const _acquireSlot = (signal) => { + if (signal.aborted) return Promise.resolve(false); + if (_inFlight < _poolSize) { + _inFlight++; + return Promise.resolve(true); + } + return new Promise((resolve) => { + const entry = { resolve }; + _waiting.push(entry); + // When the tile is removed (panned/zoomed away), dequeue so the slot isn't + // wasted on a fetch that will be immediately aborted.
+ signal.addEventListener('abort', () => { + const i = _waiting.indexOf(entry); + if (i !== -1) { + _waiting.splice(i, 1); + resolve(false); + } + // If entry was already shifted by _releaseSlot, the slot was transferred + // to us — caller checks signal.aborted and releases. + }, { once: true }); + }); +}; + +/** + * Releases a concurrency slot, allowing the next queued fetch to proceed. + * Must be called exactly once for every _acquireSlot that resolved with true. + */ +const _releaseSlot = () => { + if (_waiting.length > 0) { + // Transfer slot directly to next waiter (don't decrement _inFlight). + _waiting.shift().resolve(true); + } else { + _inFlight--; + } +}; + const RetryTileLayer = L.TileLayer.extend({ options: { maxRetries: 5, @@ -75,7 +135,8 @@ const RetryTileLayer = L.TileLayer.extend({ /** * Fetches a tile via fetch(), retries on 503 or network error with backoff. - * Respects the AbortSignal so removed tiles stop retrying immediately. + * Acquires a concurrency slot before each fetch attempt to prevent overwhelming + * the server's budget. Respects AbortSignal so removed tiles stop immediately. * @param {string} url - The tile URL. * @param {HTMLImageElement} tile - The tile image element. * @param {Function} done - Leaflet callback to signal completion. @@ -87,66 +148,80 @@ const RetryTileLayer = L.TileLayer.extend({ const maxRetries = this.options.maxRetries; const baseDelay = this.options.retryDelayMs; - fetch(url, { signal: signal }).then(function (response) { - if (response.ok) { - return response.blob().then(function (blob) { - // Guard: if the tile was removed (panned/zoomed away) while the blob - // was being read, skip assigning the blob URL. Without this check, - // Leaflet's _removeTile would have already replaced onload/onerror - // with falseFn, so the revokeObjectURL callback would never fire, - // leaking the blob URL. 
- if (signal.aborted) return; - tile.onload = function () { - URL.revokeObjectURL(tile.src); - done(null, tile); - }; - tile.onerror = function (e) { - URL.revokeObjectURL(tile.src); - done(e, tile); - }; - tile.src = URL.createObjectURL(blob); - }); - } + _acquireSlot(signal).then(function (acquired) { + // No slot acquired — tile was removed while queued, nothing to do. + // done() is intentionally not called: Leaflet's _removeTile already replaced + // it with falseFn, so calling it would be a no-op at best. + if (!acquired) return; + // Slot acquired but tile removed in the meantime — release immediately. + // Same reasoning: _removeTile already handled Leaflet cleanup. + if (signal.aborted) { _releaseSlot(); return; } - // 503 = budget exhausted, transient — retry with Retry-After or backoff. - // Jitter (±25%) prevents thundering-herd retries when many tiles 503 simultaneously. - if (response.status === 503 && attempt < maxRetries) { - const retryAfter = response.headers.get('Retry-After'); - const parsed = retryAfter ? parseInt(retryAfter, 10) : NaN; - let delayMs = !isNaN(parsed) && parsed > 0 - ? parsed * 1000 - : baseDelay * Math.pow(2, attempt); - delayMs = Math.max(delayMs, baseDelay); // floor: never below base delay - delayMs = Math.min(delayMs, 10000); // cap: never above 10s - delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25% - - setTimeout(function () { - // Check if tile was removed while waiting — abort signal is set. - if (!signal.aborted) { - layer._fetchWithRetry(url, tile, done, attempt + 1, signal); - } - }, delayMs); - return; - } + fetch(url, { signal: signal }).then(function (response) { + _releaseSlot(); - // Non-retryable (404, 400, 500, etc.) - done(new Error('Tile fetch failed: ' + response.status), tile); - }).catch(function (err) { - // Tile was removed (panned/zoomed away) — silently stop. 
- if (err.name === 'AbortError') return; - - // Network error (or body-read failure mid-transfer) — retry if attempts remain - if (attempt < maxRetries) { - let delayMs = Math.min(baseDelay * Math.pow(2, attempt), 10000); - delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25% - setTimeout(function () { - if (!signal.aborted) { - layer._fetchWithRetry(url, tile, done, attempt + 1, signal); - } - }, delayMs); - return; - } - done(err, tile); + if (response.ok) { + return response.blob().then(function (blob) { + // Guard: if the tile was removed (panned/zoomed away) while the blob + // was being read, skip assigning the blob URL. Without this check, + // Leaflet's _removeTile would have already replaced onload/onerror + // with falseFn, so the revokeObjectURL callback would never fire, + // leaking the blob URL. + if (signal.aborted) return; + tile.onload = function () { + URL.revokeObjectURL(tile.src); + done(null, tile); + }; + tile.onerror = function (e) { + URL.revokeObjectURL(tile.src); + done(e, tile); + }; + tile.src = URL.createObjectURL(blob); + }); + } + + // 503 = budget exhausted, transient — retry with Retry-After or backoff. + // Jitter (±25%) prevents thundering-herd retries when many tiles 503 simultaneously. + if (response.status === 503 && attempt < maxRetries) { + const retryAfter = response.headers.get('Retry-After'); + const parsed = retryAfter ? parseInt(retryAfter, 10) : NaN; + let delayMs = !isNaN(parsed) && parsed > 0 + ? parsed * 1000 + : baseDelay * Math.pow(2, attempt); + delayMs = Math.max(delayMs, baseDelay); // floor: never below base delay + delayMs = Math.min(delayMs, 10000); // cap: never above 10s + delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25% + + setTimeout(function () { + // Check if tile was removed while waiting — abort signal is set. + if (!signal.aborted) { + layer._fetchWithRetry(url, tile, done, attempt + 1, signal); + } + }, delayMs); + return; + } + + // Non-retryable (404, 400, 500, etc.) 
+ done(new Error('Tile fetch failed: ' + response.status), tile); + }).catch(function (err) { + _releaseSlot(); + + // Tile was removed (panned/zoomed away) — silently stop. + if (err.name === 'AbortError') return; + + // Network error (or body-read failure mid-transfer) — retry if attempts remain + if (attempt < maxRetries) { + let delayMs = Math.min(baseDelay * Math.pow(2, attempt), 10000); + delayMs *= (0.75 + Math.random() * 0.5); // jitter ±25% + setTimeout(function () { + if (!signal.aborted) { + layer._fetchWithRetry(url, tile, done, attempt + 1, signal); + } + }, delayMs); + return; + } + done(err, tile); + }); }); } });