From a4fe507c9ce2f4df2f8e6a4e8957a13dd9165911 Mon Sep 17 00:00:00 2001 From: w Date: Tue, 21 Apr 2026 19:24:50 -0400 Subject: [PATCH 1/2] perf(runtime-service): use short retry when no peers available The runtime service tries to download the finalized block runtime immediately at startup, before peer connections are established. This always fails with StorageQueryError { errors: [] } (no peers to query). Previously, this triggered the full 4s retry_after_failed cooldown, making warm start consistently ~5-7s. Now, "no peers" errors use a 200ms retry instead of 4s. Peers typically connect within a few hundred milliseconds, so the retry succeeds quickly. Other errors (peer misbehavior, decode failures) still use the full 4s cooldown. Benchmark on Polkadot: warm start drops from ~5.5s to ~600ms. --- lib/src/chain/async_tree.rs | 11 ++++++++--- light-base/src/runtime_service.rs | 31 ++++++++++++++++++++++++++----- light-base/src/sync_service.rs | 5 +++++ 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/lib/src/chain/async_tree.rs b/lib/src/chain/async_tree.rs index e45e6ef757..ccd406d096 100644 --- a/lib/src/chain/async_tree.rs +++ b/lib/src/chain/async_tree.rs @@ -505,8 +505,13 @@ where /// Panics if the [`AsyncOpId`] is invalid. /// pub fn async_op_failure(&mut self, async_op_id: AsyncOpId, now: &TNow) { - let new_timeout = now.clone() + self.retry_after_failed; + let retry_after = now.clone() + self.retry_after_failed; + self.async_op_failure_retry_at(async_op_id, &retry_after); + } + /// Similar to [`AsyncTree::async_op_failure`], but retries at the given time + /// instead of `now + retry_after_failed`. + pub fn async_op_failure_retry_at(&mut self, async_op_id: AsyncOpId, retry_after: &TNow) { // Update the blocks that were performing this operation. // The blocks are iterated from child to parent, so that we can check, for each node, // whether its parent has the same asynchronous operation id. 
@@ -523,11 +528,11 @@ where AsyncOpState::InProgress { async_op_id: id, timeout: Some(ref timeout), - } if id == async_op_id => Some(cmp::min(timeout.clone(), new_timeout.clone())), + } if id == async_op_id => Some(cmp::min(timeout.clone(), retry_after.clone())), AsyncOpState::InProgress { async_op_id: id, timeout: None, - } if id == async_op_id => Some(new_timeout.clone()), + } if id == async_op_id => Some(retry_after.clone()), _ => continue, }; diff --git a/light-base/src/runtime_service.rs b/light-base/src/runtime_service.rs index eb1a0b8b63..e20f870a9a 100644 --- a/light-base/src/runtime_service.rs +++ b/light-base/src/runtime_service.rs @@ -2810,12 +2810,26 @@ async fn run_background( ); } - match &mut background.tree { - Tree::FinalizedBlockRuntimeKnown { tree, .. } => { - tree.async_op_failure(async_op_id, &background.platform.now()); + if error.is_no_peers() { + // No peers available yet — use a short retry (200ms) instead of + // the full 4s cooldown. Peers typically connect within milliseconds. + let short_retry = background.platform.now() + Duration::from_millis(200); + match &mut background.tree { + Tree::FinalizedBlockRuntimeKnown { tree, .. } => { + tree.async_op_failure_retry_at(async_op_id, &short_retry); + } + Tree::FinalizedBlockRuntimeUnknown { tree, .. } => { + tree.async_op_failure_retry_at(async_op_id, &short_retry); + } } - Tree::FinalizedBlockRuntimeUnknown { tree, .. } => { - tree.async_op_failure(async_op_id, &background.platform.now()); + } else { + match &mut background.tree { + Tree::FinalizedBlockRuntimeKnown { tree, .. } => { + tree.async_op_failure(async_op_id, &background.platform.now()); + } + Tree::FinalizedBlockRuntimeUnknown { tree, .. 
} => { + tree.async_op_failure(async_op_id, &background.platform.now()); + } } } } @@ -2832,6 +2846,13 @@ enum RuntimeDownloadError { } impl RuntimeDownloadError { + fn is_no_peers(&self) -> bool { + match self { + RuntimeDownloadError::StorageQuery(err) => err.is_no_peers(), + RuntimeDownloadError::InvalidHeader(_) => false, + } + } + /// Returns `true` if this is caused by networking issues, as opposed to a consensus-related /// issue. fn is_network_problem(&self) -> bool { diff --git a/light-base/src/sync_service.rs b/light-base/src/sync_service.rs index 087dbcfb36..b37b754d67 100644 --- a/light-base/src/sync_service.rs +++ b/light-base/src/sync_service.rs @@ -1040,6 +1040,11 @@ pub struct StorageQueryError { } impl StorageQueryError { + /// Returns `true` if no peers were available to query. + pub fn is_no_peers(&self) -> bool { + self.errors.is_empty() + } + /// Returns `true` if this is caused by networking issues, as opposed to a consensus-related /// issue. pub fn is_network_problem(&self) -> bool { From 66866aedb0290fb4c1d4c706d032e39278e503bd Mon Sep 17 00:00:00 2001 From: w Date: Wed, 22 Apr 2026 14:48:30 -0400 Subject: [PATCH 2/2] fix: implement exponential backoff for no-peers retry Replace the flat 200ms retry with exponential backoff (200ms, 400ms, 800ms) before falling through to the normal 4s cooldown. Prevents a busy loop when peers are genuinely unreachable while still giving a fast path for the common warm-start case. Track no_peers_retry_count in the background task. Reset on success. After 3 fast retries, fall through to the normal cooldown. Remove misleading comment about peer connection timing. 
--- light-base/src/runtime_service.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/light-base/src/runtime_service.rs b/light-base/src/runtime_service.rs index e20f870a9a..edfbaeefe8 100644 --- a/light-base/src/runtime_service.rs +++ b/light-base/src/runtime_service.rs @@ -831,6 +831,7 @@ async fn run_background( blocks_stream: None, runtime_downloads: stream::FuturesUnordered::new(), progress_runtime_call_requests: stream::FuturesUnordered::new(), + no_peers_retry_count: 0, } }; @@ -2766,6 +2767,7 @@ async fn run_background( }; // Insert the runtime into the tree. + background.no_peers_retry_count = 0; match &mut background.tree { Tree::FinalizedBlockRuntimeKnown { tree, .. } => { tree.async_op_finished(async_op_id, runtime); @@ -2810,16 +2812,16 @@ async fn run_background( ); } - if error.is_no_peers() { - // No peers available yet — use a short retry (200ms) instead of - // the full 4s cooldown. Peers typically connect within milliseconds. - let short_retry = background.platform.now() + Duration::from_millis(200); + if error.is_no_peers() && background.no_peers_retry_count < 3 { + let delay_ms = 200u64 << background.no_peers_retry_count; + background.no_peers_retry_count += 1; + let retry_at = background.platform.now() + Duration::from_millis(delay_ms); match &mut background.tree { Tree::FinalizedBlockRuntimeKnown { tree, .. } => { - tree.async_op_failure_retry_at(async_op_id, &short_retry); + tree.async_op_failure_retry_at(async_op_id, &retry_at); } Tree::FinalizedBlockRuntimeUnknown { tree, .. } => { - tree.async_op_failure_retry_at(async_op_id, &short_retry); + tree.async_op_failure_retry_at(async_op_id, &retry_at); } } } else { @@ -2914,6 +2916,10 @@ struct Background { /// Stream of notifications coming from the sync service. `None` if not subscribed yet. blocks_stream: Option + Send>>>, + /// Number of consecutive runtime download failures due to no peers being available. 
+ /// Used for exponential backoff (200ms, 400ms, 800ms) before falling back to the normal cooldown. + no_peers_retry_count: u32, + /// List of runtimes currently being downloaded from the network. /// For each item, the download id, storage value of `:code`, storage value of `:heappages`, /// and Merkle value and closest ancestor of `:code`.