-
Notifications
You must be signed in to change notification settings - Fork 296
fix(iroh): add jitter to dns retry calls #3447
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d880602
6033580
ef7a8ee
ae2bf21
f710dd0
ae87df8
3052221
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,6 +28,9 @@ pub const N0_DNS_NODE_ORIGIN_PROD: &str = "dns.iroh.link"; | |
/// The n0 testing DNS node origin, for testing. | ||
pub const N0_DNS_NODE_ORIGIN_STAGING: &str = "staging-dns.iroh.link"; | ||
|
||
/// Percent of total delay to jitter. 20 means +/- 20% of delay. | ||
const MAX_JITTER_PERCENT: u64 = 20; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Last remaining comment, now I understand this value: Why did you choose +/- 20%? Isn't that rather big? I think I might have chosen +/- 10%. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was actually thinking 20% was relatively conservative. Looking at this article from aws, they found success with jittering a random value between 0 and the normal delay. I know the assumptions are probably different between iroh's use case and theirs, so I cam run some tests in a lab environment and see what the impact is to dns server load and request response time. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cool, thanks for the context. We do retries eagerly as well to deal with slow responses, so we don't quite want to make the jittering between [0..=delay]. I'm happy to give this 20% a try and see what happens. I suspect in any case that this isn't going to solve the entire issue. As I said in #3017 we need to collect all callsites and think about it with a global view. |
||
|
||
/// Potential errors related to dns. | ||
#[common_fields({ | ||
backtrace: Option<Backtrace>, | ||
|
@@ -503,7 +506,7 @@ async fn stagger_call< | |
// NOTE: we add the 0 delay here to have a uniform set of futures. This is more performant than | ||
// using alternatives that allow futures of different types. | ||
for delay in std::iter::once(&0u64).chain(delays_ms) { | ||
let delay = Duration::from_millis(*delay); | ||
let delay = add_jitter(delay); | ||
let fut = f(); | ||
let staggered_fut = async move { | ||
time::sleep(delay).await; | ||
|
@@ -523,6 +526,19 @@ async fn stagger_call< | |
Err(StaggeredError::new(errors)) | ||
} | ||
|
||
fn add_jitter(delay: &u64) -> Duration { | ||
// If delay is 0, return 0 immediately. | ||
if *delay == 0 { | ||
return Duration::ZERO; | ||
} | ||
|
||
// Calculate jitter as a random value in the range of +/- MAX_JITTER_PERCENT of the delay. | ||
let max_jitter = delay.saturating_mul(MAX_JITTER_PERCENT * 2) / 100; | ||
let jitter = rand::random::<u64>() % max_jitter; | ||
|
||
Duration::from_millis(delay.saturating_sub(max_jitter / 2).saturating_add(jitter)) | ||
} | ||
flub marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
#[cfg(test)] | ||
pub(crate) mod tests { | ||
use std::sync::atomic::AtomicUsize; | ||
|
@@ -548,4 +564,30 @@ pub(crate) mod tests { | |
let result = stagger_call(f, &delays).await.unwrap(); | ||
assert_eq!(result, 5) | ||
} | ||
|
||
#[test] | ||
#[traced_test] | ||
fn jitter_test_zero() { | ||
let jittered_delay = add_jitter(&0); | ||
assert_eq!(jittered_delay, Duration::from_secs(0)); | ||
} | ||
|
||
//Sanity checks that I did the math right | ||
#[test] | ||
#[traced_test] | ||
fn jitter_test_nonzero_lower_bound() { | ||
let delay: u64 = 300; | ||
for _ in 0..100 { | ||
assert!(add_jitter(&delay) >= Duration::from_millis(delay * 8 / 10)); | ||
} | ||
} | ||
|
||
#[test] | ||
#[traced_test] | ||
fn jitter_test_nonzero_upper_bound() { | ||
let delay: u64 = 300; | ||
for _ in 0..100 { | ||
assert!(add_jitter(&delay) < Duration::from_millis(delay * 12 / 10)); | ||
} | ||
} | ||
} |
Uh oh!
There was an error while loading. Please reload this page.