From 619f8151f0296502f11a7feb25dcac0358e31aaa Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Fri, 27 Mar 2026 02:30:51 +0000 Subject: [PATCH] feat(fetch): add conditional fetching with ETag and If-Modified-Since Add if_none_match and if_modified_since fields to FetchRequest for conditional HTTP requests. Add etag field to FetchResponse. Handle 304 Not Modified responses (no content, no conversion). Fix redirect logic to not treat 304 as a redirect. Closes #74 --- crates/fetchkit/src/fetchers/default.rs | 122 +++++++++++++++++++++++- crates/fetchkit/src/types.rs | 26 +++++ 2 files changed, 144 insertions(+), 4 deletions(-) diff --git a/crates/fetchkit/src/fetchers/default.rs b/crates/fetchkit/src/fetchers/default.rs index 9018d49..07509cb 100644 --- a/crates/fetchkit/src/fetchers/default.rs +++ b/crates/fetchkit/src/fetchers/default.rs @@ -81,7 +81,7 @@ impl Default for DefaultFetcher { } /// Build headers for HTTP requests -fn build_headers(options: &FetchOptions, accept: &str) -> HeaderMap { +fn build_headers(options: &FetchOptions, accept: &str, request: &FetchRequest) -> HeaderMap { let mut headers = HeaderMap::new(); let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT); headers.insert( @@ -93,6 +93,19 @@ fn build_headers(options: &FetchOptions, accept: &str) -> HeaderMap { ACCEPT, HeaderValue::from_str(accept).unwrap_or_else(|_| HeaderValue::from_static("*/*")), ); + + // Conditional request headers + if let Some(ref etag) = request.if_none_match { + if let Ok(v) = HeaderValue::from_str(etag) { + headers.insert(reqwest::header::IF_NONE_MATCH, v); + } + } + if let Some(ref date) = request.if_modified_since { + if let Ok(v) = HeaderValue::from_str(date) { + headers.insert(reqwest::header::IF_MODIFIED_SINCE, v); + } + } + headers } @@ -137,6 +150,7 @@ fn apply_bot_auth_if_enabled(headers: HeaderMap, _options: &FetchOptions, _url: struct ResponseMeta { content_type: Option, last_modified: Option, + etag: Option, content_length: Option, filename: Option, } @@ -151,6 +165,10 @@ fn extract_response_meta(headers: &HeaderMap, url: &str) -> ResponseMeta { .get("last-modified") .and_then(|v| v.to_str().ok()) .map(|s| s.to_string()), + etag: headers + .get("etag") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()), content_length: headers .get("content-length") .and_then(|v| v.to_str().ok()) @@ -192,7 +210,7 @@ impl Fetcher for DefaultFetcher { "*/*" }; - let headers = build_headers(options, accept); + let headers = build_headers(options, accept, request); let parsed_url = url::Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let headers = apply_bot_auth_if_enabled(headers, options, &parsed_url); @@ -209,6 +227,18 @@ impl Fetcher for DefaultFetcher { let final_url = response.url().to_string(); let meta = extract_response_meta(response.headers(), &final_url); + // Handle 304 Not Modified (conditional request response) + if status_code == 304 { + return Ok(FetchResponse { + url: final_url, + status_code, + content_type: meta.content_type, + last_modified: meta.last_modified, + etag: meta.etag, + ..Default::default() + }); + } + // Handle HEAD request if method == HttpMethod::Head { return Ok(FetchResponse { @@ -217,6 +247,7 @@ impl Fetcher for DefaultFetcher { content_type: meta.content_type, size: meta.content_length, last_modified: meta.last_modified, + etag: meta.etag, filename: meta.filename, method: Some("HEAD".to_string()), ..Default::default() @@ -232,6 +263,7 @@ impl Fetcher for DefaultFetcher { content_type: meta.content_type, size: meta.content_length, last_modified: meta.last_modified, + etag: meta.etag, filename: meta.filename, error: Some( "Binary content is not supported. Only textual content (HTML, text, JSON, etc.) can be fetched." @@ -309,6 +341,7 @@ impl Fetcher for DefaultFetcher { content_type: meta.content_type, size: Some(size), last_modified: meta.last_modified, + etag: meta.etag, filename: meta.filename, format: Some(format), content: Some(final_content), @@ -340,7 +373,7 @@ impl Fetcher for DefaultFetcher { let method = request.effective_method(); let max_body_size = options.max_body_size.unwrap_or(DEFAULT_MAX_BODY_SIZE); - let headers = build_headers(options, "*/*"); + let headers = build_headers(options, "*/*", request); let parsed_url = url::Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?; let headers = apply_bot_auth_if_enabled(headers, options, &parsed_url); @@ -365,6 +398,7 @@ impl Fetcher for DefaultFetcher { content_type: meta.content_type, size: meta.content_length, last_modified: meta.last_modified, + etag: meta.etag, filename: meta.filename, method: Some("HEAD".to_string()), ..Default::default() @@ -387,6 +421,7 @@ impl Fetcher for DefaultFetcher { content_type: meta.content_type, size: Some(size), last_modified: meta.last_modified, + etag: meta.etag, filename: meta.filename, truncated: if truncated { Some(true) } else { None }, saved_path: Some(save_result.path), @@ -472,7 +507,8 @@ fn redirect_target( response: &reqwest::Response, options: &FetchOptions, ) -> Result, FetchError> { - if !response.status().is_redirection() { + // 304 Not Modified is in the 3xx range but is not a redirect + if !response.status().is_redirection() || response.status().as_u16() == 304 { return Ok(None); } @@ -934,4 +970,82 @@ mod tests { assert_eq!(response.status_code, 200); } + + #[tokio::test] + async fn test_etag_returned_in_response() { + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/page")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string("content") + .insert_header("content-type", "text/plain") + .insert_header("etag", "\"abc123\""), + ) + .mount(&server) + .await; + + let fetcher = DefaultFetcher::new(); + let options = FetchOptions { + dns_policy: DnsPolicy::allow_all(), + ..Default::default() + }; + let request = FetchRequest::new(format!("{}/page", server.uri())); + let response = fetcher.fetch(&request, &options).await.unwrap(); + + assert_eq!(response.status_code, 200); + assert_eq!(response.etag.as_deref(), Some("\"abc123\"")); + } + + #[tokio::test] + async fn test_conditional_fetch_304_not_modified() { + use wiremock::matchers::header; + + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/page")) + .and(header("if-none-match", "\"abc123\"")) + .respond_with(ResponseTemplate::new(304).insert_header("etag", "\"abc123\"")) + .mount(&server) + .await; + + let fetcher = DefaultFetcher::new(); + let options = FetchOptions { + dns_policy: DnsPolicy::allow_all(), + ..Default::default() + }; + let request = + FetchRequest::new(format!("{}/page", server.uri())).if_none_match("\"abc123\""); + let response = fetcher.fetch(&request, &options).await.unwrap(); + + assert_eq!(response.status_code, 304); + assert_eq!(response.etag.as_deref(), Some("\"abc123\"")); + assert!(response.content.is_none()); + assert!(response.format.is_none()); + } + + #[tokio::test] + async fn test_conditional_fetch_if_modified_since() { + use wiremock::matchers::header_exists; + + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/page")) + .and(header_exists("if-modified-since")) + .respond_with(ResponseTemplate::new(304)) + .mount(&server) + .await; + + let fetcher = DefaultFetcher::new(); + let options = FetchOptions { + dns_policy: DnsPolicy::allow_all(), + ..Default::default() + }; + let request = FetchRequest::new(format!("{}/page", server.uri())) + .if_modified_since("Wed, 21 Oct 2015 07:28:00 GMT"); + let response = fetcher.fetch(&request, &options).await.unwrap(); + + assert_eq!(response.status_code, 304); + assert!(response.content.is_none()); + } } diff --git a/crates/fetchkit/src/types.rs b/crates/fetchkit/src/types.rs index ae16349..09b8bbf 100644 --- a/crates/fetchkit/src/types.rs +++ b/crates/fetchkit/src/types.rs @@ -94,6 +94,16 @@ pub struct FetchRequest { /// "full" (default) returns everything. #[serde(default, skip_serializing_if = "Option::is_none")] pub content_focus: Option, + + /// ETag value for conditional requests (If-None-Match header). + /// When set, the server may return 304 Not Modified if content unchanged. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub if_none_match: Option, + + /// Last-Modified value for conditional requests (If-Modified-Since header). + /// When set, the server may return 304 Not Modified if content unchanged. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub if_modified_since: Option, } impl FetchRequest { @@ -135,6 +145,18 @@ impl FetchRequest { self } + /// Set ETag for conditional request + pub fn if_none_match(mut self, etag: impl Into) -> Self { + self.if_none_match = Some(etag.into()); + self + } + + /// Set If-Modified-Since for conditional request + pub fn if_modified_since(mut self, date: impl Into) -> Self { + self.if_modified_since = Some(date.into()); + self + } + /// Get the effective method (default to GET) pub fn effective_method(&self) -> HttpMethod { self.method.unwrap_or_default() @@ -279,6 +301,10 @@ pub struct FetchResponse { #[serde(skip_serializing_if = "Option::is_none")] pub last_modified: Option, + /// ETag header value (for conditional requests) + #[serde(skip_serializing_if = "Option::is_none")] + pub etag: Option, + /// Extracted filename #[serde(skip_serializing_if = "Option::is_none")] pub filename: Option,