From 4c9025229ef9e650fba4ea2115db6cdc6261149b Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Fri, 27 Mar 2026 02:37:40 +0000 Subject: [PATCH] feat(fetch): add content quality signals (word_count, redirect_chain, is_paywall) Add word_count, redirect_chain, and is_paywall fields to FetchResponse. Word count computed from final content. Redirect chain tracks all intermediate URLs during redirect following. Paywall detection uses heuristic matching against common paywall indicators in raw HTML. Closes #76 --- crates/fetchkit/src/fetchers/default.rs | 206 +++++++++++++++++++++++- crates/fetchkit/src/types.rs | 12 ++ 2 files changed, 214 insertions(+), 4 deletions(-) diff --git a/crates/fetchkit/src/fetchers/default.rs b/crates/fetchkit/src/fetchers/default.rs index 07509cb..137436a 100644 --- a/crates/fetchkit/src/fetchers/default.rs +++ b/crates/fetchkit/src/fetchers/default.rs @@ -220,7 +220,7 @@ impl Fetcher for DefaultFetcher { }; // THREAT[TM-SSRF-010]: Follow redirects manually so every hop is re-validated. - let response = + let (response, redirect_chain) = send_request_following_redirects(parsed_url, reqwest_method, headers, options).await?; let status_code = response.status().as_u16(); @@ -250,6 +250,7 @@ impl Fetcher for DefaultFetcher { etag: meta.etag, filename: meta.filename, method: Some("HEAD".to_string()), + redirect_chain, ..Default::default() }); } @@ -265,6 +266,7 @@ impl Fetcher for DefaultFetcher { last_modified: meta.last_modified, etag: meta.etag, filename: meta.filename, + redirect_chain, error: Some( "Binary content is not supported. Only textual content (HTML, text, JSON, etc.) can be fetched." 
.to_string(), @@ -282,6 +284,9 @@ impl Fetcher for DefaultFetcher { // Convert to string let content = String::from_utf8_lossy(&body).to_string(); + // Detect paywall before content is moved by conversion + let is_paywall = detect_paywall(&content); + // Determine format and convert if needed // THREAT[TM-DOS-006]: Conversion input is bounded by max_body_size let is_html_content = is_html(&meta.content_type, &content); @@ -335,6 +340,9 @@ impl Fetcher for DefaultFetcher { final_content.push_str(TRUNCATION_MESSAGE); } + // Compute quality signals + let word_count = count_words(&final_content); + Ok(FetchResponse { url: final_url, status_code, @@ -347,6 +355,9 @@ impl Fetcher for DefaultFetcher { content: Some(final_content), truncated: if truncated { Some(true) } else { None }, metadata: page_metadata, + word_count: Some(word_count), + redirect_chain, + is_paywall: if is_paywall { Some(true) } else { None }, ..Default::default() }) } @@ -383,7 +394,7 @@ impl Fetcher for DefaultFetcher { }; // THREAT[TM-SSRF-010]: Follow redirects manually with IP validation at each hop - let response = + let (response, redirect_chain) = send_request_following_redirects(parsed_url, reqwest_method, headers, options).await?; let status_code = response.status().as_u16(); @@ -401,6 +412,7 @@ impl Fetcher for DefaultFetcher { etag: meta.etag, filename: meta.filename, method: Some("HEAD".to_string()), + redirect_chain, ..Default::default() }); } @@ -426,19 +438,22 @@ impl Fetcher for DefaultFetcher { truncated: if truncated { Some(true) } else { None }, saved_path: Some(save_result.path), bytes_written: Some(save_result.bytes_written), + redirect_chain, // No inline content when saving to file ..Default::default() }) } } +/// Returns `(response, redirect_chain)` where redirect_chain lists intermediate URLs. 
async fn send_request_following_redirects( initial_url: Url, method: reqwest::Method, headers: HeaderMap, options: &FetchOptions, -) -> Result<reqwest::Response, FetchError> { +) -> Result<(reqwest::Response, Vec<String>), FetchError> { let mut current_url = initial_url; + let mut redirect_chain = Vec::new(); for redirect_count in 0..=MAX_REDIRECTS { let client = build_client_for_url(&current_url, headers.clone(), options)?; @@ -449,7 +464,7 @@ async fn send_request_following_redirects( .map_err(FetchError::from_reqwest)?; let Some(next_url) = redirect_target(&current_url, &response, options)? else { - return Ok(response); + return Ok((response, redirect_chain)); }; if redirect_count == MAX_REDIRECTS { @@ -463,6 +478,7 @@ async fn send_request_following_redirects( "Following redirect with IP validation" ); + redirect_chain.push(current_url.to_string()); current_url = next_url; } @@ -650,6 +666,36 @@ async fn read_body_with_timeout( } } +/// Count words in text content. +fn count_words(text: &str) -> u64 { + text.split_whitespace().count() as u64 +} + +/// Common paywall indicators in raw HTML content. +const PAYWALL_INDICATORS: &[&str] = &[ + "paywall", + "subscribe to read", + "subscribe to continue", + "subscription required", + "premium content", + "members only", + "sign in to read", + "log in to read", + "create a free account", + "already a subscriber", + "unlock this article", + "get unlimited access", + "start your free trial", +]; + +/// Heuristic paywall detection from raw HTML.
+fn detect_paywall(html: &str) -> bool { + let lower = html.to_lowercase(); + PAYWALL_INDICATORS + .iter() + .any(|indicator| lower.contains(indicator)) +} + #[cfg(test)] mod tests { use super::*; @@ -1048,4 +1094,156 @@ mod tests { assert_eq!(response.status_code, 304); assert!(response.content.is_none()); } + + #[test] + fn test_count_words() { + assert_eq!(count_words("hello world"), 2); + assert_eq!(count_words(""), 0); + assert_eq!(count_words(" one two three "), 3); + assert_eq!(count_words("word"), 1); + } + + #[test] + fn test_detect_paywall() { + assert!(detect_paywall("
Subscribe
")); + assert!(detect_paywall("

Subscribe to read the full article

")); + assert!(detect_paywall("Already a subscriber? Log in")); + assert!(detect_paywall("
Unlock this article
")); + assert!(!detect_paywall("

This is a normal article

")); + assert!(!detect_paywall("

Hello World

Free content

")); + } + + #[tokio::test] + async fn test_word_count_in_response() { + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/article")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string("Hello world this is a test") + .insert_header("content-type", "text/plain"), + ) + .mount(&server) + .await; + + let fetcher = DefaultFetcher::new(); + let options = FetchOptions { + dns_policy: DnsPolicy::allow_all(), + ..Default::default() + }; + let request = FetchRequest::new(format!("{}/article", server.uri())); + let response = fetcher.fetch(&request, &options).await.unwrap(); + + assert_eq!(response.word_count, Some(6)); + } + + #[tokio::test] + async fn test_redirect_chain_tracked() { + let destination = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/final")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string("arrived") + .insert_header("content-type", "text/plain"), + ) + .mount(&destination) + .await; + + let origin = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/start")) + .respond_with( + ResponseTemplate::new(302) + .insert_header("location", format!("{}/final", destination.uri())), + ) + .mount(&origin) + .await; + + let fetcher = DefaultFetcher::new(); + let options = FetchOptions { + dns_policy: DnsPolicy::allow_all(), + ..Default::default() + }; + let request = FetchRequest::new(format!("{}/start", origin.uri())); + let response = fetcher.fetch(&request, &options).await.unwrap(); + + assert_eq!(response.status_code, 200); + assert_eq!(response.redirect_chain.len(), 1); + assert!(response.redirect_chain[0].contains("/start")); + } + + #[tokio::test] + async fn test_no_redirect_chain_for_direct_response() { + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/direct")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string("direct") + .insert_header("content-type", "text/plain"), + ) + .mount(&server) + .await; + + let 
fetcher = DefaultFetcher::new(); + let options = FetchOptions { + dns_policy: DnsPolicy::allow_all(), + ..Default::default() + }; + let request = FetchRequest::new(format!("{}/direct", server.uri())); + let response = fetcher.fetch(&request, &options).await.unwrap(); + + assert!(response.redirect_chain.is_empty()); + } + + #[tokio::test] + async fn test_paywall_detection() { + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/paywalled")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string("
Subscribe to read the full article

Preview...

") + .insert_header("content-type", "text/html"), + ) + .mount(&server) + .await; + + let fetcher = DefaultFetcher::new(); + let options = FetchOptions { + enable_markdown: true, + dns_policy: DnsPolicy::allow_all(), + ..Default::default() + }; + let request = FetchRequest::new(format!("{}/paywalled", server.uri())).as_markdown(); + let response = fetcher.fetch(&request, &options).await.unwrap(); + + assert_eq!(response.is_paywall, Some(true)); + } + + #[tokio::test] + async fn test_no_paywall_for_normal_content() { + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/free")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string("

This is free content

") + .insert_header("content-type", "text/html"), + ) + .mount(&server) + .await; + + let fetcher = DefaultFetcher::new(); + let options = FetchOptions { + enable_markdown: true, + dns_policy: DnsPolicy::allow_all(), + ..Default::default() + }; + let request = FetchRequest::new(format!("{}/free", server.uri())).as_markdown(); + let response = fetcher.fetch(&request, &options).await.unwrap(); + + assert!(response.is_paywall.is_none()); + } } diff --git a/crates/fetchkit/src/types.rs b/crates/fetchkit/src/types.rs index 09b8bbf..e50aadc 100644 --- a/crates/fetchkit/src/types.rs +++ b/crates/fetchkit/src/types.rs @@ -340,6 +340,18 @@ pub struct FetchResponse { /// Structured page metadata extracted from HTML #[serde(skip_serializing_if = "Option::is_none")] pub metadata: Option, + + /// Word count of the final content + #[serde(skip_serializing_if = "Option::is_none")] + pub word_count: Option<u64>, + + /// Chain of URLs followed during redirects (empty if no redirects) + #[serde(skip_serializing_if = "Vec::is_empty", default)] + pub redirect_chain: Vec<String>, + + /// Heuristic paywall detection (soft signal, not guaranteed) + #[serde(skip_serializing_if = "Option::is_none")] + pub is_paywall: Option<bool>, } #[cfg(test)]