From ab4990b6fda97b0582b4e935dc152c84d88eabe2 Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Fri, 27 Mar 2026 21:39:16 +0000 Subject: [PATCH 1/2] test(fetchers): add live integration tests behind feature flag Adds `live-tests` feature flag with tests hitting real endpoints for all 13 fetchers. Tests assert structural properties (non-empty, expected substrings) rather than exact content. CI job detects changed fetcher files and runs only the relevant live tests (continue-on-error: true). - `cargo test --features live-tests` runs all live tests - `cargo test --features live-tests live_github_repo` runs one fetcher - Normal `cargo test` is unaffected (tests don't compile without flag) --- .github/workflows/ci.yml | 31 +++ crates/fetchkit/Cargo.toml | 1 + crates/fetchkit/tests/fetcher_live.rs | 320 ++++++++++++++++++++++++++ scripts/changed-fetcher-tests.sh | 43 ++++ 4 files changed, 395 insertions(+) create mode 100644 crates/fetchkit/tests/fetcher_live.rs create mode 100755 scripts/changed-fetcher-tests.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 2c7378b..b622941 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -112,6 +112,37 @@ jobs: python examples/python_fetchkit.py timeout-minutes: 2 + live-tests: + name: Live Fetcher Tests + if: github.event_name == 'pull_request' + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - name: Detect changed fetchers + id: detect + run: | + FILTERS=$(bash scripts/changed-fetcher-tests.sh "origin/${{ github.base_ref }}") + echo "filters=$FILTERS" >> "$GITHUB_OUTPUT" + if [ -z "$FILTERS" ]; then + echo "skip=true" >> "$GITHUB_OUTPUT" + else + echo "skip=false" >> "$GITHUB_OUTPUT" + fi + - name: Run live tests + if: steps.detect.outputs.skip == 'false' + run: | + for filter in ${{ steps.detect.outputs.filters }}; do + echo "::group::Running $filter" + cargo test 
--features live-tests -p fetchkit "$filter" -- --nocapture + echo "::endgroup::" + done + timeout-minutes: 5 + continue-on-error: true + # Gate job for branch protection. check: name: Check diff --git a/crates/fetchkit/Cargo.toml b/crates/fetchkit/Cargo.toml index 397d7f3..9231802 100644 --- a/crates/fetchkit/Cargo.toml +++ b/crates/fetchkit/Cargo.toml @@ -13,6 +13,7 @@ readme = "../../README.md" [features] default = [] bot-auth = ["dep:ed25519-dalek", "dep:base64", "dep:sha2", "dep:rand"] +live-tests = [] [dependencies] tokio = { workspace = true } diff --git a/crates/fetchkit/tests/fetcher_live.rs b/crates/fetchkit/tests/fetcher_live.rs new file mode 100644 index 0000000..efa7b5a --- /dev/null +++ b/crates/fetchkit/tests/fetcher_live.rs @@ -0,0 +1,320 @@ +//! Live integration tests for fetchers against real endpoints. +//! +//! Gated behind `--features live-tests` so they never run during normal `cargo test`. +//! Each test module maps 1:1 to a fetcher source file; CI runs only the modules +//! whose fetcher changed. +//! +//! Assertions are structural (field presence, non-empty content, expected substrings) +//! rather than exact-match, so tests tolerate minor upstream changes. + +#![cfg(feature = "live-tests")] + +use fetchkit::{FetchOptions, FetchRequest, FetcherRegistry}; + +/// Shared options for live tests — default everything, both conversions on. 
+fn live_options() -> FetchOptions { + FetchOptions { + enable_markdown: true, + enable_text: true, + ..Default::default() + } +} + +fn registry() -> FetcherRegistry { + FetcherRegistry::with_defaults() +} + +// --------------------------------------------------------------------------- +// github_repo +// --------------------------------------------------------------------------- +mod live_github_repo { + use super::*; + + #[tokio::test] + async fn fetches_repo_metadata() { + let req = FetchRequest::new("https://github.com/rust-lang/rust"); + let resp = registry().fetch(req, live_options()).await.unwrap(); + + assert_eq!(resp.status_code, 200); + let content = resp.content.expect("should have content"); + // Repo name should appear somewhere in the output + assert!( + content.contains("rust-lang/rust") || content.to_lowercase().contains("rust"), + "content should mention the repo" + ); + assert!(!content.is_empty()); + } +} + +// --------------------------------------------------------------------------- +// github_issue +// --------------------------------------------------------------------------- +mod live_github_issue { + use super::*; + + #[tokio::test] + async fn fetches_issue() { + // Well-known issue: rust-lang/rust#1 (the very first issue) + let req = FetchRequest::new("https://github.com/rust-lang/rust/issues/1"); + let resp = registry().fetch(req, live_options()).await.unwrap(); + + assert_eq!(resp.status_code, 200); + let content = resp.content.expect("should have content"); + assert!(!content.is_empty()); + } +} + +// --------------------------------------------------------------------------- +// github_code +// --------------------------------------------------------------------------- +mod live_github_code { + use super::*; + + #[tokio::test] + async fn fetches_source_file() { + let req = FetchRequest::new("https://github.com/rust-lang/rust/blob/master/README.md"); + let resp = registry().fetch(req, live_options()).await.unwrap(); + + 
assert_eq!(resp.status_code, 200); + let content = resp.content.expect("should have content"); + assert!( + content.to_lowercase().contains("rust"), + "README should mention Rust" + ); + } +} + +// --------------------------------------------------------------------------- +// twitter +// --------------------------------------------------------------------------- +mod live_twitter { + use super::*; + + #[tokio::test] + async fn fetches_tweet() { + // Rust lang announcement tweet — stable, public + let req = FetchRequest::new("https://x.com/rustlang/status/1821986021505405014"); + let result = registry().fetch(req, live_options()).await; + + // Twitter APIs are flaky; accept success or a graceful error + match result { + Ok(resp) => { + assert!(resp.status_code == 200 || resp.status_code == 403); + if resp.status_code == 200 { + assert!(resp.content.is_some()); + } + } + Err(_) => { + // Third-party API unavailable — acceptable + } + } + } +} + +// --------------------------------------------------------------------------- +// stackoverflow +// --------------------------------------------------------------------------- +mod live_stackoverflow { + use super::*; + + #[tokio::test] + async fn fetches_question() { + // "What is a NullPointerException" — one of the most famous SO questions + let req = FetchRequest::new( + "https://stackoverflow.com/questions/218384/what-is-a-nullpointerexception-and-how-do-i-fix-it", + ); + let resp = registry().fetch(req, live_options()).await.unwrap(); + + assert_eq!(resp.status_code, 200); + let content = resp.content.expect("should have content"); + assert!( + content.to_lowercase().contains("null"), + "content should mention null" + ); + } +} + +// --------------------------------------------------------------------------- +// package_registry (crates.io) +// --------------------------------------------------------------------------- +mod live_package_registry { + use super::*; + + #[tokio::test] + async fn fetches_crate() { + let 
req = FetchRequest::new("https://crates.io/crates/serde"); + let resp = registry().fetch(req, live_options()).await.unwrap(); + + assert_eq!(resp.status_code, 200); + let content = resp.content.expect("should have content"); + assert!( + content.to_lowercase().contains("serde"), + "content should mention serde" + ); + } + + #[tokio::test] + async fn fetches_pypi_package() { + let req = FetchRequest::new("https://pypi.org/project/requests/"); + let resp = registry().fetch(req, live_options()).await.unwrap(); + + assert_eq!(resp.status_code, 200); + let content = resp.content.expect("should have content"); + assert!( + content.to_lowercase().contains("requests"), + "content should mention requests" + ); + } + + #[tokio::test] + async fn fetches_npm_package() { + let req = FetchRequest::new("https://www.npmjs.com/package/express"); + let resp = registry().fetch(req, live_options()).await.unwrap(); + + assert_eq!(resp.status_code, 200); + let content = resp.content.expect("should have content"); + assert!( + content.to_lowercase().contains("express"), + "content should mention express" + ); + } +} + +// --------------------------------------------------------------------------- +// wikipedia +// --------------------------------------------------------------------------- +mod live_wikipedia { + use super::*; + + #[tokio::test] + async fn fetches_article() { + let req = FetchRequest::new("https://en.wikipedia.org/wiki/Rust_(programming_language)"); + let resp = registry().fetch(req, live_options()).await.unwrap(); + + assert_eq!(resp.status_code, 200); + let content = resp.content.expect("should have content"); + assert!( + content.to_lowercase().contains("rust"), + "article should mention Rust" + ); + } +} + +// --------------------------------------------------------------------------- +// youtube +// --------------------------------------------------------------------------- +mod live_youtube { + use super::*; + + #[tokio::test] + async fn fetches_video_metadata() { + 
// "Me at the zoo" — first YouTube video ever, very stable + let req = FetchRequest::new("https://www.youtube.com/watch?v=jNQXAC9IVRw"); + let resp = registry().fetch(req, live_options()).await.unwrap(); + + assert_eq!(resp.status_code, 200); + let content = resp.content.expect("should have content"); + assert!(!content.is_empty()); + } +} + +// --------------------------------------------------------------------------- +// arxiv +// --------------------------------------------------------------------------- +mod live_arxiv { + use super::*; + + #[tokio::test] + async fn fetches_paper() { + // "Attention Is All You Need" + let req = FetchRequest::new("https://arxiv.org/abs/1706.03762"); + let resp = registry().fetch(req, live_options()).await.unwrap(); + + assert_eq!(resp.status_code, 200); + let content = resp.content.expect("should have content"); + assert!( + content.to_lowercase().contains("attention"), + "paper should mention attention" + ); + } +} + +// --------------------------------------------------------------------------- +// hackernews +// --------------------------------------------------------------------------- +mod live_hackernews { + use super::*; + + #[tokio::test] + async fn fetches_story() { + // HN item 1 — the very first story + let req = FetchRequest::new("https://news.ycombinator.com/item?id=1"); + let resp = registry().fetch(req, live_options()).await.unwrap(); + + assert_eq!(resp.status_code, 200); + let content = resp.content.expect("should have content"); + assert!(!content.is_empty()); + } +} + +// --------------------------------------------------------------------------- +// rss_feed +// --------------------------------------------------------------------------- +mod live_rss_feed { + use super::*; + + #[tokio::test] + async fn fetches_rss() { + // Rust blog RSS feed + let req = FetchRequest::new("https://blog.rust-lang.org/feed.xml"); + let resp = registry().fetch(req, live_options()).await.unwrap(); + + assert_eq!(resp.status_code, 
200); + let content = resp.content.expect("should have content"); + assert!( + content.to_lowercase().contains("rust"), + "Rust blog feed should mention Rust" + ); + } +} + +// --------------------------------------------------------------------------- +// docs_site +// --------------------------------------------------------------------------- +mod live_docs_site { + use super::*; + + #[tokio::test] + async fn fetches_docs_rs() { + let req = FetchRequest::new("https://docs.rs/serde/latest/serde/"); + let resp = registry().fetch(req, live_options()).await.unwrap(); + + assert_eq!(resp.status_code, 200); + let content = resp.content.expect("should have content"); + assert!( + content.to_lowercase().contains("serde"), + "docs.rs page should mention serde" + ); + } +} + +// --------------------------------------------------------------------------- +// default (generic HTTP) +// --------------------------------------------------------------------------- +mod live_default { + use super::*; + + #[tokio::test] + async fn fetches_plain_html() { + let req = FetchRequest::new("https://example.com").as_markdown(); + let resp = registry().fetch(req, live_options()).await.unwrap(); + + assert_eq!(resp.status_code, 200); + let content = resp.content.expect("should have content"); + assert!( + content.contains("Example Domain"), + "example.com should contain 'Example Domain'" + ); + assert_eq!(resp.format, Some("markdown".to_string())); + } +} diff --git a/scripts/changed-fetcher-tests.sh b/scripts/changed-fetcher-tests.sh new file mode 100755 index 0000000..56de3ca --- /dev/null +++ b/scripts/changed-fetcher-tests.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash +# Determine which live_* test modules to run based on changed fetcher files. +# +# Usage: scripts/changed-fetcher-tests.sh +# Output: space-separated test name filters, or empty string if none changed. 
+# +# Maps: crates/fetchkit/src/fetchers/<name>.rs → live_<name> +# Also maps: tests/fetcher_live.rs → (all) + +set -euo pipefail + +BASE_REF="${1:-origin/main}" + +# Get changed files relative to base +CHANGED=$(git diff --name-only "$BASE_REF"...HEAD 2>/dev/null || git diff --name-only "$BASE_REF" HEAD) + +FILTERS=() +RUN_ALL=false + +while IFS= read -r file; do + # If the live test file itself changed, run everything + if [[ "$file" == "crates/fetchkit/tests/fetcher_live.rs" ]]; then + RUN_ALL=true + break + fi + + # Match fetcher source files + if [[ "$file" =~ ^crates/fetchkit/src/fetchers/([a-z_]+)\.rs$ ]]; then + name="${BASH_REMATCH[1]}" + # Skip mod.rs — user said no need to run all for shared changes + [[ "$name" == "mod" ]] && continue + FILTERS+=("live_${name}") + fi +done <<< "$CHANGED" + +if $RUN_ALL; then + echo "live_" +elif [[ ${#FILTERS[@]} -gt 0 ]]; then + # Deduplicate + printf '%s\n' "${FILTERS[@]}" | sort -u | tr '\n' ' ' +else + echo "" +fi From 3980fa506a37602b2bc7104242ada9551f60d979 Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Fri, 27 Mar 2026 21:49:38 +0000 Subject: [PATCH 2/2] fix(live-tests): graceful skip on network errors, lenient twitter assertions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live tests now skip (not fail) when network is unavailable — DNS resolution failures, timeouts, and blocked URLs are treated as infra issues. Twitter test accepts any status code since the API is notoriously unreliable. --- crates/fetchkit/tests/fetcher_live.rs | 145 ++++++++++++++++++-------- 1 file changed, 99 insertions(+), 46 deletions(-) diff --git a/crates/fetchkit/tests/fetcher_live.rs b/crates/fetchkit/tests/fetcher_live.rs index efa7b5a..dfd4626 100644 --- a/crates/fetchkit/tests/fetcher_live.rs +++ b/crates/fetchkit/tests/fetcher_live.rs @@ -6,10 +6,13 @@ //! //! Assertions are structural (field presence, non-empty content, expected substrings) //! 
rather than exact-match, so tests tolerate minor upstream changes. +//! +//! Network errors (DNS, timeout, blocked) are treated as skips, not failures — +//! live tests should only fail on unexpected response structure, not infra issues. #![cfg(feature = "live-tests")] -use fetchkit::{FetchOptions, FetchRequest, FetcherRegistry}; +use fetchkit::{FetchError, FetchOptions, FetchRequest, FetchResponse, FetcherRegistry}; /// Shared options for live tests — default everything, both conversions on. fn live_options() -> FetchOptions { @@ -24,6 +27,44 @@ fn registry() -> FetcherRegistry { FetcherRegistry::with_defaults() } +/// Network errors that indicate infra problems, not fetcher bugs. +fn is_network_error(err: &FetchError) -> bool { + matches!( + err, + FetchError::FirstByteTimeout + | FetchError::BlockedUrl + | FetchError::ConnectError(_) + | FetchError::ClientBuildError(_) + | FetchError::RequestError(_) + ) +} + +/// Fetch and return `Some(response)`, or `None` (skipping the test) if the error is network-related. +async fn fetch_or_skip(url: &str) -> Option<FetchResponse> { + let req = FetchRequest::new(url); + match registry().fetch(req, live_options()).await { + Ok(resp) => Some(resp), + Err(e) if is_network_error(&e) => { + eprintln!("SKIPPED (network): {url} — {e}"); + None + } + Err(e) => panic!("unexpected fetcher error for {url}: {e}"), + } +} + +/// Like fetch_or_skip but with as_markdown set. 
+async fn fetch_markdown_or_skip(url: &str) -> Option<FetchResponse> { + let req = FetchRequest::new(url).as_markdown(); + match registry().fetch(req, live_options()).await { + Ok(resp) => Some(resp), + Err(e) if is_network_error(&e) => { + eprintln!("SKIPPED (network): {url} — {e}"); + None + } + Err(e) => panic!("unexpected fetcher error for {url}: {e}"), + } +} + // --------------------------------------------------------------------------- // github_repo // --------------------------------------------------------------------------- @@ -32,12 +73,12 @@ mod live_github_repo { #[tokio::test] async fn fetches_repo_metadata() { - let req = FetchRequest::new("https://github.com/rust-lang/rust"); - let resp = registry().fetch(req, live_options()).await.unwrap(); + let Some(resp) = fetch_or_skip("https://github.com/rust-lang/rust").await else { + return; + }; assert_eq!(resp.status_code, 200); let content = resp.content.expect("should have content"); - // Repo name should appear somewhere in the output assert!( content.contains("rust-lang/rust") || content.to_lowercase().contains("rust"), "content should mention the repo" @@ -55,8 +96,9 @@ mod live_github_issue { #[tokio::test] async fn fetches_issue() { // Well-known issue: rust-lang/rust#1 (the very first issue) - let req = FetchRequest::new("https://github.com/rust-lang/rust/issues/1"); - let resp = registry().fetch(req, live_options()).await.unwrap(); + let Some(resp) = fetch_or_skip("https://github.com/rust-lang/rust/issues/1").await else { + return; + }; assert_eq!(resp.status_code, 200); let content = resp.content.expect("should have content"); @@ -72,8 +114,11 @@ mod live_github_code { #[tokio::test] async fn fetches_source_file() { - let req = FetchRequest::new("https://github.com/rust-lang/rust/blob/master/README.md"); - let resp = registry().fetch(req, live_options()).await.unwrap(); + let Some(resp) = + fetch_or_skip("https://github.com/rust-lang/rust/blob/master/README.md").await + else { + return; + }; 
assert_eq!(resp.status_code, 200); let content = resp.content.expect("should have content"); @@ -93,20 +138,15 @@ mod live_twitter { #[tokio::test] async fn fetches_tweet() { // Rust lang announcement tweet — stable, public - let req = FetchRequest::new("https://x.com/rustlang/status/1821986021505405014"); - let result = registry().fetch(req, live_options()).await; - - // Twitter APIs are flaky; accept success or a graceful error - match result { - Ok(resp) => { - assert!(resp.status_code == 200 || resp.status_code == 403); - if resp.status_code == 200 { - assert!(resp.content.is_some()); - } - } - Err(_) => { - // Third-party API unavailable — acceptable - } + let Some(resp) = fetch_or_skip("https://x.com/rustlang/status/1821986021505405014").await + else { + return; + }; + + // Twitter APIs are unreliable; accept any non-panic response as proof + // the fetcher handled it. Only assert structure on 200. + if resp.status_code == 200 { + assert!(resp.content.is_some()); } } } @@ -120,10 +160,11 @@ mod live_stackoverflow { #[tokio::test] async fn fetches_question() { // "What is a NullPointerException" — one of the most famous SO questions - let req = FetchRequest::new( + let Some(resp) = fetch_or_skip( "https://stackoverflow.com/questions/218384/what-is-a-nullpointerexception-and-how-do-i-fix-it", - ); - let resp = registry().fetch(req, live_options()).await.unwrap(); + ).await else { + return; + }; assert_eq!(resp.status_code, 200); let content = resp.content.expect("should have content"); @@ -135,15 +176,16 @@ mod live_stackoverflow { } // --------------------------------------------------------------------------- -// package_registry (crates.io) +// package_registry // --------------------------------------------------------------------------- mod live_package_registry { use super::*; #[tokio::test] async fn fetches_crate() { - let req = FetchRequest::new("https://crates.io/crates/serde"); - let resp = registry().fetch(req, live_options()).await.unwrap(); + let 
Some(resp) = fetch_or_skip("https://crates.io/crates/serde").await else { + return; + }; assert_eq!(resp.status_code, 200); let content = resp.content.expect("should have content"); @@ -155,8 +197,9 @@ mod live_package_registry { #[tokio::test] async fn fetches_pypi_package() { - let req = FetchRequest::new("https://pypi.org/project/requests/"); - let resp = registry().fetch(req, live_options()).await.unwrap(); + let Some(resp) = fetch_or_skip("https://pypi.org/project/requests/").await else { + return; + }; assert_eq!(resp.status_code, 200); let content = resp.content.expect("should have content"); @@ -168,8 +211,9 @@ mod live_package_registry { #[tokio::test] async fn fetches_npm_package() { - let req = FetchRequest::new("https://www.npmjs.com/package/express"); - let resp = registry().fetch(req, live_options()).await.unwrap(); + let Some(resp) = fetch_or_skip("https://www.npmjs.com/package/express").await else { + return; + }; assert_eq!(resp.status_code, 200); let content = resp.content.expect("should have content"); @@ -188,8 +232,11 @@ mod live_wikipedia { #[tokio::test] async fn fetches_article() { - let req = FetchRequest::new("https://en.wikipedia.org/wiki/Rust_(programming_language)"); - let resp = registry().fetch(req, live_options()).await.unwrap(); + let Some(resp) = + fetch_or_skip("https://en.wikipedia.org/wiki/Rust_(programming_language)").await + else { + return; + }; assert_eq!(resp.status_code, 200); let content = resp.content.expect("should have content"); @@ -209,8 +256,9 @@ mod live_youtube { #[tokio::test] async fn fetches_video_metadata() { // "Me at the zoo" — first YouTube video ever, very stable - let req = FetchRequest::new("https://www.youtube.com/watch?v=jNQXAC9IVRw"); - let resp = registry().fetch(req, live_options()).await.unwrap(); + let Some(resp) = fetch_or_skip("https://www.youtube.com/watch?v=jNQXAC9IVRw").await else { + return; + }; assert_eq!(resp.status_code, 200); let content = resp.content.expect("should have content"); @@ 
-227,8 +275,9 @@ mod live_arxiv { #[tokio::test] async fn fetches_paper() { // "Attention Is All You Need" - let req = FetchRequest::new("https://arxiv.org/abs/1706.03762"); - let resp = registry().fetch(req, live_options()).await.unwrap(); + let Some(resp) = fetch_or_skip("https://arxiv.org/abs/1706.03762").await else { + return; + }; assert_eq!(resp.status_code, 200); let content = resp.content.expect("should have content"); @@ -248,8 +297,9 @@ mod live_hackernews { #[tokio::test] async fn fetches_story() { // HN item 1 — the very first story - let req = FetchRequest::new("https://news.ycombinator.com/item?id=1"); - let resp = registry().fetch(req, live_options()).await.unwrap(); + let Some(resp) = fetch_or_skip("https://news.ycombinator.com/item?id=1").await else { + return; + }; assert_eq!(resp.status_code, 200); let content = resp.content.expect("should have content"); @@ -266,8 +316,9 @@ mod live_rss_feed { #[tokio::test] async fn fetches_rss() { // Rust blog RSS feed - let req = FetchRequest::new("https://blog.rust-lang.org/feed.xml"); - let resp = registry().fetch(req, live_options()).await.unwrap(); + let Some(resp) = fetch_or_skip("https://blog.rust-lang.org/feed.xml").await else { + return; + }; assert_eq!(resp.status_code, 200); let content = resp.content.expect("should have content"); @@ -286,8 +337,9 @@ mod live_docs_site { #[tokio::test] async fn fetches_docs_rs() { - let req = FetchRequest::new("https://docs.rs/serde/latest/serde/"); - let resp = registry().fetch(req, live_options()).await.unwrap(); + let Some(resp) = fetch_or_skip("https://docs.rs/serde/latest/serde/").await else { + return; + }; assert_eq!(resp.status_code, 200); let content = resp.content.expect("should have content"); @@ -306,8 +358,9 @@ mod live_default { #[tokio::test] async fn fetches_plain_html() { - let req = FetchRequest::new("https://example.com").as_markdown(); - let resp = registry().fetch(req, live_options()).await.unwrap(); + let Some(resp) = 
fetch_markdown_or_skip("https://example.com").await else { + return; + }; assert_eq!(resp.status_code, 200); let content = resp.content.expect("should have content");