From 2dec1236030b2248c31dbeb9652ca5c8fad0882d Mon Sep 17 00:00:00 2001 From: Julien Tant Date: Sat, 4 Apr 2026 14:50:01 -0700 Subject: [PATCH 1/2] Add retry with backoff for HTTP fetches and per-blog error reporting Transient HTTP failures (timeouts, 503s) now retry up to 3 times with exponential backoff via cenkalti/backoff/v5. ScanAllBlogs collects per-blog errors instead of failing fast, so successful scans are not lost when one blog is temporarily unreachable. Co-Authored-By: Claude Opus 4.6 (1M context) --- e2e/e2e_test.go | 48 ++++++++++- go.mod | 1 + go.sum | 2 + internal/cli/commands.go | 25 ++++-- internal/scanner/scanner.go | 58 +++++++++---- internal/scanner/scanner_test.go | 142 +++++++++++++++++++++++++++++++ 6 files changed, 252 insertions(+), 24 deletions(-) diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go index af4dae3..dbfffce 100644 --- a/e2e/e2e_test.go +++ b/e2e/e2e_test.go @@ -6,6 +6,7 @@ import ( "fmt" "net" "net/http" + "net/http/httptest" "os" "os/exec" "path/filepath" @@ -467,13 +468,14 @@ func TestSSRFProtection(t *testing.T) { out, err := cmd.CombinedOutput() require.NoError(t, err, "add should succeed: %s", string(out)) - // Scan WITHOUT --unsafe-client — the safe client should block loopback and fail. + // Scan WITHOUT --unsafe-client — the safe client should block loopback. + // The scan command exits 0 (partial success) but reports the per-blog error. cmd = exec.CommandContext(context.Background(), binaryPath, "--db", dbPath, "scan") cmd.Env = cleanEnv out, err = cmd.CombinedOutput() - require.Error(t, err, "scan should fail when SSRF protection blocks loopback") - require.Contains(t, string(out), "failed to fetch feed:", "expected our wrapped error message") + require.NoError(t, err, "scan should exit 0 even when blogs fail: %s", string(out)) + require.Contains(t, string(out), "failed to fetch feed:", "expected per-blog error message in output") } func filterEnv(env []string, key string) []string { @@ -487,6 +489,46 @@ func filterEnv(env []string, key string) []string { return filtered } +func TestScanPartialFailure(t *testing.T) { + baseURL := startTestServer(t) + + // Start a server that always returns 500. + badServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer badServer.Close() + + for _, mode := range []string{"flags", "env"} { + t.Run(mode, func(t *testing.T) { + c := &cliOpts{ + mode: mode, + dbPath: filepath.Join(t.TempDir(), "test.db"), + } + + // Add a working blog. + c.ok(t, []string{"add", "good-blog", baseURL + "/go/"}, map[string]string{ + "feed-url": baseURL + "/go/feed.atom", + }) + + // Add a blog pointing to the always-failing server. + c.ok(t, []string{"add", "bad-blog", badServer.URL}, map[string]string{ + "feed-url": badServer.URL + "/feed", + }) + + // Scan should exit 0 (partial success). + stdout := c.ok(t, []string{"scan"}, nil) + + // Verify the output contains success for the good blog and error for the bad blog. + assert.Contains(t, stdout, "good-blog") + assert.Contains(t, stdout, "Source: RSS") + assert.Contains(t, stdout, "bad-blog") + assert.Contains(t, stdout, "Error: failed to fetch feed: status 500") + assert.Contains(t, stdout, "2 blog(s): 1 succeeded, 1 failed") + assert.Contains(t, stdout, "Found 3 new article(s) total!") + }) + } +} + func extractFirstID(t *testing.T, output string) string { t.Helper() re := regexp.MustCompile(`\[(\d+)\]`) diff --git a/go.mod b/go.mod index 8a7e7a2..1c8b6be 100644 --- a/go.mod +++ b/go.mod @@ -6,6 +6,7 @@ require ( github.com/DataDog/go-secure-sdk v0.0.7 github.com/Masterminds/squirrel v1.5.4 github.com/PuerkitoBio/goquery v1.12.0 + github.com/cenkalti/backoff/v5 v5.0.3 github.com/fatih/color v1.19.0 github.com/golang-migrate/migrate/v4 v4.19.1 github.com/mmcdole/gofeed v1.3.0 diff --git a/go.sum b/go.sum index 83d1bb7..aa588b5 100644 --- a/go.sum +++ b/go.sum @@ -6,6 +6,8 @@ github.com/PuerkitoBio/goquery v1.12.0 h1:pAcL4g3WRXekcB9AU/y1mbKez2dbY2AajVhtkO github.com/PuerkitoBio/goquery v1.12.0/go.mod h1:802ej+gV2y7bbIhOIoPY5sT183ZW0YFofScC4q/hIpQ= github.com/andybalholm/cascadia v1.3.3 h1:AG2YHrzJIm4BZ19iwJ/DAua6Btl3IwJX+VI4kktS1LM= github.com/andybalholm/cascadia v1.3.3/go.mod h1:xNd9bqTn98Ln4DwST8/nG+H0yuB8Hmgu1YHNnWw0GeA= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= diff --git a/internal/cli/commands.go b/internal/cli/commands.go index 7a3f14b..46e9520 100644 --- a/internal/cli/commands.go +++ b/internal/cli/commands.go @@ -194,17 +194,26 @@ func newScanCommand() *cobra.Command { return err } totalNew := 0 + failed := 0 for _, result := range results { if !silent { printScanResult(result) } - totalNew += result.NewArticles + if result.Error != "" { + failed++ + } else { + totalNew += result.NewArticles + } } if !silent { fmt.Println() + succeeded := len(results) - failed + if failed > 0 { + cprintf([]color.Attribute{color.FgYellow}, "Scanned %d blog(s): %d succeeded, %d failed\n", len(results), succeeded, failed) + } if totalNew > 0 { cprintf([]color.Attribute{color.FgGreen, color.Bold}, "Found %d new article(s) total!\n", totalNew) - } else { + } else if failed == 0 { cprintln([]color.Attribute{color.FgYellow}, "No new articles found.") } } @@ -433,15 +442,19 @@ func newImportCommand() *cobra.Command { } func printScanResult(result scanner.ScanResult) { - statusColor := []color.Attribute{color.FgWhite} - if result.NewArticles > 0 { - statusColor = []color.Attribute{color.FgGreen} - } cprintf([]color.Attribute{color.FgWhite, color.Bold}, " %s\n", result.BlogName) + if result.Error != "" { + cprintf([]color.Attribute{color.FgRed}, " Error: %s\n", result.Error) + return + } if result.Source == "none" { cprintln([]color.Attribute{color.FgYellow}, " No feed or scraper configured") return } + statusColor := []color.Attribute{color.FgWhite} + if result.NewArticles > 0 { + statusColor = []color.Attribute{color.FgGreen} + } sourceLabel := "HTML" if result.Source == "rss" { sourceLabel = "RSS" diff --git a/internal/scanner/scanner.go b/internal/scanner/scanner.go index accdd99..ef13b63 100644 --- a/internal/scanner/scanner.go +++ b/internal/scanner/scanner.go @@ -6,6 +6,7 @@ import ( "os" "time" + "github.com/cenkalti/backoff/v5" "golang.org/x/sync/errgroup" "github.com/JulienTant/blogwatcher-cli/internal/model" @@ -14,11 +15,17 @@ import ( "github.com/JulienTant/blogwatcher-cli/internal/storage" ) +const ( + retryMaxTries = 3 + retryInitialBackoff = 500 * time.Millisecond +) + type ScanResult struct { BlogName string NewArticles int TotalFound int Source string + Error string } // Scanner orchestrates blog scanning using a Fetcher and Scraper. @@ -32,6 +39,17 @@ func NewScanner(fetcher *rss.Fetcher, scraper *scraper.Scraper) *Scanner { return &Scanner{fetcher: fetcher, scraper: scraper} } +// retryHTTP retries a transient HTTP operation with exponential backoff. +func retryHTTP[T any](ctx context.Context, op func() (T, error)) (T, error) { + b := &backoff.ExponentialBackOff{ + InitialInterval: retryInitialBackoff, + RandomizationFactor: backoff.DefaultRandomizationFactor, + Multiplier: backoff.DefaultMultiplier, + MaxInterval: backoff.DefaultMaxInterval, + } + return backoff.Retry(ctx, op, backoff.WithBackOff(b), backoff.WithMaxTries(retryMaxTries)) +} + func (s *Scanner) ScanBlog(ctx context.Context, db *storage.Database, blog model.Blog) (ScanResult, error) { var ( articles []model.Article @@ -40,7 +58,9 @@ func (s *Scanner) ScanBlog(ctx context.Context, db *storage.Database, blog model feedURL := blog.FeedURL if feedURL == "" { - discovered, err := s.fetcher.DiscoverFeedURL(ctx, blog.URL) + discovered, err := retryHTTP(ctx, func() (string, error) { + return s.fetcher.DiscoverFeedURL(ctx, blog.URL) + }) if err != nil { return ScanResult{BlogName: blog.Name}, err } @@ -50,14 +70,18 @@ func (s *Scanner) ScanBlog(ctx context.Context, db *storage.Database, blog model } if feedURL != "" { - feedArticles, err := s.fetcher.ParseFeed(ctx, feedURL) + feedArticles, err := retryHTTP(ctx, func() ([]rss.FeedArticle, error) { + return s.fetcher.ParseFeed(ctx, feedURL) + }) if err != nil { // If there's a scraper fallback, try it before giving up. if blog.ScrapeSelector == "" { return ScanResult{BlogName: blog.Name}, err } // Try scraper as fallback. - scrapedArticles, scrapeErr := s.scraper.ScrapeBlog(ctx, blog.URL, blog.ScrapeSelector) + scrapedArticles, scrapeErr := retryHTTP(ctx, func() ([]scraper.ScrapedArticle, error) { + return s.scraper.ScrapeBlog(ctx, blog.URL, blog.ScrapeSelector) + }) if scrapeErr != nil { return ScanResult{BlogName: blog.Name}, fmt.Errorf("RSS: %w; Scraper: %w", err, scrapeErr) } @@ -75,7 +99,9 @@ func (s *Scanner) ScanBlog(ctx context.Context, db *storage.Database, blog model } } } else if blog.ScrapeSelector != "" { - scrapedArticles, err := s.scraper.ScrapeBlog(ctx, blog.URL, blog.ScrapeSelector) + scrapedArticles, err := retryHTTP(ctx, func() ([]scraper.ScrapedArticle, error) { + return s.scraper.ScrapeBlog(ctx, blog.URL, blog.ScrapeSelector) + }) if err != nil { return ScanResult{BlogName: blog.Name}, err } @@ -142,9 +168,10 @@ func (s *Scanner) ScanAllBlogs(ctx context.Context, db *storage.Database, worker if workers <= 1 { results := make([]ScanResult, 0, len(blogs)) for _, blog := range blogs { - result, err := s.ScanBlog(ctx, db, blog) - if err != nil { - return nil, fmt.Errorf("scan %s: %w", blog.Name, err) + result, scanErr := s.ScanBlog(ctx, db, blog) + if scanErr != nil { + result.BlogName = blog.Name + result.Error = scanErr.Error() } results = append(results, result) } @@ -162,19 +189,20 @@ func (s *Scanner) ScanAllBlogs(ctx context.Context, db *storage.Database, worker for i := 0; i < workers; i++ { g.Go(func() error { - workerDB, err := storage.OpenDatabase(gctx, db.Path()) - if err != nil { - return err + workerDB, openErr := storage.OpenDatabase(gctx, db.Path()) + if openErr != nil { + return openErr } defer func() { - if err := workerDB.Close(); err != nil { - fmt.Fprintf(os.Stderr, "close: %v\n", err) + if closeErr := workerDB.Close(); closeErr != nil { + fmt.Fprintf(os.Stderr, "close: %v\n", closeErr) } }() for item := range jobs { - result, err := s.ScanBlog(gctx, workerDB, item.Blog) - if err != nil { - return fmt.Errorf("scan %s: %w", item.Blog.Name, err) + result, scanErr := s.ScanBlog(gctx, workerDB, item.Blog) + if scanErr != nil { + result.BlogName = item.Blog.Name + result.Error = scanErr.Error() } results[item.Index] = result } diff --git a/internal/scanner/scanner_test.go b/internal/scanner/scanner_test.go index 374a4cf..4ed78b1 100644 --- a/internal/scanner/scanner_test.go +++ b/internal/scanner/scanner_test.go @@ -6,6 +6,7 @@ import ( "net/http" "net/http/httptest" "path/filepath" + "sync/atomic" "testing" "time" @@ -224,6 +225,147 @@ func TestScanBlogRSSWithCategories(t *testing.T) { require.Nil(t, withoutCat.Categories) } +func TestScanBlogRetriesTransientError(t *testing.T) { + ctx := context.Background() + var requestCount atomic.Int32 + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + n := requestCount.Add(1) + if n == 1 { + w.WriteHeader(http.StatusInternalServerError) + return + } + if _, writeErr := w.Write([]byte(sampleFeed)); writeErr != nil { + http.Error(w, writeErr.Error(), http.StatusInternalServerError) + } + })) + defer server.Close() + + db := openTestDB(t) + defer func() { require.NoError(t, db.Close()) }() + + blog, err := db.AddBlog(ctx, model.Blog{Name: "RetryTest", URL: "https://example.com", FeedURL: server.URL}) + require.NoError(t, err) + + result, scanErr := newTestScanner().ScanBlog(ctx, db, blog) + require.NoError(t, scanErr) + require.Equal(t, 2, result.NewArticles) + require.Equal(t, "rss", result.Source) + require.GreaterOrEqual(t, requestCount.Load(), int32(2), "should have retried at least once") +} + +func TestScanBlogRetriesExhausted(t *testing.T) { + ctx := context.Background() + var requestCount atomic.Int32 + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + requestCount.Add(1) + w.WriteHeader(http.StatusInternalServerError) + })) + defer server.Close() + + db := openTestDB(t) + defer func() { require.NoError(t, db.Close()) }() + + blog, err := db.AddBlog(ctx, model.Blog{Name: "ExhaustedTest", URL: "https://example.com", FeedURL: server.URL}) + require.NoError(t, err) + + _, scanErr := newTestScanner().ScanBlog(ctx, db, blog) + require.Error(t, scanErr) + require.Contains(t, scanErr.Error(), "failed to fetch feed") + require.Equal(t, int32(3), requestCount.Load(), "should have tried 3 times (initial + 2 retries)") +} + +func TestScanAllBlogsPartialFailure(t *testing.T) { + ctx := context.Background() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/good/feed" { + if _, writeErr := w.Write([]byte(sampleFeed)); writeErr != nil { + http.Error(w, writeErr.Error(), http.StatusInternalServerError) + } + return + } + w.WriteHeader(http.StatusInternalServerError) + })) + defer server.Close() + + db := openTestDB(t) + defer func() { require.NoError(t, db.Close()) }() + + _, err := db.AddBlog(ctx, model.Blog{Name: "good-blog", URL: "https://good.example.com", FeedURL: server.URL + "/good/feed"}) + require.NoError(t, err) + _, err = db.AddBlog(ctx, model.Blog{Name: "bad-blog", URL: "https://bad.example.com", FeedURL: server.URL + "/bad/feed"}) + require.NoError(t, err) + + results, scanErr := newTestScanner().ScanAllBlogs(ctx, db, 2) + require.NoError(t, scanErr, "ScanAllBlogs should not return an error for blog-level failures") + require.Len(t, results, 2) + + var good, bad *ScanResult + for i := range results { + switch results[i].BlogName { + case "good-blog": + good = &results[i] + case "bad-blog": + bad = &results[i] + } + } + require.NotNil(t, good) + require.NotNil(t, bad) + + require.Empty(t, good.Error) + require.Equal(t, 2, good.NewArticles) + + require.NotEmpty(t, bad.Error) + require.Contains(t, bad.Error, "failed to fetch feed") +} + +func TestScanAllBlogsPartialFailureSequential(t *testing.T) { + ctx := context.Background() + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path == "/good/feed" { + if _, writeErr := w.Write([]byte(sampleFeed)); writeErr != nil { + http.Error(w, writeErr.Error(), http.StatusInternalServerError) + } + return + } + w.WriteHeader(http.StatusInternalServerError) + })) + defer server.Close() + + db := openTestDB(t) + defer func() { require.NoError(t, db.Close()) }() + + _, err := db.AddBlog(ctx, model.Blog{Name: "good-blog", URL: "https://good.example.com", FeedURL: server.URL + "/good/feed"}) + require.NoError(t, err) + _, err = db.AddBlog(ctx, model.Blog{Name: "bad-blog", URL: "https://bad.example.com", FeedURL: server.URL + "/bad/feed"}) + require.NoError(t, err) + + results, scanErr := newTestScanner().ScanAllBlogs(ctx, db, 1) + require.NoError(t, scanErr, "ScanAllBlogs should not return an error for blog-level failures") + require.Len(t, results, 2) + + var good, bad *ScanResult + for i := range results { + switch results[i].BlogName { + case "good-blog": + good = &results[i] + case "bad-blog": + bad = &results[i] + } + } + require.NotNil(t, good) + require.NotNil(t, bad) + + require.Empty(t, good.Error) + require.Equal(t, 2, good.NewArticles) + + require.NotEmpty(t, bad.Error) + require.Contains(t, bad.Error, "failed to fetch feed") +} + func ptrTime(value time.Time) *time.Time { return &value } From ed53cb9148b471ef359587205671437ff3472887 Mon Sep 17 00:00:00 2001 From: Julien Tant Date: Sat, 4 Apr 2026 15:14:00 -0700 Subject: [PATCH 2/2] Fix fatal error propagation, silent mode reporting, and 5xx retry in discovery - DiscoverFeedURL now returns a retryable error for 5xx responses instead of silently treating them as "no feed found" - ScanAllBlogs propagates context cancellation and DB errors as fatal instead of folding them into per-blog result.Error - Silent scan mode reports failures to stderr and exits non-zero on total failure Co-Authored-By: Claude Opus 4.6 (1M context) --- e2e/e2e_test.go | 59 ++++++++++++++++++++++++++++++++ internal/cli/commands.go | 7 ++++ internal/rss/rss.go | 5 ++- internal/rss/rss_test.go | 23 +++++++++++++ internal/scanner/scanner.go | 23 +++++++++++++ internal/scanner/scanner_test.go | 46 +++++++++++++++++++++++++ 6 files changed, 162 insertions(+), 1 deletion(-) diff --git a/e2e/e2e_test.go b/e2e/e2e_test.go index dbfffce..80c03dd 100644 --- a/e2e/e2e_test.go +++ b/e2e/e2e_test.go @@ -529,6 +529,65 @@ func TestScanPartialFailure(t *testing.T) { } } +func TestScanSilentPartialFailure(t *testing.T) { + baseURL := startTestServer(t) + + badServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer badServer.Close() + + for _, mode := range []string{"flags", "env"} { + t.Run(mode, func(t *testing.T) { + c := &cliOpts{ + mode: mode, + dbPath: filepath.Join(t.TempDir(), "test.db"), + } + + c.ok(t, []string{"add", "good-blog", baseURL + "/go/"}, map[string]string{ + "feed-url": baseURL + "/go/feed.atom", + }) + c.ok(t, []string{"add", "bad-blog", badServer.URL}, map[string]string{ + "feed-url": badServer.URL + "/feed", + }) + + // Partial failure in silent mode: should still print "scan done" and exit 0. + stdout, stderr, code := c.run(t, []string{"scan"}, map[string]string{"silent": ""}) + assert.Equal(t, 0, code, "partial failure should exit 0") + assert.Contains(t, stdout, "scan done") + assert.Contains(t, stderr, "1/2 blog(s) failed") + }) + } +} + +func TestScanSilentTotalFailure(t *testing.T) { + badServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer badServer.Close() + + for _, mode := range []string{"flags", "env"} { + t.Run(mode, func(t *testing.T) { + c := &cliOpts{ + mode: mode, + dbPath: filepath.Join(t.TempDir(), "test.db"), + } + + c.ok(t, []string{"add", "bad-blog-1", badServer.URL + "/a"}, map[string]string{ + "feed-url": badServer.URL + "/feed1", + }) + c.ok(t, []string{"add", "bad-blog-2", badServer.URL + "/b"}, map[string]string{ + "feed-url": badServer.URL + "/feed2", + }) + + // Total failure in silent mode: should exit non-zero. + _, stderr, code := c.run(t, []string{"scan"}, map[string]string{"silent": ""}) + assert.NotEqual(t, 0, code, "total failure should exit non-zero") + assert.Contains(t, stderr, "2/2 blog(s) failed") + }) + } +} + func extractFirstID(t *testing.T, output string) string { t.Helper() re := regexp.MustCompile(`\[(\d+)\]`) diff --git a/internal/cli/commands.go b/internal/cli/commands.go index 46e9520..c48016c 100644 --- a/internal/cli/commands.go +++ b/internal/cli/commands.go @@ -216,6 +216,13 @@ func newScanCommand() *cobra.Command { } else if failed == 0 { cprintln([]color.Attribute{color.FgYellow}, "No new articles found.") } + } else { + if failed > 0 { + fmt.Fprintf(os.Stderr, "scan: %d/%d blog(s) failed\n", failed, len(results)) + } + if failed == len(results) { + return fmt.Errorf("scan failed: all %d blog(s) failed", failed) + } } } diff --git a/internal/rss/rss.go b/internal/rss/rss.go index 925993f..df8e650 100644 --- a/internal/rss/rss.go +++ b/internal/rss/rss.go @@ -96,8 +96,11 @@ func (f *Fetcher) DiscoverFeedURL(ctx context.Context, blogURL string) (string, fmt.Fprintf(os.Stderr, "close: %v\n", err) } }() + if response.StatusCode >= 500 { + return "", FeedParseError{Message: fmt.Sprintf("discover feed: server error status %d", response.StatusCode)} + } if response.StatusCode < 200 || response.StatusCode >= 300 { - // Not-found / bad status is not an error — just means no feed at this URL. + // Client errors (4xx) are not transient — just means no feed at this URL. return "", nil } diff --git a/internal/rss/rss_test.go b/internal/rss/rss_test.go index dbdf453..f55e52e 100644 --- a/internal/rss/rss_test.go +++ b/internal/rss/rss_test.go @@ -119,6 +119,29 @@ func TestDiscoverFeedURL_XMLContentType(t *testing.T) { require.Equal(t, server.URL+"/tag/AI/feed/", feedURL, "should return URL directly for feed content-type") } +func TestDiscoverFeedURL_ServerError(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusServiceUnavailable) + })) + defer server.Close() + + _, err := newTestFetcher().DiscoverFeedURL(context.Background(), server.URL) + require.Error(t, err, "should return error for 5xx") + require.Contains(t, err.Error(), "server error status 503") + require.True(t, IsFeedError(err), "should be a FeedParseError so it's retryable") +} + +func TestDiscoverFeedURL_NotFound(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNotFound) + })) + defer server.Close() + + feedURL, err := newTestFetcher().DiscoverFeedURL(context.Background(), server.URL) + require.NoError(t, err, "404 should not be an error") + require.Empty(t, feedURL, "should return empty for 404") +} + func TestDiscoverFeedURL_RelSelf(t *testing.T) { mux := http.NewServeMux() mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) { diff --git a/internal/scanner/scanner.go b/internal/scanner/scanner.go index ef13b63..3d8b05e 100644 --- a/internal/scanner/scanner.go +++ b/internal/scanner/scanner.go @@ -2,6 +2,7 @@ package scanner import ( "context" + "errors" "fmt" "os" "time" @@ -39,6 +40,22 @@ func NewScanner(fetcher *rss.Fetcher, scraper *scraper.Scraper) *Scanner { return &Scanner{fetcher: fetcher, scraper: scraper} } +// isFatalScanError returns true for errors that should abort the entire scan +// (context cancellation, DB errors) rather than being recorded as per-blog failures. +func isFatalScanError(err error) bool { + if err == nil { + return false + } + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return true + } + // Feed/scrape errors are recoverable — everything else (DB errors) is fatal. + if rss.IsFeedError(err) || scraper.IsScrapeError(err) { + return false + } + return true +} + // retryHTTP retries a transient HTTP operation with exponential backoff. func retryHTTP[T any](ctx context.Context, op func() (T, error)) (T, error) { b := &backoff.ExponentialBackOff{ @@ -170,6 +187,9 @@ func (s *Scanner) ScanAllBlogs(ctx context.Context, db *storage.Database, worker for _, blog := range blogs { result, scanErr := s.ScanBlog(ctx, db, blog) if scanErr != nil { + if isFatalScanError(scanErr) { + return nil, fmt.Errorf("scan %s: %w", blog.Name, scanErr) + } result.BlogName = blog.Name result.Error = scanErr.Error() } @@ -201,6 +221,9 @@ func (s *Scanner) ScanAllBlogs(ctx context.Context, db *storage.Database, worker for item := range jobs { result, scanErr := s.ScanBlog(gctx, workerDB, item.Blog) if scanErr != nil { + if isFatalScanError(scanErr) { + return fmt.Errorf("scan %s: %w", item.Blog.Name, scanErr) + } result.BlogName = item.Blog.Name result.Error = scanErr.Error() } diff --git a/internal/scanner/scanner_test.go b/internal/scanner/scanner_test.go index 4ed78b1..9e2208a 100644 --- a/internal/scanner/scanner_test.go +++ b/internal/scanner/scanner_test.go @@ -366,6 +366,52 @@ func TestScanAllBlogsPartialFailureSequential(t *testing.T) { require.Contains(t, bad.Error, "failed to fetch feed") } +func TestScanAllBlogsPropagatesContextCancellation(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + cancel() // cancel context while handling request + w.WriteHeader(http.StatusOK) + if _, writeErr := w.Write([]byte(sampleFeed)); writeErr != nil { + http.Error(w, writeErr.Error(), http.StatusInternalServerError) + } + })) + defer server.Close() + + db := openTestDB(t) + defer func() { require.NoError(t, db.Close()) }() + + _, err := db.AddBlog(ctx, model.Blog{Name: "cancel-blog", URL: "https://cancel.example.com", FeedURL: server.URL}) + require.NoError(t, err) + + _, scanErr := newTestScanner().ScanAllBlogs(ctx, db, 1) + require.Error(t, scanErr, "should propagate context cancellation as a fatal error") + require.ErrorIs(t, scanErr, context.Canceled) +} + +func TestScanAllBlogsPropagatesContextCancellationConcurrent(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + cancel() + w.WriteHeader(http.StatusOK) + if _, writeErr := w.Write([]byte(sampleFeed)); writeErr != nil { + http.Error(w, writeErr.Error(), http.StatusInternalServerError) + } + })) + defer server.Close() + + db := openTestDB(t) + defer func() { require.NoError(t, db.Close()) }() + + _, err := db.AddBlog(ctx, model.Blog{Name: "cancel-blog", URL: "https://cancel.example.com", FeedURL: server.URL}) + require.NoError(t, err) + + _, scanErr := newTestScanner().ScanAllBlogs(ctx, db, 2) + require.Error(t, scanErr, "should propagate context cancellation as a fatal error") + require.ErrorIs(t, scanErr, context.Canceled) +} + func ptrTime(value time.Time) *time.Time { return &value }