From b113e074190a7c623f99423f9302e50dfc836c72 Mon Sep 17 00:00:00 2001
From: Elizabeth Worstell <worstell@squareup.com>
Date: Thu, 12 Mar 2026 23:18:56 -0700
Subject: [PATCH] fix: break corrupt mirror snapshot poison cycle
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a corrupt or empty mirror snapshot exists in S3, pods restore it,
the post-restore fetch fails, and then snapshot jobs immediately
re-upload the corrupt mirror — perpetuating the cycle even after manual
S3 cleanup.

Three changes break this cycle:

1. FetchLenient: post-restore and startup fetches now omit the
   lowSpeedLimit check (same as executeClone), since large deltas after
   snapshot restore trigger server-side pack computation that stalls at
   near-zero transfer for minutes, tripping the 1KB/s threshold.

2. ResetToEmpty + fallback to clone: when the post-restore fetch fails,
   the corrupt mirror is removed and the repo state is reset to Empty so
   the code falls through to a fresh git clone --mirror instead of
   serving and re-uploading stale data.

3. Skip snapshot scheduling on failed fetch: snapshot and repack jobs
   are only scheduled after a successful fetch, both in the startup
   path and the post-restore path.

Co-authored-by: Amp <amp@ampcode.com>
Amp-Thread-ID: https://ampcode.com/threads/T-019ce5d1-9385-7026-8e69-78903ce99c47
---
 internal/gitclone/manager.go | 48 +++++++++++++++++++++++-----
 internal/strategy/git/git.go | 61 +++++++++++++++++++++---------------
 2 files changed, 76 insertions(+), 33 deletions(-)

diff --git a/internal/gitclone/manager.go b/internal/gitclone/manager.go
index c06b679..06f0609 100644
--- a/internal/gitclone/manager.go
+++ b/internal/gitclone/manager.go
@@ -320,6 +320,17 @@ func WithReadLockReturn[T any](repo *Repository, fn func() (T, error)) (T, error
 	return fn()
 }
 
+// ResetToEmpty transitions the repository back to StateEmpty so that a
+// subsequent call to TryStartCloning can re-attempt the clone. Use this
+// when a restored snapshot turns out to be corrupt or empty and needs to
+// be replaced with a fresh clone.
+func (r *Repository) ResetToEmpty() {
+	r.mu.Lock()
+	defer r.mu.Unlock()
+	r.state = StateEmpty
+	r.lastFetch = time.Time{}
+}
+
 // TryStartCloning atomically transitions the repository from StateEmpty to
 // StateCloning. Returns true if this goroutine won the transition and should
 // proceed with the clone/restore; false if another goroutine already claimed it.
@@ -466,17 +477,17 @@ func configureMirror(ctx context.Context, repoPath string, packThreads int) erro
 	return nil
 }
 
-// cloneTimeout bounds `git clone --mirror` so a stuck clone cannot block
+// CloneTimeout bounds `git clone --mirror` so a stuck clone cannot block
 // the repo indefinitely. This is deliberately generous: large repos may
 // take 10-20 minutes for GitHub to compute the server-side pack.
-const cloneTimeout = 30 * time.Minute
+const CloneTimeout = 30 * time.Minute
 
 func (r *Repository) executeClone(ctx context.Context) error {
 	if err := os.MkdirAll(filepath.Dir(r.path), 0o750); err != nil {
 		return errors.Wrap(err, "create clone directory")
 	}
 
-	cloneCtx, cancel := context.WithTimeout(ctx, cloneTimeout)
+	cloneCtx, cancel := context.WithTimeout(ctx, CloneTimeout)
 	defer cancel()
 
 	config := DefaultGitTuningConfig()
@@ -525,6 +536,19 @@ func (r *Repository) Fetch(ctx context.Context) error {
 // for catch-up fetches after snapshot restore where the delta may be large and
 // the default fetchTimeout is too short.
 func (r *Repository) FetchWithTimeout(ctx context.Context, timeout time.Duration) error {
+	return r.fetchInternal(ctx, timeout, true)
+}
+
+// FetchLenient fetches from upstream with the given timeout but without the
+// low-speed transfer check. Use this for post-restore catch-up fetches where
+// the delta may be very large and GitHub's server-side pack computation can
+// stall at near-zero transfer rate for minutes — the same situation that
+// executeClone handles by omitting lowSpeedLimit.
+func (r *Repository) FetchLenient(ctx context.Context, timeout time.Duration) error {
+	return r.fetchInternal(ctx, timeout, false)
+}
+
+func (r *Repository) fetchInternal(ctx context.Context, timeout time.Duration, enforceSpeedLimit bool) error {
 	select {
 	case <-r.fetchSem:
 		defer func() {
@@ -547,12 +571,20 @@ func (r *Repository) FetchWithTimeout(ctx context.Context, timeout time.Duration
 
 	config := DefaultGitTuningConfig()
 
+	args := []string{
+		"-C", r.path,
+		"-c", "http.postBuffer=" + strconv.Itoa(config.PostBuffer),
+	}
+	if enforceSpeedLimit {
+		args = append(args,
+			"-c", "http.lowSpeedLimit="+strconv.Itoa(config.LowSpeedLimit),
+			"-c", "http.lowSpeedTime="+strconv.Itoa(int(config.LowSpeedTime.Seconds())),
+		)
+	}
+	args = append(args, "fetch", "--prune", "--prune-tags")
+
 	// #nosec G204 - r.path is controlled by us
-	cmd, err := r.gitCommand(fetchCtx, "-C", r.path,
-		"-c", "http.postBuffer="+strconv.Itoa(config.PostBuffer),
-		"-c", "http.lowSpeedLimit="+strconv.Itoa(config.LowSpeedLimit),
-		"-c", "http.lowSpeedTime="+strconv.Itoa(int(config.LowSpeedTime.Seconds())),
-		"fetch", "--prune", "--prune-tags")
+	cmd, err := r.gitCommand(fetchCtx, args...)
 	if err != nil {
 		return errors.Wrap(err, "create git command")
 	}
diff --git a/internal/strategy/git/git.go b/internal/strategy/git/git.go
index 7424588..45c7ba2 100644
--- a/internal/strategy/git/git.go
+++ b/internal/strategy/git/git.go
@@ -133,13 +133,13 @@ func New(
 		}
 
 		start := time.Now()
-		if err := repo.Fetch(ctx); err != nil {
+		if err := repo.FetchLenient(ctx, gitclone.CloneTimeout); err != nil {
 			logger.ErrorContext(ctx, "Startup fetch failed for existing repo", "upstream", repo.UpstreamURL(), "error", err,
 				"duration", time.Since(start))
-		} else {
-			logger.InfoContext(ctx, "Startup fetch completed for existing repo", "upstream", repo.UpstreamURL(),
-				"duration", time.Since(start))
+			continue
 		}
+		logger.InfoContext(ctx, "Startup fetch completed for existing repo", "upstream", repo.UpstreamURL(),
+			"duration", time.Since(start))
 
 		postRefs, err := repo.GetLocalRefs(ctx)
 		if err != nil {
@@ -459,31 +459,42 @@ func (s *Strategy) startClone(ctx context.Context, repo *gitclone.Repository) {
 	if err := s.tryRestoreSnapshot(ctx, repo); err != nil {
 		logger.InfoContext(ctx, "Mirror snapshot restore failed, falling back to clone", "upstream", upstream, "error", err)
 	} else {
-		// Mirror snapshot restored successfully. The bare mirror is immediately
-		// servable — mark ready and let background fetch handle freshening.
-		repo.MarkReady()
-
-		if err := s.cleanupSpools(upstream); err != nil {
-			logger.WarnContext(ctx, "Failed to clean up spools", "upstream", upstream, "error", err)
-		}
+		logger.InfoContext(ctx, "Mirror snapshot restored, fetching to freshen", "upstream", upstream)
+
+		// Fetch with a generous timeout and no low-speed check: mirror
+		// snapshots can be hours old, so the delta may be very large and
+		// GitHub's server-side pack computation can stall at near-zero
+		// transfer for minutes (same as initial clone).
+		//
+		// State remains StateCloning until fetch succeeds so that
+		// concurrent requests (via ensureCloneReady) block rather than
+		// serving from a potentially empty or stale mirror.
+		if err := repo.FetchLenient(ctx, gitclone.CloneTimeout); err != nil {
+			logger.WarnContext(ctx, "Post-restore fetch failed, discarding snapshot and falling back to clone",
+				"upstream", upstream, "error", err)
+			// The restored snapshot may be corrupt or empty. Remove it and
+			// fall through to a fresh clone so we don't re-upload bad data.
+			repo.ResetToEmpty()
+			if rmErr := os.RemoveAll(repo.Path()); rmErr != nil {
+				logger.WarnContext(ctx, "Failed to remove corrupt mirror", "upstream", upstream, "error", rmErr)
+			}
+		} else {
+			repo.MarkReady()
 
-		logger.InfoContext(ctx, "Mirror snapshot restored, serving immediately", "upstream", upstream)
+			if err := s.cleanupSpools(upstream); err != nil {
+				logger.WarnContext(ctx, "Failed to clean up spools", "upstream", upstream, "error", err)
+			}
 
-		// Fetch synchronously so the mirror is fresh before we serve from it.
-		// Mirror snapshots can be hours old; serving stale data defeats the
-		// purpose of the cache. Call repo.Fetch directly instead of
-		// backgroundFetch, which would skip because MarkReady sets lastFetch.
-		if err := repo.Fetch(ctx); err != nil {
-			logger.WarnContext(ctx, "Post-restore fetch failed, serving from snapshot", "upstream", upstream, "error", err)
-		}
+			logger.InfoContext(ctx, "Post-restore fetch completed, serving", "upstream", upstream)
 
-		if s.config.SnapshotInterval > 0 {
-			s.scheduleSnapshotJobs(repo)
-		}
-		if s.config.RepackInterval > 0 {
-			s.scheduleRepackJobs(repo)
+			if s.config.SnapshotInterval > 0 {
+				s.scheduleSnapshotJobs(repo)
+			}
+			if s.config.RepackInterval > 0 {
+				s.scheduleRepackJobs(repo)
+			}
+			return
 		}
-		return
 	}
 
 	logger.InfoContext(ctx, "Starting clone", "upstream", upstream, "path", repo.Path())