From b113e074190a7c623f99423f9302e50dfc836c72 Mon Sep 17 00:00:00 2001 From: Elizabeth Worstell Date: Thu, 12 Mar 2026 23:18:56 -0700 Subject: [PATCH] fix: break corrupt mirror snapshot poison cycle MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a corrupt or empty mirror snapshot exists in S3, pods restore it, the post-restore fetch fails, and then snapshot jobs immediately re-upload the corrupt mirror — perpetuating the cycle even after manual S3 cleanup. Three changes break this cycle: 1. FetchLenient: post-restore and startup fetches now omit the lowSpeedLimit check (same as executeClone), since large deltas after snapshot restore trigger server-side pack computation that stalls at near-zero transfer for minutes, tripping the 1KB/s threshold. 2. ResetToEmpty + fallback to clone: when the post-restore fetch fails, the corrupt mirror is removed and the repo state is reset to Empty so the code falls through to a fresh git clone --mirror instead of serving and re-uploading stale data. 3. Skip snapshot scheduling on failed fetch: snapshot and repack jobs are only scheduled after a successful fetch, both in the startup path and the post-restore path. Co-authored-by: Amp Amp-Thread-ID: https://ampcode.com/threads/T-019ce5d1-9385-7026-8e69-78903ce99c47 --- internal/gitclone/manager.go | 48 +++++++++++++++++++++++----- internal/strategy/git/git.go | 61 +++++++++++++++++++++--------------- 2 files changed, 76 insertions(+), 33 deletions(-) diff --git a/internal/gitclone/manager.go b/internal/gitclone/manager.go index c06b679..06f0609 100644 --- a/internal/gitclone/manager.go +++ b/internal/gitclone/manager.go @@ -320,6 +320,17 @@ func WithReadLockReturn[T any](repo *Repository, fn func() (T, error)) (T, error return fn() } +// ResetToEmpty transitions the repository back to StateEmpty so that a +// subsequent call to TryStartCloning can re-attempt the clone. Use this +// when a restored snapshot turns out to be corrupt or empty and needs to +// be replaced with a fresh clone. +func (r *Repository) ResetToEmpty() { + r.mu.Lock() + defer r.mu.Unlock() + r.state = StateEmpty + r.lastFetch = time.Time{} +} + // TryStartCloning atomically transitions the repository from StateEmpty to // StateCloning. Returns true if this goroutine won the transition and should // proceed with the clone/restore; false if another goroutine already claimed it. @@ -466,17 +477,17 @@ func configureMirror(ctx context.Context, repoPath string, packThreads int) erro return nil } -// cloneTimeout bounds `git clone --mirror` so a stuck clone cannot block +// CloneTimeout bounds `git clone --mirror` so a stuck clone cannot block // the repo indefinitely. This is deliberately generous: large repos may // take 10-20 minutes for GitHub to compute the server-side pack. -const cloneTimeout = 30 * time.Minute +const CloneTimeout = 30 * time.Minute func (r *Repository) executeClone(ctx context.Context) error { if err := os.MkdirAll(filepath.Dir(r.path), 0o750); err != nil { return errors.Wrap(err, "create clone directory") } - cloneCtx, cancel := context.WithTimeout(ctx, cloneTimeout) + cloneCtx, cancel := context.WithTimeout(ctx, CloneTimeout) defer cancel() config := DefaultGitTuningConfig() @@ -525,6 +536,19 @@ func (r *Repository) Fetch(ctx context.Context) error { // for catch-up fetches after snapshot restore where the delta may be large and // the default fetchTimeout is too short. func (r *Repository) FetchWithTimeout(ctx context.Context, timeout time.Duration) error { + return r.fetchInternal(ctx, timeout, true) +} + +// FetchLenient fetches from upstream with the given timeout but without the +// low-speed transfer check. Use this for post-restore catch-up fetches where +// the delta may be very large and GitHub's server-side pack computation can +// stall at near-zero transfer rate for minutes — the same situation that +// executeClone handles by omitting lowSpeedLimit. +func (r *Repository) FetchLenient(ctx context.Context, timeout time.Duration) error { + return r.fetchInternal(ctx, timeout, false) +} + +func (r *Repository) fetchInternal(ctx context.Context, timeout time.Duration, enforceSpeedLimit bool) error { select { case <-r.fetchSem: defer func() { @@ -547,12 +571,20 @@ func (r *Repository) FetchWithTimeout(ctx context.Context, timeout time.Duration config := DefaultGitTuningConfig() + args := []string{ + "-C", r.path, + "-c", "http.postBuffer=" + strconv.Itoa(config.PostBuffer), + } + if enforceSpeedLimit { + args = append(args, + "-c", "http.lowSpeedLimit="+strconv.Itoa(config.LowSpeedLimit), + "-c", "http.lowSpeedTime="+strconv.Itoa(int(config.LowSpeedTime.Seconds())), + ) + } + args = append(args, "fetch", "--prune", "--prune-tags") + // #nosec G204 - r.path is controlled by us - cmd, err := r.gitCommand(fetchCtx, "-C", r.path, - "-c", "http.postBuffer="+strconv.Itoa(config.PostBuffer), - "-c", "http.lowSpeedLimit="+strconv.Itoa(config.LowSpeedLimit), - "-c", "http.lowSpeedTime="+strconv.Itoa(int(config.LowSpeedTime.Seconds())), - "fetch", "--prune", "--prune-tags") + cmd, err := r.gitCommand(fetchCtx, args...) if err != nil { return errors.Wrap(err, "create git command") } diff --git a/internal/strategy/git/git.go b/internal/strategy/git/git.go index 7424588..45c7ba2 100644 --- a/internal/strategy/git/git.go +++ b/internal/strategy/git/git.go @@ -133,13 +133,13 @@ func New( } start := time.Now() - if err := repo.Fetch(ctx); err != nil { + if err := repo.FetchLenient(ctx, gitclone.CloneTimeout); err != nil { logger.ErrorContext(ctx, "Startup fetch failed for existing repo", "upstream", repo.UpstreamURL(), "error", err, "duration", time.Since(start)) - } else { - logger.InfoContext(ctx, "Startup fetch completed for existing repo", "upstream", repo.UpstreamURL(), - "duration", time.Since(start)) + continue } + logger.InfoContext(ctx, "Startup fetch completed for existing repo", "upstream", repo.UpstreamURL(), + "duration", time.Since(start)) postRefs, err := repo.GetLocalRefs(ctx) if err != nil { @@ -459,31 +459,42 @@ func (s *Strategy) startClone(ctx context.Context, repo *gitclone.Repository) { if err := s.tryRestoreSnapshot(ctx, repo); err != nil { logger.InfoContext(ctx, "Mirror snapshot restore failed, falling back to clone", "upstream", upstream, "error", err) } else { - // Mirror snapshot restored successfully. The bare mirror is immediately - // servable — mark ready and let background fetch handle freshening. - repo.MarkReady() - - if err := s.cleanupSpools(upstream); err != nil { - logger.WarnContext(ctx, "Failed to clean up spools", "upstream", upstream, "error", err) - } + logger.InfoContext(ctx, "Mirror snapshot restored, fetching to freshen", "upstream", upstream) + + // Fetch with a generous timeout and no low-speed check: mirror + // snapshots can be hours old, so the delta may be very large and + // GitHub's server-side pack computation can stall at near-zero + // transfer for minutes (same as initial clone). + // + // State remains StateCloning until fetch succeeds so that + // concurrent requests (via ensureCloneReady) block rather than + // serving from a potentially empty or stale mirror. + if err := repo.FetchLenient(ctx, gitclone.CloneTimeout); err != nil { + logger.WarnContext(ctx, "Post-restore fetch failed, discarding snapshot and falling back to clone", + "upstream", upstream, "error", err) + // The restored snapshot may be corrupt or empty. Remove it and + // fall through to a fresh clone so we don't re-upload bad data. + repo.ResetToEmpty() + if rmErr := os.RemoveAll(repo.Path()); rmErr != nil { + logger.WarnContext(ctx, "Failed to remove corrupt mirror", "upstream", upstream, "error", rmErr) + } + } else { + repo.MarkReady() - logger.InfoContext(ctx, "Mirror snapshot restored, serving immediately", "upstream", upstream) + if err := s.cleanupSpools(upstream); err != nil { + logger.WarnContext(ctx, "Failed to clean up spools", "upstream", upstream, "error", err) + } - // Fetch synchronously so the mirror is fresh before we serve from it. - // Mirror snapshots can be hours old; serving stale data defeats the - // purpose of the cache. Call repo.Fetch directly instead of - // backgroundFetch, which would skip because MarkReady sets lastFetch. - if err := repo.Fetch(ctx); err != nil { - logger.WarnContext(ctx, "Post-restore fetch failed, serving from snapshot", "upstream", upstream, "error", err) - } + logger.InfoContext(ctx, "Post-restore fetch completed, serving", "upstream", upstream) - if s.config.SnapshotInterval > 0 { - s.scheduleSnapshotJobs(repo) - } - if s.config.RepackInterval > 0 { - s.scheduleRepackJobs(repo) + if s.config.SnapshotInterval > 0 { + s.scheduleSnapshotJobs(repo) + } + if s.config.RepackInterval > 0 { + s.scheduleRepackJobs(repo) + } + return } - return } logger.InfoContext(ctx, "Starting clone", "upstream", upstream, "path", repo.Path())