From fc890a752e5ebf4a2aa6c5abd935d8da3e072bc6 Mon Sep 17 00:00:00 2001 From: Elizabeth Worstell Date: Tue, 3 Mar 2026 16:11:56 -0800 Subject: [PATCH 1/2] feat: support shallow snapshots via ?depth=N query parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allows clients to request shallow snapshots at a specific git depth (e.g., /git/{repo}/snapshot.tar.zst?depth=100). This produces much smaller snapshots for large repositories — a depth-100 snapshot of a multi-GB repo is typically under 1GB compressed, versus 13GB+ for a full snapshot. The depth parameter controls the git clone depth used when generating the snapshot. Full snapshots (no depth parameter) continue to work as before. Each requested depth gets its own cache key and periodic refresh job, so snapshots stay fresh without rebuilding depths that aren't used. On first request for a given depth, the snapshot is generated on-demand and a periodic refresh job is scheduled. Subsequent requests serve from cache. --- internal/strategy/git/export_test.go | 6 +- internal/strategy/git/snapshot.go | 91 +++++++++++++++---- internal/strategy/git/snapshot_test.go | 118 +++++++++++++++++++++++++ 3 files changed, 196 insertions(+), 19 deletions(-) diff --git a/internal/strategy/git/export_test.go b/internal/strategy/git/export_test.go index 3fb1a17..5a8673b 100644 --- a/internal/strategy/git/export_test.go +++ b/internal/strategy/git/export_test.go @@ -7,5 +7,9 @@ import ( ) func (s *Strategy) GenerateAndUploadSnapshot(ctx context.Context, repo *gitclone.Repository) error { - return s.generateAndUploadSnapshot(ctx, repo) + return s.generateAndUploadSnapshot(ctx, repo, 0) +} + +func (s *Strategy) GenerateAndUploadShallowSnapshot(ctx context.Context, repo *gitclone.Repository, depth int) error { + return s.generateAndUploadSnapshot(ctx, repo, depth) } diff --git a/internal/strategy/git/snapshot.go b/internal/strategy/git/snapshot.go index 5c2e8b5..f59d1ee 100644 --- a/internal/strategy/git/snapshot.go +++ b/internal/strategy/git/snapshot.go @@ -2,12 +2,14 @@ package git import ( "context" + "fmt" "io" "log/slog" "net/http" "os" "os/exec" "path/filepath" + "strconv" "strings" "sync" "time" @@ -20,12 +22,24 @@ import ( "github.com/block/cachew/internal/snapshot" ) -func snapshotDirForURL(mirrorRoot, upstreamURL string) (string, error) { +func snapshotDirForURL(mirrorRoot, upstreamURL string, depth int) (string, error) { repoPath, err := gitclone.RepoPathFromURL(upstreamURL) if err != nil { return "", errors.Wrap(err, "resolve snapshot directory") } - return filepath.Join(mirrorRoot, ".snapshots", repoPath), nil + dir := filepath.Join(mirrorRoot, ".snapshots", repoPath) + if depth > 0 { + dir += fmt.Sprintf("-depth-%d", depth) + } + return dir, nil +} + +func snapshotCacheKey(upstreamURL string, depth int) cache.Key { + suffix := ".snapshot" + if depth > 0 { + suffix = fmt.Sprintf(".snapshot-depth-%d", depth) + } + return cache.NewKey(upstreamURL + suffix) } // remoteURLForSnapshot returns the URL to embed as remote.origin.url in snapshots. @@ -42,18 +56,20 @@ func (s *Strategy) remoteURLForSnapshot(upstream string) string { return s.config.ServerURL + "/git/" + repoPath } -func (s *Strategy) generateAndUploadSnapshot(ctx context.Context, repo *gitclone.Repository) error { +func (s *Strategy) generateAndUploadSnapshot(ctx context.Context, repo *gitclone.Repository, depth int) error { logger := logging.FromContext(ctx) upstream := repo.UpstreamURL() - logger.InfoContext(ctx, "Snapshot generation started", slog.String("upstream", upstream)) + logger.InfoContext(ctx, "Snapshot generation started", + slog.String("upstream", upstream), + slog.Int("depth", depth)) - mu := s.snapshotMutexFor(upstream) + mu := s.snapshotMutexFor(upstream, depth) mu.Lock() defer mu.Unlock() mirrorRoot := s.cloneManager.Config().MirrorRoot - snapshotDir, err := snapshotDirForURL(mirrorRoot, upstream) + snapshotDir, err := snapshotDirForURL(mirrorRoot, upstream, depth) if err != nil { return err } @@ -68,8 +84,13 @@ func (s *Strategy) generateAndUploadSnapshot(ctx context.Context, repo *gitclone // Hold a read lock to exclude concurrent fetches while cloning. if err := repo.WithReadLock(func() error { - // #nosec G204 - repo.Path() and snapshotDir are controlled by us - cmd := exec.CommandContext(ctx, "git", "clone", repo.Path(), snapshotDir) + // #nosec G204 - repo.Path(), snapshotDir, and depth are controlled by us + args := []string{"clone"} + if depth > 0 { + args = append(args, "--depth", strconv.Itoa(depth)) + } + args = append(args, repo.Path(), snapshotDir) + cmd := exec.CommandContext(ctx, "git", args...) if output, err := cmd.CombinedOutput(); err != nil { return errors.Wrapf(err, "git clone for snapshot: %s", string(output)) } @@ -86,7 +107,7 @@ func (s *Strategy) generateAndUploadSnapshot(ctx context.Context, repo *gitclone return errors.WithStack(err) } - cacheKey := cache.NewKey(upstream + ".snapshot") + cacheKey := snapshotCacheKey(upstream, depth) ttl := 7 * 24 * time.Hour excludePatterns := []string{"*.lock"} @@ -97,22 +118,39 @@ func (s *Strategy) generateAndUploadSnapshot(ctx context.Context, repo *gitclone logger.WarnContext(ctx, "Failed to clean up snapshot dir", slog.String("error", rmErr.Error())) } if err != nil { - logger.ErrorContext(ctx, "Snapshot generation failed", slog.String("upstream", upstream), slog.String("error", err.Error())) + logger.ErrorContext(ctx, "Snapshot generation failed", + slog.String("upstream", upstream), + slog.Int("depth", depth), + slog.String("error", err.Error())) return errors.Wrap(err, "create snapshot") } - logger.InfoContext(ctx, "Snapshot generation completed", slog.String("upstream", upstream)) + logger.InfoContext(ctx, "Snapshot generation completed", + slog.String("upstream", upstream), + slog.Int("depth", depth)) return nil } func (s *Strategy) scheduleSnapshotJobs(repo *gitclone.Repository) { - s.scheduler.SubmitPeriodicJob(repo.UpstreamURL(), "snapshot-periodic", s.config.SnapshotInterval, func(ctx context.Context) error { - return s.generateAndUploadSnapshot(ctx, repo) + s.scheduleSnapshotJobsWithDepth(repo, 0) +} + +func (s *Strategy) scheduleSnapshotJobsWithDepth(repo *gitclone.Repository, depth int) { + jobID := "snapshot-periodic" + if depth > 0 { + jobID = fmt.Sprintf("snapshot-depth-%d-periodic", depth) + } + s.scheduler.SubmitPeriodicJob(repo.UpstreamURL(), jobID, s.config.SnapshotInterval, func(ctx context.Context) error { + return s.generateAndUploadSnapshot(ctx, repo, depth) }) } -func (s *Strategy) snapshotMutexFor(upstreamURL string) *sync.Mutex { - mu, _ := s.snapshotMu.LoadOrStore(upstreamURL, &sync.Mutex{}) +func (s *Strategy) snapshotMutexFor(upstreamURL string, depth int) *sync.Mutex { + key := upstreamURL + if depth > 0 { + key = fmt.Sprintf("%s-depth-%d", upstreamURL, depth) + } + mu, _ := s.snapshotMu.LoadOrStore(key, &sync.Mutex{}) return mu.(*sync.Mutex) } @@ -122,7 +160,18 @@ func (s *Strategy) handleSnapshotRequest(w http.ResponseWriter, r *http.Request, repoPath := ExtractRepoPath(strings.TrimSuffix(pathValue, "/snapshot.tar.zst")) upstreamURL := "https://" + host + "/" + repoPath - cacheKey := cache.NewKey(upstreamURL + ".snapshot") + + var depth int + if d := r.URL.Query().Get("depth"); d != "" { + var err error + depth, err = strconv.Atoi(d) + if err != nil || depth < 0 { + http.Error(w, "invalid depth parameter", http.StatusBadRequest) + return + } + } + + cacheKey := snapshotCacheKey(upstreamURL, depth) reader, headers, err := s.cache.Open(ctx, cacheKey) if errors.Is(err, os.ErrNotExist) { @@ -141,18 +190,24 @@ func (s *Strategy) handleSnapshotRequest(w http.ResponseWriter, r *http.Request, http.Error(w, "Repository unavailable", http.StatusServiceUnavailable) return } - if genErr := s.generateAndUploadSnapshot(ctx, repo); genErr != nil { + if genErr := s.generateAndUploadSnapshot(ctx, repo, depth); genErr != nil { logger.ErrorContext(ctx, "On-demand snapshot generation failed", slog.String("upstream", upstreamURL), slog.String("error", genErr.Error())) http.Error(w, "Internal server error", http.StatusInternalServerError) return } + // Schedule periodic refresh for this depth so it stays fresh. + if s.config.SnapshotInterval > 0 { + s.scheduleSnapshotJobsWithDepth(repo, depth) + } reader, headers, err = s.cache.Open(ctx, cacheKey) } if err != nil { if errors.Is(err, os.ErrNotExist) { - logger.DebugContext(ctx, "snapshot not found in cache", slog.String("upstream", upstreamURL)) + logger.DebugContext(ctx, "Snapshot not found in cache", + slog.String("upstream", upstreamURL), + slog.Int("depth", depth)) http.NotFound(w, r) return } diff --git a/internal/strategy/git/snapshot_test.go b/internal/strategy/git/snapshot_test.go index 5feb693..b2665df 100644 --- a/internal/strategy/git/snapshot_test.go +++ b/internal/strategy/git/snapshot_test.go @@ -209,6 +209,124 @@ func TestSnapshotGenerationViaLocalClone(t *testing.T) { assert.True(t, os.IsNotExist(err)) } +func TestShallowSnapshotGeneration(t *testing.T) { + if _, err := exec.LookPath("git"); err != nil { + t.Skip("git not found in PATH") + } + + _, ctx := logging.Configure(context.Background(), logging.Config{}) + tmpDir := t.TempDir() + mirrorRoot := filepath.Join(tmpDir, "mirrors") + upstreamURL := "https://github.com/org/repo" + + mirrorPath := filepath.Join(mirrorRoot, "github.com", "org", "repo") + createTestMirrorRepo(t, mirrorPath) + + memCache, err := cache.NewMemory(ctx, cache.MemoryConfig{}) + assert.NoError(t, err) + mux := newTestMux() + + cm := gitclone.NewManagerProvider(ctx, gitclone.Config{MirrorRoot: mirrorRoot}, nil) + s, err := git.New(ctx, git.Config{}, newTestScheduler(ctx, t), memCache, mux, cm, func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil + assert.NoError(t, err) + + manager, err := cm() + assert.NoError(t, err) + repo, err := manager.GetOrCreate(ctx, upstreamURL) + assert.NoError(t, err) + + // Generate a shallow snapshot with depth=1. + err = s.GenerateAndUploadShallowSnapshot(ctx, repo, 1) + assert.NoError(t, err) + + // Shallow snapshot uses a different cache key than full snapshot. + shallowKey := cache.NewKey(upstreamURL + ".snapshot-depth-1") + _, headers, err := memCache.Open(ctx, shallowKey) + assert.NoError(t, err) + assert.Equal(t, "application/zstd", headers.Get("Content-Type")) + + // Full snapshot should not exist. + fullKey := cache.NewKey(upstreamURL + ".snapshot") + _, _, err = memCache.Open(ctx, fullKey) + assert.True(t, os.IsNotExist(err)) + + // Restore and verify. + restoreDir := filepath.Join(tmpDir, "restored-shallow") + err = snapshot.Restore(ctx, memCache, shallowKey, restoreDir, 0) + assert.NoError(t, err) + + data, err := os.ReadFile(filepath.Join(restoreDir, "hello.txt")) + assert.NoError(t, err) + assert.Equal(t, "hello\n", string(data)) + + // Verify it's actually shallow. + cmd := exec.Command("git", "-C", restoreDir, "rev-list", "--count", "HEAD") + output, err := cmd.CombinedOutput() + assert.NoError(t, err, string(output)) + assert.Equal(t, "1\n", string(output)) +} + +func TestShallowSnapshotHTTPEndpoint(t *testing.T) { + _, ctx := logging.Configure(context.Background(), logging.Config{}) + tmpDir := t.TempDir() + + memCache, err := cache.NewMemory(ctx, cache.MemoryConfig{}) + assert.NoError(t, err) + mux := newTestMux() + + cm := gitclone.NewManagerProvider(ctx, gitclone.Config{MirrorRoot: tmpDir}, nil) + _, err = git.New(ctx, git.Config{ + SnapshotInterval: 24 * time.Hour, + }, newTestScheduler(ctx, t), memCache, mux, cm, func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil + assert.NoError(t, err) + + // Create a shallow snapshot in the cache. + upstreamURL := "https://github.com/org/repo" + cacheKey := cache.NewKey(upstreamURL + ".snapshot-depth-100") + snapshotData := []byte("shallow snapshot data") + + headers := make(map[string][]string) + headers["Content-Type"] = []string{"application/zstd"} + writer, err := memCache.Create(ctx, cacheKey, headers, 24*time.Hour) + assert.NoError(t, err) + _, err = writer.Write(snapshotData) + assert.NoError(t, err) + err = writer.Close() + assert.NoError(t, err) + + handler := mux.handlers["GET /git/{host}/{path...}"] + assert.NotZero(t, handler) + + // Request with ?depth=100 should return the shallow snapshot. + req := httptest.NewRequest(http.MethodGet, "/git/github.com/org/repo/snapshot.tar.zst?depth=100", nil) + req = req.WithContext(ctx) + req.SetPathValue("host", "github.com") + req.SetPathValue("path", "org/repo/snapshot.tar.zst") + w := httptest.NewRecorder() + + handler.ServeHTTP(w, req) + + assert.Equal(t, 200, w.Code) + assert.Equal(t, snapshotData, w.Body.Bytes()) + + // Request without depth should NOT return the shallow snapshot. + req = httptest.NewRequest(http.MethodGet, "/git/github.com/org/repo/snapshot.tar.zst", nil) + req = req.WithContext(ctx) + req.SetPathValue("host", "github.com") + req.SetPathValue("path", "org/repo/snapshot.tar.zst") + w = httptest.NewRecorder() + + // Cancel context so on-demand generation doesn't block. + cancelCtx, cancel := context.WithCancel(ctx) + cancel() + req = req.WithContext(cancelCtx) + + handler.ServeHTTP(w, req) + + // Should fail (no full snapshot exists, and on-demand gen fails with cancelled context). + assert.NotEqual(t, 200, w.Code) +} + func TestSnapshotRemoteURLUsesServerURL(t *testing.T) { if _, err := exec.LookPath("git"); err != nil { t.Skip("git not found in PATH") From f9b5bf4a7a2ed59ae107a29803bb1582c769a383 Mon Sep 17 00:00:00 2001 From: Elizabeth Worstell Date: Tue, 10 Mar 2026 15:28:20 -0700 Subject: [PATCH 2/2] feat: restore git mirrors from S3 snapshots on cold start Cold-starting pods (new/restarted/scaled) previously had to run a full git clone --mirror while proxying all requests to GitHub, which takes minutes for large repos. S3 snapshots already exist (created periodically by warm pods) but were never used during cold start. startClone() now attempts snapshot.Restore() from the tiered cache before falling back to git clone --mirror. On success, a catch-up fetch is scheduled via the job scheduler to cover any staleness. Changes: - Add Repository.MarkRestored() to transition StateEmpty -> StateReady after an external restore, applying configureMirror and registerMaintenance (matching Clone's behavior). - Add Strategy.tryRestoreSnapshot() which downloads and extracts the depth-0 snapshot, then calls MarkRestored. On any failure, cleans up and returns false so startClone falls through to the existing clone path. Co-authored-by: Amp Amp-Thread-ID: https://ampcode.com/threads/T-019cd9c4-9869-75b1-bc83-50484949b25b --- AGENTS.md | 3 + internal/gitclone/manager.go | 30 ++++++ internal/strategy/git/export_test.go | 6 +- internal/strategy/git/git.go | 56 +++++++++- internal/strategy/git/integration_test.go | 21 +++- internal/strategy/git/snapshot.go | 90 +++++------------ internal/strategy/git/snapshot_test.go | 118 ---------------------- 7 files changed, 126 insertions(+), 198 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index fdacc57..ef0ca45 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -19,3 +19,6 @@ The codebase uses Hermit to manage toolchains. It is written in Go, and uses Jus Only add comments for relatively large blocks of code, 20+ lines or more, and ONLY if it is not obvious what the code is doing. ALWAYS add Go-style documentation comments for public variables/types/functions. If you do add comments, the comments should explain WHY something is happening, not WHAT is happening. + +Functions should return errors, not log them internally. Logging belongs at the call site so callers retain control over +how failures are reported and handled. diff --git a/internal/gitclone/manager.go b/internal/gitclone/manager.go index e03bd0f..d8cd0ae 100644 --- a/internal/gitclone/manager.go +++ b/internal/gitclone/manager.go @@ -321,6 +321,36 @@ func WithReadLockReturn[T any](repo *Repository, fn func() (T, error)) (T, error return fn() } +// MarkRestored transitions a repository from StateEmpty to StateReady after an +// external restore (e.g. from an S3 snapshot). It applies the same mirror +// configuration that Clone would, so the repo is ready to serve upload-pack. +func (r *Repository) MarkRestored(ctx context.Context) error { + r.mu.Lock() + if r.state != StateEmpty { + r.mu.Unlock() + return nil + } + r.state = StateCloning + r.mu.Unlock() + + err := configureMirror(ctx, r.path, r.config.PackThreads) + if err == nil && r.config.Maintenance { + err = registerMaintenance(ctx, r.path) + } + + r.mu.Lock() + if err != nil { + r.state = StateEmpty + r.mu.Unlock() + return errors.Wrap(err, "configure mirror after restore") + } + + r.state = StateReady + r.lastFetch = time.Now() + r.mu.Unlock() + return nil +} + func (r *Repository) Clone(ctx context.Context) error { r.mu.Lock() if r.state != StateEmpty { diff --git a/internal/strategy/git/export_test.go b/internal/strategy/git/export_test.go index 5a8673b..3fb1a17 100644 --- a/internal/strategy/git/export_test.go +++ b/internal/strategy/git/export_test.go @@ -7,9 +7,5 @@ import ( ) func (s *Strategy) GenerateAndUploadSnapshot(ctx context.Context, repo *gitclone.Repository) error { - return s.generateAndUploadSnapshot(ctx, repo, 0) -} - -func (s *Strategy) GenerateAndUploadShallowSnapshot(ctx context.Context, repo *gitclone.Repository, depth int) error { - return s.generateAndUploadSnapshot(ctx, repo, depth) + return s.generateAndUploadSnapshot(ctx, repo) } diff --git a/internal/strategy/git/git.go b/internal/strategy/git/git.go index 613751a..0a69513 100644 --- a/internal/strategy/git/git.go +++ b/internal/strategy/git/git.go @@ -24,6 +24,7 @@ import ( "github.com/block/cachew/internal/githubapp" "github.com/block/cachew/internal/jobscheduler" "github.com/block/cachew/internal/logging" + "github.com/block/cachew/internal/snapshot" "github.com/block/cachew/internal/strategy" ) @@ -434,26 +435,51 @@ func (s *Strategy) ensureCloneReady(ctx context.Context, repo *gitclone.Reposito func (s *Strategy) startClone(ctx context.Context, repo *gitclone.Repository) { logger := logging.FromContext(ctx) + upstream := repo.UpstreamURL() + + if err := s.tryRestoreSnapshot(ctx, repo); err != nil { + logger.InfoContext(ctx, "Snapshot restore failed, falling back to clone", + slog.String("upstream", upstream), + slog.String("error", err.Error())) + } else { + s.cleanupSpools(upstream) + + logger.InfoContext(ctx, "Snapshot restore completed, scheduling catch-up fetch", + slog.String("upstream", upstream)) + + s.scheduler.Submit(upstream, "fetch", func(ctx context.Context) error { + s.backgroundFetch(ctx, repo) + return nil + }) + + if s.config.SnapshotInterval > 0 { + s.scheduleSnapshotJobs(repo) + } + if s.config.RepackInterval > 0 { + s.scheduleRepackJobs(repo) + } + return + } logger.InfoContext(ctx, "Starting clone", - slog.String("upstream", repo.UpstreamURL()), + slog.String("upstream", upstream), slog.String("path", repo.Path())) err := repo.Clone(ctx) // Clean up spools regardless of clone success or failure, so that subsequent // requests either serve from the local backend or go directly to upstream. - s.cleanupSpools(repo.UpstreamURL()) + s.cleanupSpools(upstream) if err != nil { logger.ErrorContext(ctx, "Clone failed", - slog.String("upstream", repo.UpstreamURL()), + slog.String("upstream", upstream), slog.String("error", err.Error())) return } logger.InfoContext(ctx, "Clone completed", - slog.String("upstream", repo.UpstreamURL()), + slog.String("upstream", upstream), slog.String("path", repo.Path())) if s.config.SnapshotInterval > 0 { @@ -464,6 +490,28 @@ func (s *Strategy) startClone(ctx context.Context, repo *gitclone.Repository) { } } +// tryRestoreSnapshot attempts to restore a mirror from an S3 snapshot. +// On failure the repo path is cleaned up so the caller can fall back to clone. +func (s *Strategy) tryRestoreSnapshot(ctx context.Context, repo *gitclone.Repository) error { + cacheKey := snapshotCacheKey(repo.UpstreamURL()) + + if err := os.MkdirAll(filepath.Dir(repo.Path()), 0o750); err != nil { + return errors.Wrap(err, "create parent directory for restore") + } + + if err := snapshot.Restore(ctx, s.cache, cacheKey, repo.Path(), s.config.ZstdThreads); err != nil { + _ = os.RemoveAll(repo.Path()) + return errors.Wrap(err, "restore snapshot") + } + + if err := repo.MarkRestored(ctx); err != nil { + _ = os.RemoveAll(repo.Path()) + return errors.Wrap(err, "mark restored") + } + + return nil +} + func (s *Strategy) maybeBackgroundFetch(repo *gitclone.Repository) { if !repo.NeedsFetch(s.cloneManager.Config().FetchInterval) { return diff --git a/internal/strategy/git/integration_test.go b/internal/strategy/git/integration_test.go index 7633ae8..0a3e172 100644 --- a/internal/strategy/git/integration_test.go +++ b/internal/strategy/git/integration_test.go @@ -19,6 +19,7 @@ import ( "github.com/alecthomas/assert/v2" + "github.com/block/cachew/internal/cache" "github.com/block/cachew/internal/gitclone" "github.com/block/cachew/internal/githubapp" "github.com/block/cachew/internal/logging" @@ -114,7 +115,9 @@ func TestIntegrationGitCloneViaProxy(t *testing.T) { FetchInterval: 15, }, nil) mux := http.NewServeMux() - strategy, err := git.New(ctx, git.Config{}, newTestScheduler(ctx, t), nil, mux, gc, func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil + memCache, err := cache.NewMemory(ctx, cache.MemoryConfig{}) + assert.NoError(t, err) + strategy, err := git.New(ctx, git.Config{}, newTestScheduler(ctx, t), memCache, mux, gc, func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil assert.NoError(t, err) assert.NotZero(t, strategy) @@ -193,7 +196,9 @@ func TestIntegrationGitFetchViaProxy(t *testing.T) { }, nil) mux := http.NewServeMux() - _, err = git.New(ctx, git.Config{}, newTestScheduler(ctx, t), nil, mux, gc, func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil + memCache, err := cache.NewMemory(ctx, cache.MemoryConfig{}) + assert.NoError(t, err) + _, err = git.New(ctx, git.Config{}, newTestScheduler(ctx, t), memCache, mux, gc, func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil assert.NoError(t, err) server := testServerWithLogging(ctx, mux) @@ -273,7 +278,9 @@ func TestIntegrationPushForwardsToUpstream(t *testing.T) { MirrorRoot: clonesDir, FetchInterval: 15, }, nil) - _, err = git.New(ctx, git.Config{}, newTestScheduler(ctx, t), nil, mux, gc, func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil + memCache, err := cache.NewMemory(ctx, cache.MemoryConfig{}) + assert.NoError(t, err) + _, err = git.New(ctx, git.Config{}, newTestScheduler(ctx, t), memCache, mux, gc, func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil assert.NoError(t, err) server := testServerWithLogging(ctx, mux) @@ -367,7 +374,9 @@ func TestIntegrationSpoolReusesDuringClone(t *testing.T) { MirrorRoot: clonesDir, FetchInterval: 15, }, nil) - strategy, err := git.New(ctx, git.Config{}, newTestScheduler(ctx, t), nil, mux, gc, func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil + memCache, err := cache.NewMemory(ctx, cache.MemoryConfig{}) + assert.NoError(t, err) + strategy, err := git.New(ctx, git.Config{}, newTestScheduler(ctx, t), memCache, mux, gc, func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil assert.NoError(t, err) strategy.SetHTTPTransport(&countingTransport{ @@ -511,7 +520,9 @@ func TestIntegrationNotOurRefFallsBackToUpstream(t *testing.T) { MirrorRoot: clonesDir, FetchInterval: 24 * time.Hour, // prevent auto-fetch during the test }, nil) - strategy, err := git.New(ctx, git.Config{}, newTestScheduler(ctx, t), nil, mux, gc, + memCache, err := cache.NewMemory(ctx, cache.MemoryConfig{}) + assert.NoError(t, err) + strategy, err := git.New(ctx, git.Config{}, newTestScheduler(ctx, t), memCache, mux, gc, func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil assert.NoError(t, err) diff --git a/internal/strategy/git/snapshot.go b/internal/strategy/git/snapshot.go index f59d1ee..ab1ac24 100644 --- a/internal/strategy/git/snapshot.go +++ b/internal/strategy/git/snapshot.go @@ -2,14 +2,12 @@ package git import ( "context" - "fmt" "io" "log/slog" "net/http" "os" "os/exec" "path/filepath" - "strconv" "strings" "sync" "time" @@ -22,24 +20,16 @@ import ( "github.com/block/cachew/internal/snapshot" ) -func snapshotDirForURL(mirrorRoot, upstreamURL string, depth int) (string, error) { +func snapshotDirForURL(mirrorRoot, upstreamURL string) (string, error) { repoPath, err := gitclone.RepoPathFromURL(upstreamURL) if err != nil { return "", errors.Wrap(err, "resolve snapshot directory") } - dir := filepath.Join(mirrorRoot, ".snapshots", repoPath) - if depth > 0 { - dir += fmt.Sprintf("-depth-%d", depth) - } - return dir, nil + return filepath.Join(mirrorRoot, ".snapshots", repoPath), nil } -func snapshotCacheKey(upstreamURL string, depth int) cache.Key { - suffix := ".snapshot" - if depth > 0 { - suffix = fmt.Sprintf(".snapshot-depth-%d", depth) - } - return cache.NewKey(upstreamURL + suffix) +func snapshotCacheKey(upstreamURL string) cache.Key { + return cache.NewKey(upstreamURL + ".snapshot") } // remoteURLForSnapshot returns the URL to embed as remote.origin.url in snapshots. @@ -56,41 +46,35 @@ func (s *Strategy) remoteURLForSnapshot(upstream string) string { return s.config.ServerURL + "/git/" + repoPath } -func (s *Strategy) generateAndUploadSnapshot(ctx context.Context, repo *gitclone.Repository, depth int) error { +func (s *Strategy) generateAndUploadSnapshot(ctx context.Context, repo *gitclone.Repository) error { logger := logging.FromContext(ctx) upstream := repo.UpstreamURL() logger.InfoContext(ctx, "Snapshot generation started", - slog.String("upstream", upstream), - slog.Int("depth", depth)) + slog.String("upstream", upstream)) - mu := s.snapshotMutexFor(upstream, depth) + mu := s.snapshotMutexFor(upstream) mu.Lock() defer mu.Unlock() mirrorRoot := s.cloneManager.Config().MirrorRoot - snapshotDir, err := snapshotDirForURL(mirrorRoot, upstream, depth) + snapshotDir, err := snapshotDirForURL(mirrorRoot, upstream) if err != nil { return err } // Clean any previous snapshot working directory. - if err := os.RemoveAll(snapshotDir); err != nil { + if err := os.RemoveAll(snapshotDir); err != nil { //nolint:gosec // snapshotDir is derived from controlled mirrorRoot + upstream URL return errors.Wrap(err, "remove previous snapshot dir") } - if err := os.MkdirAll(filepath.Dir(snapshotDir), 0o750); err != nil { + if err := os.MkdirAll(filepath.Dir(snapshotDir), 0o750); err != nil { //nolint:gosec // snapshotDir is derived from controlled mirrorRoot + upstream URL return errors.Wrap(err, "create snapshot parent dir") } // Hold a read lock to exclude concurrent fetches while cloning. if err := repo.WithReadLock(func() error { - // #nosec G204 - repo.Path(), snapshotDir, and depth are controlled by us - args := []string{"clone"} - if depth > 0 { - args = append(args, "--depth", strconv.Itoa(depth)) - } - args = append(args, repo.Path(), snapshotDir) - cmd := exec.CommandContext(ctx, "git", args...) + // #nosec G204 - repo.Path() and snapshotDir are controlled by us + cmd := exec.CommandContext(ctx, "git", "clone", repo.Path(), snapshotDir) if output, err := cmd.CombinedOutput(); err != nil { return errors.Wrapf(err, "git clone for snapshot: %s", string(output)) } @@ -103,54 +87,40 @@ func (s *Strategy) generateAndUploadSnapshot(ctx context.Context, repo *gitclone } return nil }); err != nil { - _ = os.RemoveAll(snapshotDir) + _ = os.RemoveAll(snapshotDir) //nolint:gosec // snapshotDir is derived from controlled mirrorRoot + upstream URL return errors.WithStack(err) } - cacheKey := snapshotCacheKey(upstream, depth) + cacheKey := snapshotCacheKey(upstream) ttl := 7 * 24 * time.Hour excludePatterns := []string{"*.lock"} err = snapshot.Create(ctx, s.cache, cacheKey, snapshotDir, ttl, excludePatterns, s.config.ZstdThreads) // Always clean up the snapshot working directory. - if rmErr := os.RemoveAll(snapshotDir); rmErr != nil { + if rmErr := os.RemoveAll(snapshotDir); rmErr != nil { //nolint:gosec // snapshotDir is derived from controlled mirrorRoot + upstream URL logger.WarnContext(ctx, "Failed to clean up snapshot dir", slog.String("error", rmErr.Error())) } if err != nil { logger.ErrorContext(ctx, "Snapshot generation failed", slog.String("upstream", upstream), - slog.Int("depth", depth), slog.String("error", err.Error())) return errors.Wrap(err, "create snapshot") } logger.InfoContext(ctx, "Snapshot generation completed", - slog.String("upstream", upstream), - slog.Int("depth", depth)) + slog.String("upstream", upstream)) return nil } func (s *Strategy) scheduleSnapshotJobs(repo *gitclone.Repository) { - s.scheduleSnapshotJobsWithDepth(repo, 0) -} - -func (s *Strategy) scheduleSnapshotJobsWithDepth(repo *gitclone.Repository, depth int) { - jobID := "snapshot-periodic" - if depth > 0 { - jobID = fmt.Sprintf("snapshot-depth-%d-periodic", depth) - } - s.scheduler.SubmitPeriodicJob(repo.UpstreamURL(), jobID, s.config.SnapshotInterval, func(ctx context.Context) error { - return s.generateAndUploadSnapshot(ctx, repo, depth) + s.scheduler.SubmitPeriodicJob(repo.UpstreamURL(), "snapshot-periodic", s.config.SnapshotInterval, func(ctx context.Context) error { + return s.generateAndUploadSnapshot(ctx, repo) }) } -func (s *Strategy) snapshotMutexFor(upstreamURL string, depth int) *sync.Mutex { - key := upstreamURL - if depth > 0 { - key = fmt.Sprintf("%s-depth-%d", upstreamURL, depth) - } - mu, _ := s.snapshotMu.LoadOrStore(key, &sync.Mutex{}) +func (s *Strategy) snapshotMutexFor(upstreamURL string) *sync.Mutex { + mu, _ := s.snapshotMu.LoadOrStore(upstreamURL, &sync.Mutex{}) return mu.(*sync.Mutex) } @@ -161,17 +131,7 @@ func (s *Strategy) handleSnapshotRequest(w http.ResponseWriter, r *http.Request, repoPath := ExtractRepoPath(strings.TrimSuffix(pathValue, "/snapshot.tar.zst")) upstreamURL := "https://" + host + "/" + repoPath - var depth int - if d := r.URL.Query().Get("depth"); d != "" { - var err error - depth, err = strconv.Atoi(d) - if err != nil || depth < 0 { - http.Error(w, "invalid depth parameter", http.StatusBadRequest) - return - } - } - - cacheKey := snapshotCacheKey(upstreamURL, depth) + cacheKey := snapshotCacheKey(upstreamURL) reader, headers, err := s.cache.Open(ctx, cacheKey) if errors.Is(err, os.ErrNotExist) { @@ -190,24 +150,22 @@ func (s *Strategy) handleSnapshotRequest(w http.ResponseWriter, r *http.Request, http.Error(w, "Repository unavailable", http.StatusServiceUnavailable) return } - if genErr := s.generateAndUploadSnapshot(ctx, repo, depth); genErr != nil { + if genErr := s.generateAndUploadSnapshot(ctx, repo); genErr != nil { logger.ErrorContext(ctx, "On-demand snapshot generation failed", slog.String("upstream", upstreamURL), slog.String("error", genErr.Error())) http.Error(w, "Internal server error", http.StatusInternalServerError) return } - // Schedule periodic refresh for this depth so it stays fresh. if s.config.SnapshotInterval > 0 { - s.scheduleSnapshotJobsWithDepth(repo, depth) + s.scheduleSnapshotJobs(repo) } reader, headers, err = s.cache.Open(ctx, cacheKey) } if err != nil { if errors.Is(err, os.ErrNotExist) { logger.DebugContext(ctx, "Snapshot not found in cache", - slog.String("upstream", upstreamURL), - slog.Int("depth", depth)) + slog.String("upstream", upstreamURL)) http.NotFound(w, r) return } diff --git a/internal/strategy/git/snapshot_test.go b/internal/strategy/git/snapshot_test.go index b2665df..5feb693 100644 --- a/internal/strategy/git/snapshot_test.go +++ b/internal/strategy/git/snapshot_test.go @@ -209,124 +209,6 @@ func TestSnapshotGenerationViaLocalClone(t *testing.T) { assert.True(t, os.IsNotExist(err)) } -func TestShallowSnapshotGeneration(t *testing.T) { - if _, err := exec.LookPath("git"); err != nil { - t.Skip("git not found in PATH") - } - - _, ctx := logging.Configure(context.Background(), logging.Config{}) - tmpDir := t.TempDir() - mirrorRoot := filepath.Join(tmpDir, "mirrors") - upstreamURL := "https://github.com/org/repo" - - mirrorPath := filepath.Join(mirrorRoot, "github.com", "org", "repo") - createTestMirrorRepo(t, mirrorPath) - - memCache, err := cache.NewMemory(ctx, cache.MemoryConfig{}) - assert.NoError(t, err) - mux := newTestMux() - - cm := gitclone.NewManagerProvider(ctx, gitclone.Config{MirrorRoot: mirrorRoot}, nil) - s, err := git.New(ctx, git.Config{}, newTestScheduler(ctx, t), memCache, mux, cm, func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil - assert.NoError(t, err) - - manager, err := cm() - assert.NoError(t, err) - repo, err := manager.GetOrCreate(ctx, upstreamURL) - assert.NoError(t, err) - - // Generate a shallow snapshot with depth=1. - err = s.GenerateAndUploadShallowSnapshot(ctx, repo, 1) - assert.NoError(t, err) - - // Shallow snapshot uses a different cache key than full snapshot. - shallowKey := cache.NewKey(upstreamURL + ".snapshot-depth-1") - _, headers, err := memCache.Open(ctx, shallowKey) - assert.NoError(t, err) - assert.Equal(t, "application/zstd", headers.Get("Content-Type")) - - // Full snapshot should not exist. - fullKey := cache.NewKey(upstreamURL + ".snapshot") - _, _, err = memCache.Open(ctx, fullKey) - assert.True(t, os.IsNotExist(err)) - - // Restore and verify. - restoreDir := filepath.Join(tmpDir, "restored-shallow") - err = snapshot.Restore(ctx, memCache, shallowKey, restoreDir, 0) - assert.NoError(t, err) - - data, err := os.ReadFile(filepath.Join(restoreDir, "hello.txt")) - assert.NoError(t, err) - assert.Equal(t, "hello\n", string(data)) - - // Verify it's actually shallow. - cmd := exec.Command("git", "-C", restoreDir, "rev-list", "--count", "HEAD") - output, err := cmd.CombinedOutput() - assert.NoError(t, err, string(output)) - assert.Equal(t, "1\n", string(output)) -} - -func TestShallowSnapshotHTTPEndpoint(t *testing.T) { - _, ctx := logging.Configure(context.Background(), logging.Config{}) - tmpDir := t.TempDir() - - memCache, err := cache.NewMemory(ctx, cache.MemoryConfig{}) - assert.NoError(t, err) - mux := newTestMux() - - cm := gitclone.NewManagerProvider(ctx, gitclone.Config{MirrorRoot: tmpDir}, nil) - _, err = git.New(ctx, git.Config{ - SnapshotInterval: 24 * time.Hour, - }, newTestScheduler(ctx, t), memCache, mux, cm, func() (*githubapp.TokenManager, error) { return nil, nil }) //nolint:nilnil - assert.NoError(t, err) - - // Create a shallow snapshot in the cache. - upstreamURL := "https://github.com/org/repo" - cacheKey := cache.NewKey(upstreamURL + ".snapshot-depth-100") - snapshotData := []byte("shallow snapshot data") - - headers := make(map[string][]string) - headers["Content-Type"] = []string{"application/zstd"} - writer, err := memCache.Create(ctx, cacheKey, headers, 24*time.Hour) - assert.NoError(t, err) - _, err = writer.Write(snapshotData) - assert.NoError(t, err) - err = writer.Close() - assert.NoError(t, err) - - handler := mux.handlers["GET /git/{host}/{path...}"] - assert.NotZero(t, handler) - - // Request with ?depth=100 should return the shallow snapshot. - req := httptest.NewRequest(http.MethodGet, "/git/github.com/org/repo/snapshot.tar.zst?depth=100", nil) - req = req.WithContext(ctx) - req.SetPathValue("host", "github.com") - req.SetPathValue("path", "org/repo/snapshot.tar.zst") - w := httptest.NewRecorder() - - handler.ServeHTTP(w, req) - - assert.Equal(t, 200, w.Code) - assert.Equal(t, snapshotData, w.Body.Bytes()) - - // Request without depth should NOT return the shallow snapshot. - req = httptest.NewRequest(http.MethodGet, "/git/github.com/org/repo/snapshot.tar.zst", nil) - req = req.WithContext(ctx) - req.SetPathValue("host", "github.com") - req.SetPathValue("path", "org/repo/snapshot.tar.zst") - w = httptest.NewRecorder() - - // Cancel context so on-demand generation doesn't block. - cancelCtx, cancel := context.WithCancel(ctx) - cancel() - req = req.WithContext(cancelCtx) - - handler.ServeHTTP(w, req) - - // Should fail (no full snapshot exists, and on-demand gen fails with cancelled context). - assert.NotEqual(t, 200, w.Code) -} - func TestSnapshotRemoteURLUsesServerURL(t *testing.T) { if _, err := exec.LookPath("git"); err != nil { t.Skip("git not found in PATH")