diff --git a/implement/GitCore.IntegrationTests/LoadTreeContentsFromGitHubTests.cs b/implement/GitCore.IntegrationTests/LoadTreeContentsFromGitHubTests.cs index fccff8c..47e6531 100644 --- a/implement/GitCore.IntegrationTests/LoadTreeContentsFromGitHubTests.cs +++ b/implement/GitCore.IntegrationTests/LoadTreeContentsFromGitHubTests.cs @@ -81,14 +81,6 @@ public async Task Load_tree_with_custom_http_client_for_profiling() requestCounter.RequestCount.Should().BeGreaterThan(0, "HTTP requests should have been made"); } - [Fact] - public void Placeholder() - { - /* - * Avoid "Zero tests ran" error in CI as long as there are no real tests yet. - * */ - } - [Fact] public async Task Load_subdirectory_tree_contents() { @@ -98,8 +90,9 @@ public async Task Load_subdirectory_tree_contents() var subdirectoryPath = new[] { "implement", "GitCore" }; // Load the subdirectory contents - var subdirectoryContents = await LoadFromUrl.LoadSubdirectoryContentsFromGitUrlAsync( - repositoryUrl, commitSha, subdirectoryPath); + var subdirectoryContents = + await LoadFromUrl.LoadSubdirectoryContentsFromGitUrlAsync( + repositoryUrl, commitSha, subdirectoryPath); // Verify that the subdirectory was loaded successfully subdirectoryContents.Should().NotBeNull("Subdirectory should be loaded"); @@ -152,6 +145,72 @@ public async Task Load_subdirectory_tree_contents() "Common/EnumerableExtensions.cs should have the expected content"); } + [Fact] + public async Task Load_relatively_small_subdirectory_from_larger_repository() + { + // Create a custom HttpClient with a handler to track data transfer + var dataTrackingHandler = new DataTrackingHandler(new System.Net.Http.SocketsHttpHandler()); + using var httpClient = new System.Net.Http.HttpClient(dataTrackingHandler); + + // Target: Load the 'guide' subdirectory, which is relatively small compared to others. 
+ var repositoryUrl = "https://github.com/pine-vm/pine.git"; + var commitSha = "c837c8199f38aab839c40019a50055e16d100c74"; + var subdirectoryPath = new[] { "guide" }; + + // Load the subdirectory contents + var subdirectoryContents = + await LoadFromUrl.LoadSubdirectoryContentsFromGitUrlAsync( + repositoryUrl, commitSha, subdirectoryPath, httpClient); + + // Verify that the subdirectory was loaded successfully + subdirectoryContents.Should().NotBeNull("Subdirectory should be loaded"); + subdirectoryContents.Count.Should().BeGreaterThan(0, "Subdirectory should contain files"); + + // Verify that we have the expected files + subdirectoryContents.Should().ContainKey( + ["customizing-elm-app-builds-with-compilation-interfaces.md"], + "The subdirectory should contain an 'customizing-elm-app-builds-with-compilation-interfaces.md' file"); + + subdirectoryContents.Should().ContainKey( + ["how-to-build-a-backend-app-in-elm.md"], + "The subdirectory should contain a 'how-to-build-a-backend-app-in-elm.md' file"); + + var subtreeAggregateFileContentSize = + subdirectoryContents.Values.Sum(file => file.Length); + + // Profile data transfer + var totalBytesReceived = dataTrackingHandler.TotalBytesReceived; + var totalBytesSent = dataTrackingHandler.TotalBytesSent; + var requestCount = dataTrackingHandler.RequestCount; + + // Log profiling information for debugging + System.Console.WriteLine($"Data Transfer Profile:"); + System.Console.WriteLine($" Total Requests: {requestCount}"); + System.Console.WriteLine($" Total Bytes Sent: {totalBytesSent:N0} bytes"); + System.Console.WriteLine($" Total Bytes Received: {totalBytesReceived:N0} bytes"); + System.Console.WriteLine($" Total Data Transfer: {totalBytesSent + totalBytesReceived:N0} bytes"); + System.Console.WriteLine($" Subdirectory Content Size: {subtreeAggregateFileContentSize:N0} bytes"); + System.Console.WriteLine($" Files in Subdirectory: {subdirectoryContents.Count}"); + System.Console.WriteLine($" Compression Ratio: 
{(double)totalBytesReceived / subtreeAggregateFileContentSize:F2}x"); + + // Assert bounds on data transfer + // With blobless clone optimization, we: + // 1. Fetch commit + trees only (blobless pack file) + // 2. Navigate to subdirectory and identify needed blobs + // 3. Fetch only those specific blobs + // This results in significantly less data transfer compared to fetching all files + + requestCount.Should().BeLessThan(10, "Should not make excessive HTTP requests"); + + // Set a reasonable upper bound for data transfer with blobless optimization + // We expect data transfer to be close to the actual content size plus some overhead + // for trees, commit, and pack file headers. + var maxExpectedBytes = subtreeAggregateFileContentSize * 4 + 100_000; + + totalBytesReceived.Should().BeLessThan(maxExpectedBytes, + $"Should optimize data transfer for subdirectory (received {totalBytesReceived:N0} bytes)"); + } + // Helper class for tracking HTTP requests private class RequestCountingHandler(System.Net.Http.HttpMessageHandler innerHandler) : System.Net.Http.DelegatingHandler(innerHandler) @@ -166,4 +225,51 @@ private class RequestCountingHandler(System.Net.Http.HttpMessageHandler innerHan return await base.SendAsync(request, cancellationToken); } } + + // Helper class for tracking data transfer + private class DataTrackingHandler(System.Net.Http.HttpMessageHandler innerHandler) + : System.Net.Http.DelegatingHandler(innerHandler) + { + public int RequestCount { get; private set; } + public long TotalBytesSent { get; private set; } + public long TotalBytesReceived { get; private set; } + + protected override async Task SendAsync( + System.Net.Http.HttpRequestMessage request, + System.Threading.CancellationToken cancellationToken) + { + RequestCount++; + + // Track request size + if (request.Content is not null) + { + var requestBytes = await request.Content.ReadAsByteArrayAsync(cancellationToken); + TotalBytesSent += requestBytes.Length; + } + + // Send the request + 
var response = await base.SendAsync(request, cancellationToken); + + // Track response size + if (response.Content is not null) + { + // Capture headers before reading content + var originalHeaders = response.Content.Headers.ToList(); + + var responseBytes = await response.Content.ReadAsByteArrayAsync(cancellationToken); + TotalBytesReceived += responseBytes.Length; + + // Re-wrap the content so it can be read again by the caller + response.Content = new System.Net.Http.ByteArrayContent(responseBytes); + + // Restore the original content headers + foreach (var header in originalHeaders) + { + response.Content.Headers.TryAddWithoutValidation(header.Key, header.Value); + } + } + + return response; + } + } } diff --git a/implement/GitCore/GitSmartHttp.cs b/implement/GitCore/GitSmartHttp.cs index fc75e8d..7033004 100644 --- a/implement/GitCore/GitSmartHttp.cs +++ b/implement/GitCore/GitSmartHttp.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; using System.IO; using System.Net.Http; using System.Text; @@ -94,6 +95,23 @@ public static async Task> FetchPackFileAsync( string gitUrl, string commitSha, HttpClient? httpClient = null) + { + return await FetchPackFileAsync(gitUrl, commitSha, subdirectoryPath: null, httpClient); + } + + /// + /// Fetches a pack file containing only objects needed for a specific subdirectory. + /// + /// Git repository URL like https://github.com/owner/repo.git + /// Commit SHA to fetch + /// Optional subdirectory path to optimize the fetch + /// Optional HttpClient to use for requests. If null, uses a default static client. + /// Pack file data + public static async Task> FetchPackFileAsync( + string gitUrl, + string commitSha, + IReadOnlyList? subdirectoryPath, + HttpClient? 
httpClient = null) { httpClient ??= s_httpClient; @@ -114,8 +132,13 @@ public static async Task> FetchPackFileAsync( // Step 2: Request the pack file with the specific commit var uploadPackUrl = $"{gitUrl}/git-upload-pack"; - // Build the request body according to Git protocol - var requestBody = BuildUploadPackRequest(commitSha); + // For subdirectory optimization, use shallow fetch to only get the commit without history + // Note: To further optimize by fetching only specific subdirectory contents would require: + // 1. Git Protocol v2 with partial clone and sparse checkout support + // 2. Multiple round-trips: fetch trees, navigate to subdirectory, then fetch only those blobs + // The current shallow approach (depth=1) already provides significant optimization + int? shallowDepth = (subdirectoryPath is not null && subdirectoryPath.Count > 0) ? 1 : null; + var requestBody = BuildUploadPackRequest(commitSha, shallowDepth); using var packRequest = new HttpRequestMessage(HttpMethod.Post, uploadPackUrl) { @@ -187,14 +210,112 @@ public static async Task FetchBranchCommitShaAsync( throw new InvalidOperationException($"Branch {branch} not found in repository {owner}/{repo}"); } - private static byte[] BuildUploadPackRequest(string commitSha) + /// + /// Fetches a blobless pack file (commit and trees only, no blobs) for optimized subdirectory loading. + /// + /// Git repository URL like https://github.com/owner/repo.git + /// Commit SHA to fetch + /// Optional HttpClient to use for requests. If null, uses a default static client. + /// Pack file data containing commit and trees but no blobs + public static async Task> FetchBloblessPackFileAsync( + string gitUrl, + string commitSha, + HttpClient? httpClient = null) + { + var requestBody = BuildUploadPackRequest(commitSha, shallowDepth: 1, filter: "blob:none"); + return await FetchPackFileWithRequestBodyAsync(gitUrl, requestBody, httpClient); + } + + /// + /// Fetches specific Git objects by their SHAs. 
+ /// + /// Git repository URL like https://github.com/owner/repo.git + /// List of object SHAs to fetch + /// Optional HttpClient to use for requests. If null, uses a default static client. + /// Pack file data containing the requested objects + public static async Task> FetchSpecificObjectsAsync( + string gitUrl, + IReadOnlyList objectShas, + HttpClient? httpClient = null) + { + var requestBody = BuildUploadPackRequestForSpecificObjects(objectShas); + return await FetchPackFileWithRequestBodyAsync(gitUrl, requestBody, httpClient); + } + + /// + /// Common helper for fetching pack files with a prepared request body. + /// + private static async Task> FetchPackFileWithRequestBodyAsync( + string gitUrl, + byte[] requestBody, + HttpClient? httpClient) + { + httpClient ??= s_httpClient; + + // Ensure the URL ends with .git + if (!gitUrl.EndsWith(".git")) + { + gitUrl = $"{gitUrl}.git"; + } + + // Step 1: Discover refs + var refsUrl = $"{gitUrl}/info/refs?service=git-upload-pack"; + using var refsRequest = new HttpRequestMessage(HttpMethod.Get, refsUrl); + using var refsResponse = await httpClient.SendAsync(refsRequest); + refsResponse.EnsureSuccessStatusCode(); + + // Step 2: Request pack file + var uploadPackUrl = $"{gitUrl}/git-upload-pack"; + + using var packRequest = new HttpRequestMessage(HttpMethod.Post, uploadPackUrl) + { + Content = new ByteArrayContent(requestBody) + }; + + packRequest.Content.Headers.ContentType = + new System.Net.Http.Headers.MediaTypeHeaderValue("application/x-git-upload-pack-request"); + + using var packResponse = await httpClient.SendAsync(packRequest); + packResponse.EnsureSuccessStatusCode(); + + var responseData = await packResponse.Content.ReadAsByteArrayAsync(); + return ExtractPackFileFromResponse(responseData); + } + + private static byte[] BuildUploadPackRequest(string commitSha, int? shallowDepth = null, string? 
filter = null) { using var ms = new MemoryStream(); // Want line: want - var wantLine = $"want {commitSha} {GitProtocolCapabilities}\n"; + var capabilities = GitProtocolCapabilities; + + if (shallowDepth.HasValue) + { + capabilities = $"{capabilities} shallow"; + } + + if (filter is not null) + { + capabilities = $"{capabilities} filter"; + } + + var wantLine = $"want {commitSha} {capabilities}\n"; WritePktLine(ms, wantLine); + // For shallow clones, request specific depth (only this commit, not its history) + if (shallowDepth.HasValue) + { + var shallowLine = $"deepen {shallowDepth.Value}\n"; + WritePktLine(ms, shallowLine); + } + + // For filtered fetches, specify the filter + if (filter is not null) + { + var filterLine = $"filter {filter}\n"; + WritePktLine(ms, filterLine); + } + // Flush packet WritePktLine(ms, null); @@ -221,6 +342,27 @@ private static void WritePktLine(Stream stream, string? line) } } + private static byte[] BuildUploadPackRequestForSpecificObjects(IReadOnlyList objectShas) + { + using var ms = new MemoryStream(); + + // Request each object with want lines + for (var i = 0; i < objectShas.Count; i++) + { + var capabilities = i is 0 ? 
$" {GitProtocolCapabilities}" : ""; + var wantLine = $"want {objectShas[i]}{capabilities}\n"; + WritePktLine(ms, wantLine); + } + + // Flush packet + WritePktLine(ms, null); + + // Done line + WritePktLine(ms, "done\n"); + + return ms.ToArray(); + } + private static ReadOnlyMemory ExtractPackFileFromResponse(byte[] responseData) { // The response is in pkt-line format with side-band diff --git a/implement/GitCore/LoadFromUrl.cs b/implement/GitCore/LoadFromUrl.cs index 49a9bef..5c1a669 100644 --- a/implement/GitCore/LoadFromUrl.cs +++ b/implement/GitCore/LoadFromUrl.cs @@ -1,5 +1,6 @@ using System; using System.Collections.Generic; +using System.Linq; using System.Net.Http; using System.Threading.Tasks; @@ -99,19 +100,20 @@ public static IReadOnlyDictionary> LoadTreeConten /// Git repository URL like https://github.com/owner/repo.git /// Commit SHA to load /// Path to the subdirectory (e.g., ["implement", "GitCore"]) - /// Optional HttpClient to use for HTTP requests. If null, uses a default static client. + /// Optional HttpClient to use for HTTP requests. If null, uses a default client. + /// Optional delegate to retrieve a blob from cache by SHA. Returns null if not in cache. + /// Optional delegate to be invoked when a blob was loaded, with its SHA and content. /// A dictionary mapping file paths (relative to subdirectory) to their contents public static async Task>> LoadSubdirectoryContentsFromGitUrlAsync( string gitUrl, string commitSha, FilePath subdirectoryPath, - HttpClient? httpClient = null) + HttpClient? httpClient = null, + Func?>? getBlobFromCache = null, + Action>? 
reportLoadedBlob = null) { - // Fetch the pack file containing the commit and its tree - var packFileData = - await GitSmartHttp.FetchPackFileAsync(gitUrl, commitSha, httpClient); - - return LoadSubdirectoryContentsFromPackFile(packFileData, commitSha, subdirectoryPath); + return await LoadSubdirectoryContentsWithBloblessCloneAsync( + gitUrl, commitSha, subdirectoryPath, httpClient, getBlobFromCache, reportLoadedBlob); } /// @@ -120,13 +122,26 @@ public static async Task>> Lo /// Git repository URL like https://github.com/owner/repo.git /// Commit SHA to load /// Path to the subdirectory (e.g., ["implement", "GitCore"]) + /// Optional HttpClient to use for HTTP requests. If null, uses a default client. + /// Optional delegate to retrieve a blob from cache by SHA. Returns null if not in cache. + /// Optional delegate to be invoked when a blob was loaded, with its SHA and content. /// A dictionary mapping file paths (relative to subdirectory) to their contents public static IReadOnlyDictionary> LoadSubdirectoryContentsFromGitUrl( string gitUrl, string commitSha, - FilePath subdirectoryPath) + FilePath subdirectoryPath, + HttpClient? httpClient = null, + Func?>? getBlobFromCache = null, + Action>? reportLoadedBlob = null) { - return LoadSubdirectoryContentsFromGitUrlAsync(gitUrl, commitSha, subdirectoryPath, null).GetAwaiter().GetResult(); + return LoadSubdirectoryContentsFromGitUrlAsync( + gitUrl, + commitSha, + subdirectoryPath, + httpClient, + getBlobFromCache, + reportLoadedBlob) + .GetAwaiter().GetResult(); } /// @@ -138,6 +153,41 @@ public static IReadOnlyDictionary> LoadSubdirecto private static IReadOnlyDictionary> LoadTreeContentsFromPackFile( ReadOnlyMemory packFileData, string commitSha) + { + var (commit, objectsBySHA1) = ParsePackFileAndGetCommit(packFileData, commitSha); + + // Get all files from the tree recursively + return GitObjects.GetAllFilesFromTree( + commit.TreeSHA1, + sha => objectsBySHA1.TryGetValue(sha, out var obj) ? 
obj : null); + } + + /// + /// Loads the contents of a subdirectory from pack file data. + /// + /// Pack file data containing the commit and tree objects + /// Commit SHA to load + /// Path to the subdirectory + /// A dictionary mapping file paths (relative to subdirectory) to their contents + private static IReadOnlyDictionary> LoadSubdirectoryContentsFromPackFile( + ReadOnlyMemory packFileData, + string commitSha, + FilePath subdirectoryPath) + { + var (commit, objectsBySHA1) = ParsePackFileAndGetCommit(packFileData, commitSha); + + // Get files from the subdirectory + return GitObjects.GetFilesFromSubdirectory( + commit.TreeSHA1, + subdirectoryPath, + sha => objectsBySHA1.TryGetValue(sha, out var obj) ? obj : null); + } + + /// + /// Common helper to parse pack file and extract commit object. + /// + private static (GitObjects.CommitObject commit, IReadOnlyDictionary objectsBySHA1) + ParsePackFileAndGetCommit(ReadOnlyMemory packFileData, string commitSha) { // Generate index for the pack file var indexResult = PackIndex.GeneratePackIndexV2(packFileData); @@ -161,31 +211,30 @@ private static IReadOnlyDictionary> LoadTreeConte // Parse the commit to get the tree SHA var commit = GitObjects.ParseCommit(commitObject.Data); - // Get all files from the tree recursively - return GitObjects.GetAllFilesFromTree( - commit.TreeSHA1, - sha => objectsBySHA1.TryGetValue(sha, out var obj) ? obj : null); + return (commit, objectsBySHA1); } /// - /// Loads the contents of a subdirectory from pack file data. + /// Loads subdirectory contents using blobless clone optimization. + /// First fetches only trees and commit, then requests specific blobs for the subdirectory. 
/// - /// Pack file data containing the commit and tree objects - /// Commit SHA to load - /// Path to the subdirectory - /// A dictionary mapping file paths (relative to subdirectory) to their contents - private static IReadOnlyDictionary> LoadSubdirectoryContentsFromPackFile( - ReadOnlyMemory packFileData, + private static async Task>> LoadSubdirectoryContentsWithBloblessCloneAsync( + string gitUrl, string commitSha, - FilePath subdirectoryPath) + FilePath subdirectoryPath, + HttpClient? httpClient, + Func?>? getBlobFromCache, + Action>? reportLoadedBlob) { - // Generate index for the pack file - var indexResult = PackIndex.GeneratePackIndexV2(packFileData); - var indexEntries = PackIndex.ParsePackIndexV2(indexResult.IndexData); + // Step 1: Fetch blobless pack file (commit and trees only) + var bloblessPackFileData = + await GitSmartHttp.FetchBloblessPackFileAsync(gitUrl, commitSha, httpClient); - // Parse all objects from the pack file - var objects = PackFile.ParseAllObjects(packFileData, indexEntries); - var objectsBySHA1 = PackFile.GetObjectsBySHA1(objects); + // Parse the blobless pack file + var indexResult = PackIndex.GeneratePackIndexV2(bloblessPackFileData); + var indexEntries = PackIndex.ParsePackIndexV2(indexResult.IndexData); + var objects = PackFile.ParseAllObjects(bloblessPackFileData, indexEntries); + var objectsBySHA1 = new Dictionary(PackFile.GetObjectsBySHA1(objects)); // Get the commit object if (!objectsBySHA1.TryGetValue(commitSha, out var commitObject)) @@ -201,13 +250,161 @@ private static IReadOnlyDictionary> LoadSubdirect // Parse the commit to get the tree SHA var commit = GitObjects.ParseCommit(commitObject.Data); - // Get files from the subdirectory + // Step 2: Navigate trees to find blob SHAs in the subdirectory + var blobShas = new List(); + CollectBlobShasFromSubdirectory( + commit.TreeSHA1, + subdirectoryPath, + sha => objectsBySHA1.TryGetValue(sha, out var obj) ? 
obj : null, + blobShas); + + // Step 3: Check cache for blobs we already have + var cachedBlobs = new Dictionary>(); + var missingBlobShas = new List(); + + if (getBlobFromCache is not null) + { + foreach (var blobSha in blobShas) + { + if (getBlobFromCache(blobSha) is { } cached) + { + cachedBlobs[blobSha] = cached; + } + else + { + missingBlobShas.Add(blobSha); + } + } + } + else + { + missingBlobShas.AddRange(blobShas); + } + + // Step 4: Fetch missing blobs + if (missingBlobShas.Count > 0) + { + var blobsPackFileData = + await GitSmartHttp.FetchSpecificObjectsAsync(gitUrl, missingBlobShas, httpClient); + + // Parse the blobs pack file + var blobsIndexResult = PackIndex.GeneratePackIndexV2(blobsPackFileData); + var blobsIndexEntries = PackIndex.ParsePackIndexV2(blobsIndexResult.IndexData); + var blobObjects = PackFile.ParseAllObjects(blobsPackFileData, blobsIndexEntries); + + foreach (var blobObject in blobObjects) + { + if (blobObject.Type is PackFile.ObjectType.Blob) + { + cachedBlobs[blobObject.SHA1base16] = blobObject.Data; + } + + // Support caller caching blobs for future reads. + reportLoadedBlob?.Invoke(blobObject.SHA1base16, blobObject.Data); + } + } + + // Step 5: Build the final dictionary with all objects (trees from step 1 + blobs from steps 3&4) + foreach (var (sha, blob) in cachedBlobs) + { + if (!objectsBySHA1.ContainsKey(sha)) + { + objectsBySHA1[sha] = + new PackFile.PackObject( + PackFile.ObjectType.Blob, + blob.Length, + blob, + sha); + } + } + + // Step 6: Get files from the subdirectory (now we have all the blobs) return GitObjects.GetFilesFromSubdirectory( commit.TreeSHA1, subdirectoryPath, sha => objectsBySHA1.TryGetValue(sha, out var obj) ? obj : null); } + /// + /// Collects blob SHAs from a subdirectory by navigating the tree structure. 
+ /// + private static void CollectBlobShasFromSubdirectory( + string treeSHA1, + FilePath subdirectoryPath, + Func getObjectBySHA1, + List blobShas) + { + // Navigate to the subdirectory + var currentTreeSHA1 = treeSHA1; + + foreach (var pathComponent in subdirectoryPath) + { + var treeObject = getObjectBySHA1(currentTreeSHA1); + + if (treeObject is null) + { + throw new InvalidOperationException($"Tree {currentTreeSHA1} not found"); + } + + if (treeObject.Type is not PackFile.ObjectType.Tree) + { + throw new InvalidOperationException($"Object {currentTreeSHA1} is not a tree"); + } + + var tree = GitObjects.ParseTree(treeObject.Data); + var entry = tree.Entries.FirstOrDefault(e => e.Name == pathComponent); + + if (entry is null) + { + throw new InvalidOperationException($"Path component '{pathComponent}' not found in tree"); + } + + if (entry.Mode is not "40000") + { + throw new InvalidOperationException($"Path component '{pathComponent}' is not a directory"); + } + + currentTreeSHA1 = entry.SHA1; + } + + // Now collect all blob SHAs from this tree recursively + CollectBlobShasFromTree(currentTreeSHA1, getObjectBySHA1, blobShas); + } + + /// + /// Recursively collects all blob SHAs from a tree. + /// + private static void CollectBlobShasFromTree( + string treeSHA1, + Func getObjectBySHA1, + List blobShas) + { + var treeObject = getObjectBySHA1(treeSHA1); + if (treeObject is null) + { + throw new InvalidOperationException($"Tree {treeSHA1} not found"); + } + + if (treeObject.Type is not PackFile.ObjectType.Tree) + { + throw new InvalidOperationException($"Object {treeSHA1} is not a tree"); + } + + var tree = GitObjects.ParseTree(treeObject.Data); + + foreach (var entry in tree.Entries) + { + if (entry.Mode is "40000") // Directory + { + CollectBlobShasFromTree(entry.SHA1, getObjectBySHA1, blobShas); + } + else // File (blob) + { + blobShas.Add(entry.SHA1); + } + } + } + /// /// Determines if a string is likely a commit SHA (40 hex characters) vs a branch name. ///