From b1f0b9d561e780db2f2ae6f3239a9e192a8fe01a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 6 Nov 2025 18:20:27 +0000 Subject: [PATCH 1/4] Initial plan From 1d9e6ffa6fda08eec123ce40b6b69989e021c95e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 6 Nov 2025 18:28:34 +0000 Subject: [PATCH 2/4] Add ParseAllObjectsDirectly method and eliminate redundant index generation Co-authored-by: Viir <19209696+Viir@users.noreply.github.com> --- implement/GitCore/LoadFromUrl.cs | 20 ++-- implement/GitCore/PackFile.cs | 166 +++++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+), 14 deletions(-) diff --git a/implement/GitCore/LoadFromUrl.cs b/implement/GitCore/LoadFromUrl.cs index a7357eb..62c6793 100644 --- a/implement/GitCore/LoadFromUrl.cs +++ b/implement/GitCore/LoadFromUrl.cs @@ -190,12 +190,8 @@ private static IReadOnlyDictionary> LoadSubdirect private static (GitObjects.CommitObject commit, IReadOnlyDictionary objectsBySHA1) ParsePackFileAndGetCommit(ReadOnlyMemory packFileData, string commitSha) { - // Generate index for the pack file - var indexResult = PackIndex.GeneratePackIndexV2(packFileData); - var indexEntries = PackIndex.ParsePackIndexV2(indexResult.IndexData); - - // Parse all objects from the pack file - var objects = PackFile.ParseAllObjects(packFileData, indexEntries); + // Parse all objects directly from the pack file (more efficient than generating index first) + var objects = PackFile.ParseAllObjectsDirectly(packFileData); var objectsBySHA1 = PackFile.GetObjectsBySHA1(objects); // Get the commit object @@ -282,10 +278,8 @@ private static async Task>> L var blobsPackFileData = await GitSmartHttp.FetchSpecificObjectsAsync(gitUrl, missingBlobShas, httpClient); - // Parse the blobs pack file - var blobsIndexResult = PackIndex.GeneratePackIndexV2(blobsPackFileData); - var blobsIndexEntries = 
PackIndex.ParsePackIndexV2(blobsIndexResult.IndexData); - var blobObjects = PackFile.ParseAllObjects(blobsPackFileData, blobsIndexEntries); + // Parse the blobs pack file directly without generating index + var blobObjects = PackFile.ParseAllObjectsDirectly(blobsPackFileData); foreach (var blobObject in blobObjects) { @@ -431,10 +425,8 @@ public static async Task FetchBloblessCloneAsync( var bloblessPackFileData = await GitSmartHttp.FetchBloblessPackFileAsync(gitUrl, commitSha, depth, httpClient); - // Parse the blobless pack file - var indexResult = PackIndex.GeneratePackIndexV2(bloblessPackFileData); - var indexEntries = PackIndex.ParsePackIndexV2(indexResult.IndexData); - var objects = PackFile.ParseAllObjects(bloblessPackFileData, indexEntries); + // Parse the blobless pack file directly without generating index + var objects = PackFile.ParseAllObjectsDirectly(bloblessPackFileData); var objectsBySha = PackFile.GetObjectsBySHA1(objects); return new Repository(objectsBySha.ToImmutableDictionary()); diff --git a/implement/GitCore/PackFile.cs b/implement/GitCore/PackFile.cs index 852af9a..503bdfc 100644 --- a/implement/GitCore/PackFile.cs +++ b/implement/GitCore/PackFile.cs @@ -81,6 +81,172 @@ public static bool VerifyPackFileChecksum(ReadOnlyMemory packFileData) return storedChecksum.Span.SequenceEqual(calculatedChecksum); } + /// + /// Parses all objects directly from a pack file without requiring a pre-generated index. + /// This is more efficient for in-memory operations where we don't need to generate an index file. + /// + /// The bytes of the pack file including its trailing 20-byte checksum. + /// A read-only list of parsed pack objects. 
+ public static IReadOnlyList ParseAllObjectsDirectly(ReadOnlyMemory packFileData) + { + var header = ParsePackFileHeader(packFileData); + var objectCount = (int)header.ObjectCount; + var dataWithoutChecksum = packFileData[..^20]; + var span = dataWithoutChecksum.Span; + + // We'll parse objects sequentially and build up the list + // Track objects by offset for delta resolution + var objectsByOffset = new Dictionary(); + var objects = new List(); + + var offset = 12; // Start after pack header + + // Helper to find compressed length by decompression + int FindCompressedLength(ReadOnlySpan data, int startOffset, int expectedSize) + { + var inflater = new ICSharpCode.SharpZipLib.Zip.Compression.Inflater(false); + try + { + var availableData = data[startOffset..].ToArray(); + inflater.SetInput(availableData); + var outputBuffer = new byte[expectedSize + 1]; + var decompressedBytes = inflater.Inflate(outputBuffer); + if (decompressedBytes != expectedSize) + { + throw new InvalidOperationException($"Decompression size mismatch at offset {startOffset}"); + } + return (int)inflater.TotalIn; + } + finally + { + inflater.Reset(); + } + } + + // Helper to calculate SHA1 of an object + string CalculateObjectSHA1(ObjectType objectType, byte[] data) + { + var objectHeader = System.Text.Encoding.UTF8.GetBytes($"{objectType.ToString().ToLower()} {data.Length}\0"); + var dataForHash = new byte[objectHeader.Length + data.Length]; + Array.Copy(objectHeader, 0, dataForHash, 0, objectHeader.Length); + Array.Copy(data, 0, dataForHash, objectHeader.Length, data.Length); + var sha1 = System.Security.Cryptography.SHA1.HashData(dataForHash); + return Convert.ToHexStringLower(sha1); + } + + // Parse objects sequentially + for (var i = 0; i < objectCount; i++) + { + var startOffset = offset; + + // Read object header (type and size) + var currentByte = span[offset++]; + var objectType = (ObjectType)((currentByte >> 4) & 0x7); + long size = currentByte & 0xF; + var shift = 4; + + while 
((currentByte & 0x80) != 0) + { + currentByte = span[offset++]; + size |= (long)(currentByte & 0x7F) << shift; + shift += 7; + } + + // Handle different object types + if (objectType == ObjectType.OfsDelta) + { + // Read negative offset + var negativeOffset = 0L; + currentByte = span[offset++]; + negativeOffset = currentByte & 0x7F; + + while ((currentByte & 0x80) != 0) + { + currentByte = span[offset++]; + negativeOffset = ((negativeOffset + 1) << 7) | ((long)currentByte & 0x7F); + } + + var baseOffset = startOffset - negativeOffset; + + // Get base object + if (!objectsByOffset.TryGetValue(baseOffset, out var baseObj)) + { + throw new InvalidOperationException($"Base object at offset {baseOffset} not found for OfsDelta at {startOffset}"); + } + + // Decompress delta data + var compressedLength = FindCompressedLength(span, offset, (int)size); + var compressedData = span.Slice(offset, compressedLength); + var deltaData = DecompressZlib(compressedData, (int)size); + + // Apply delta to reconstruct object + var reconstructedData = ApplyDelta(baseObj.Data, deltaData); + var sha1 = CalculateObjectSHA1(baseObj.Type, reconstructedData); + + // Store for potential future delta references + objectsByOffset[startOffset] = (baseObj.Type, reconstructedData, sha1); + + // Create pack object + var packObject = new PackObject(baseObj.Type, reconstructedData.Length, reconstructedData, sha1); + objects.Add(packObject); + + offset += compressedLength; + } + else if (objectType == ObjectType.RefDelta) + { + // Read base SHA1 + var baseSHA1Bytes = span.Slice(offset, 20); + var baseSHA1 = Convert.ToHexStringLower(baseSHA1Bytes); + offset += 20; + + // Find base object by SHA1 + var baseObj = objectsByOffset.Values.FirstOrDefault(o => o.SHA1 == baseSHA1); + if (baseObj == default) + { + throw new InvalidOperationException($"Base object {baseSHA1} not found for RefDelta at {startOffset}"); + } + + // Decompress delta data + var compressedLength = FindCompressedLength(span, offset, 
(int)size); + var compressedData = span.Slice(offset, compressedLength); + var deltaData = DecompressZlib(compressedData, (int)size); + + // Apply delta to reconstruct object + var reconstructedData = ApplyDelta(baseObj.Data, deltaData); + var sha1 = CalculateObjectSHA1(baseObj.Type, reconstructedData); + + // Store for potential future delta references + objectsByOffset[startOffset] = (baseObj.Type, reconstructedData, sha1); + + // Create pack object + var packObject = new PackObject(baseObj.Type, reconstructedData.Length, reconstructedData, sha1); + objects.Add(packObject); + + offset += compressedLength; + } + else + { + // Regular object (commit, tree, blob, tag) + var compressedLength = FindCompressedLength(span, offset, (int)size); + var compressedData = span.Slice(offset, compressedLength); + var decompressedData = DecompressZlib(compressedData, (int)size); + + var sha1 = CalculateObjectSHA1(objectType, decompressedData); + + // Store for potential future delta references + objectsByOffset[startOffset] = (objectType, decompressedData, sha1); + + // Create pack object + var packObject = new PackObject(objectType, decompressedData.Length, decompressedData, sha1); + objects.Add(packObject); + + offset += compressedLength; + } + } + + return objects; + } + public static IReadOnlyList ParseAllObjects( ReadOnlyMemory packFileData, IReadOnlyList indexEntries) From 08da19dcaf0b1ca0ab55563db4fe4aad8e25b859 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 6 Nov 2025 18:30:00 +0000 Subject: [PATCH 3/4] Add test to verify ParseAllObjectsDirectly produces same results as index-based parsing Co-authored-by: Viir <19209696+Viir@users.noreply.github.com> --- .../GitCore.UnitTests/ParsePackFileTests.cs | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/implement/GitCore.UnitTests/ParsePackFileTests.cs b/implement/GitCore.UnitTests/ParsePackFileTests.cs index 35efb63..c04699d 100644 --- 
a/implement/GitCore.UnitTests/ParsePackFileTests.cs +++ b/implement/GitCore.UnitTests/ParsePackFileTests.cs @@ -90,4 +90,50 @@ public void Generate_idx_and_rev_files_from_pack_file() result.ReverseIndexData.Length.Should().Be(expectedRevFileData.Length, "Generated .rev file should have the same size"); result.ReverseIndexData.Span.SequenceEqual(expectedRevFileData.Span).Should().BeTrue("Generated .rev file should match expected content"); } + + [Fact] + public void ParseAllObjectsDirectly_produces_same_results_as_index_based_parsing() + { + var filesFromClone = TestData.LoadTestDataFiles_2025_10_27(); + + var packFileData = + filesFromClone[["objects", "pack", "pack-f0af0a07967292ae02df043ff4169bee06f6c143.pack"]]; + + var idxFileData = + filesFromClone[["objects", "pack", "pack-f0af0a07967292ae02df043ff4169bee06f6c143.idx"]]; + + // Parse using the old method (index-based) + var indexEntries = PackIndex.ParsePackIndexV2(idxFileData); + var objectsFromIndexBased = PackFile.ParseAllObjects(packFileData, indexEntries); + + // Parse using the new method (direct) + var objectsFromDirect = PackFile.ParseAllObjectsDirectly(packFileData); + + // Verify we got the same number of objects + objectsFromDirect.Count.Should().Be(objectsFromIndexBased.Count, + "Direct parsing should produce the same number of objects as index-based parsing"); + + // Create dictionaries for comparison + var objectsFromIndexBasedDict = PackFile.GetObjectsBySHA1(objectsFromIndexBased); + var objectsFromDirectDict = PackFile.GetObjectsBySHA1(objectsFromDirect); + + // Verify all objects have the same SHA1 keys + objectsFromDirectDict.Keys.Should().BeEquivalentTo(objectsFromIndexBasedDict.Keys, + "Direct parsing should produce objects with the same SHA1 hashes"); + + // Verify each object has the same type and data + foreach (var (sha1, directObj) in objectsFromDirectDict) + { + var indexBasedObj = objectsFromIndexBasedDict[sha1]; + + directObj.Type.Should().Be(indexBasedObj.Type, + $"Object {sha1} 
should have the same type in both parsing methods"); + + directObj.Size.Should().Be(indexBasedObj.Size, + $"Object {sha1} should have the same size in both parsing methods"); + + directObj.Data.Span.SequenceEqual(indexBasedObj.Data.Span).Should().BeTrue( + $"Object {sha1} should have the same data in both parsing methods"); + } + } } From 0a24dd369f7d8a574f89b06316643dae06c23dba Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 6 Nov 2025 18:33:24 +0000 Subject: [PATCH 4/4] Address code review feedback: optimize RefDelta lookup and reduce allocations Co-authored-by: Viir <19209696+Viir@users.noreply.github.com> --- implement/GitCore/PackFile.cs | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/implement/GitCore/PackFile.cs b/implement/GitCore/PackFile.cs index 503bdfc..01043d2 100644 --- a/implement/GitCore/PackFile.cs +++ b/implement/GitCore/PackFile.cs @@ -95,8 +95,9 @@ public static IReadOnlyList ParseAllObjectsDirectly(ReadOnlyMemory(); + var objectsBySHA1 = new Dictionary(); var objects = new List(); var offset = 12; // Start after pack header @@ -123,14 +124,18 @@ int FindCompressedLength(ReadOnlySpan data, int startOffset, int expectedS } } - // Helper to calculate SHA1 of an object + // Helper to calculate SHA1 of an object without copying its payload into a combined buffer. string CalculateObjectSHA1(ObjectType objectType, byte[] data) { - var objectHeader = System.Text.Encoding.UTF8.GetBytes($"{objectType.ToString().ToLower()} {data.Length}\0"); - var dataForHash = new byte[objectHeader.Length + data.Length]; - Array.Copy(objectHeader, 0, dataForHash, 0, objectHeader.Length); - Array.Copy(data, 0, dataForHash, objectHeader.Length, data.Length); - var sha1 = System.Security.Cryptography.SHA1.HashData(dataForHash); + var typeString = objectType.ToString().ToLowerInvariant(); + var headerString = $"{typeString} {data.Length}\0"; + var headerBytes =
System.Text.Encoding.UTF8.GetBytes(headerString); + + // Hash header and payload incrementally: a stackalloc sized by data.Length overflows the stack for large objects. + using var hasher = System.Security.Cryptography.IncrementalHash.CreateHash(System.Security.Cryptography.HashAlgorithmName.SHA1); + hasher.AppendData(headerBytes); + hasher.AppendData(data); + var sha1 = hasher.GetHashAndReset(); return Convert.ToHexStringLower(sha1); } @@ -185,6 +190,7 @@ string CalculateObjectSHA1(ObjectType objectType, byte[] data) // Store for potential future delta references objectsByOffset[startOffset] = (baseObj.Type, reconstructedData, sha1); + objectsBySHA1[sha1] = (baseObj.Type, reconstructedData); // Create pack object var packObject = new PackObject(baseObj.Type, reconstructedData.Length, reconstructedData, sha1); @@ -199,9 +205,8 @@ string CalculateObjectSHA1(ObjectType objectType, byte[] data) var baseSHA1 = Convert.ToHexStringLower(baseSHA1Bytes); offset += 20; - // Find base object by SHA1 - var baseObj = objectsByOffset.Values.FirstOrDefault(o => o.SHA1 == baseSHA1); - if (baseObj == default) + // Find base object by SHA1 using O(1) lookup + if (!objectsBySHA1.TryGetValue(baseSHA1, out var baseObj)) { throw new InvalidOperationException($"Base object {baseSHA1} not found for RefDelta at {startOffset}"); } @@ -217,6 +222,7 @@ string CalculateObjectSHA1(ObjectType objectType, byte[] data) // Store for potential future delta references objectsByOffset[startOffset] = (baseObj.Type, reconstructedData, sha1); + objectsBySHA1[sha1] = (baseObj.Type, reconstructedData); // Create pack object var packObject = new PackObject(baseObj.Type, reconstructedData.Length, reconstructedData, sha1); @@ -235,6 +241,7 @@ string CalculateObjectSHA1(ObjectType objectType, byte[] data) // Store for potential future delta references objectsByOffset[startOffset] = (objectType, decompressedData, sha1); + objectsBySHA1[sha1] = (objectType, decompressedData); // Create pack object var packObject = new PackObject(objectType, decompressedData.Length, decompressedData, sha1);