diff --git a/implement/GitCore.UnitTests/ParsePackFileTests.cs b/implement/GitCore.UnitTests/ParsePackFileTests.cs index 35efb63..c04699d 100644 --- a/implement/GitCore.UnitTests/ParsePackFileTests.cs +++ b/implement/GitCore.UnitTests/ParsePackFileTests.cs @@ -90,4 +90,50 @@ public void Generate_idx_and_rev_files_from_pack_file() result.ReverseIndexData.Length.Should().Be(expectedRevFileData.Length, "Generated .rev file should have the same size"); result.ReverseIndexData.Span.SequenceEqual(expectedRevFileData.Span).Should().BeTrue("Generated .rev file should match expected content"); } + + [Fact] + public void ParseAllObjectsDirectly_produces_same_results_as_index_based_parsing() + { + var filesFromClone = TestData.LoadTestDataFiles_2025_10_27(); + + var packFileData = + filesFromClone[["objects", "pack", "pack-f0af0a07967292ae02df043ff4169bee06f6c143.pack"]]; + + var idxFileData = + filesFromClone[["objects", "pack", "pack-f0af0a07967292ae02df043ff4169bee06f6c143.idx"]]; + + // Parse using the old method (index-based) + var indexEntries = PackIndex.ParsePackIndexV2(idxFileData); + var objectsFromIndexBased = PackFile.ParseAllObjects(packFileData, indexEntries); + + // Parse using the new method (direct) + var objectsFromDirect = PackFile.ParseAllObjectsDirectly(packFileData); + + // Verify we got the same number of objects + objectsFromDirect.Count.Should().Be(objectsFromIndexBased.Count, + "Direct parsing should produce the same number of objects as index-based parsing"); + + // Create dictionaries for comparison + var objectsFromIndexBasedDict = PackFile.GetObjectsBySHA1(objectsFromIndexBased); + var objectsFromDirectDict = PackFile.GetObjectsBySHA1(objectsFromDirect); + + // Verify all objects have the same SHA1 keys + objectsFromDirectDict.Keys.Should().BeEquivalentTo(objectsFromIndexBasedDict.Keys, + "Direct parsing should produce objects with the same SHA1 hashes"); + + // Verify each object has the same type and data + foreach (var (sha1, directObj) in 
objectsFromDirectDict) + { + var indexBasedObj = objectsFromIndexBasedDict[sha1]; + + directObj.Type.Should().Be(indexBasedObj.Type, + $"Object {sha1} should have the same type in both parsing methods"); + + directObj.Size.Should().Be(indexBasedObj.Size, + $"Object {sha1} should have the same size in both parsing methods"); + + directObj.Data.Span.SequenceEqual(indexBasedObj.Data.Span).Should().BeTrue( + $"Object {sha1} should have the same data in both parsing methods"); + } + } } diff --git a/implement/GitCore/LoadFromUrl.cs b/implement/GitCore/LoadFromUrl.cs index a7357eb..62c6793 100644 --- a/implement/GitCore/LoadFromUrl.cs +++ b/implement/GitCore/LoadFromUrl.cs @@ -190,12 +190,8 @@ private static IReadOnlyDictionary> LoadSubdirect private static (GitObjects.CommitObject commit, IReadOnlyDictionary objectsBySHA1) ParsePackFileAndGetCommit(ReadOnlyMemory packFileData, string commitSha) { - // Generate index for the pack file - var indexResult = PackIndex.GeneratePackIndexV2(packFileData); - var indexEntries = PackIndex.ParsePackIndexV2(indexResult.IndexData); - - // Parse all objects from the pack file - var objects = PackFile.ParseAllObjects(packFileData, indexEntries); + // Parse all objects directly from the pack file (more efficient than generating index first) + var objects = PackFile.ParseAllObjectsDirectly(packFileData); var objectsBySHA1 = PackFile.GetObjectsBySHA1(objects); // Get the commit object @@ -282,10 +278,8 @@ private static async Task>> L var blobsPackFileData = await GitSmartHttp.FetchSpecificObjectsAsync(gitUrl, missingBlobShas, httpClient); - // Parse the blobs pack file - var blobsIndexResult = PackIndex.GeneratePackIndexV2(blobsPackFileData); - var blobsIndexEntries = PackIndex.ParsePackIndexV2(blobsIndexResult.IndexData); - var blobObjects = PackFile.ParseAllObjects(blobsPackFileData, blobsIndexEntries); + // Parse the blobs pack file directly without generating index + var blobObjects = 
PackFile.ParseAllObjectsDirectly(blobsPackFileData); foreach (var blobObject in blobObjects) { @@ -431,10 +425,8 @@ public static async Task FetchBloblessCloneAsync( var bloblessPackFileData = await GitSmartHttp.FetchBloblessPackFileAsync(gitUrl, commitSha, depth, httpClient); - // Parse the blobless pack file - var indexResult = PackIndex.GeneratePackIndexV2(bloblessPackFileData); - var indexEntries = PackIndex.ParsePackIndexV2(indexResult.IndexData); - var objects = PackFile.ParseAllObjects(bloblessPackFileData, indexEntries); + // Parse the blobless pack file directly without generating index + var objects = PackFile.ParseAllObjectsDirectly(bloblessPackFileData); var objectsBySha = PackFile.GetObjectsBySHA1(objects); return new Repository(objectsBySha.ToImmutableDictionary()); diff --git a/implement/GitCore/PackFile.cs b/implement/GitCore/PackFile.cs index 852af9a..01043d2 100644 --- a/implement/GitCore/PackFile.cs +++ b/implement/GitCore/PackFile.cs @@ -81,6 +81,179 @@ public static bool VerifyPackFileChecksum(ReadOnlyMemory packFileData) return storedChecksum.Span.SequenceEqual(calculatedChecksum); } + /// + /// Parses all objects directly from a pack file without requiring a pre-generated index. + /// This is more efficient for in-memory operations where we don't need to generate an index file. + /// + /// The bytes of the pack file including its trailing 20-byte checksum. + /// A read-only list of parsed pack objects. 
/// <summary>
/// Parses all objects directly from a pack file without requiring a pre-generated index.
/// Objects are decoded sequentially in pack order; delta objects (OfsDelta/RefDelta) are
/// resolved against base objects parsed earlier in the same pack. This is more efficient
/// for in-memory operations where we don't need to generate an index file.
/// </summary>
/// <param name="packFileData">The bytes of the pack file including its trailing 20-byte checksum.</param>
/// <returns>A read-only list of parsed pack objects, in pack order.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when decompressed data does not match an object's declared size, or when a
/// delta's base object does not occur earlier in the pack (e.g. a thin pack, or a
/// forward RefDelta reference — neither is supported by this sequential parser).
/// </exception>
public static IReadOnlyList<PackObject> ParseAllObjectsDirectly(ReadOnlyMemory<byte> packFileData)
{
    var header = ParsePackFileHeader(packFileData);
    var objectCount = (int)header.ObjectCount;

    // Exclude the trailing 20-byte SHA-1 pack checksum from parsing.
    var span = packFileData[..^20].Span;

    // Resolved objects keyed by pack offset (OfsDelta bases) and by SHA-1 (RefDelta bases).
    var objectsByOffset = new Dictionary<long, (ObjectType Type, byte[] Data, string SHA1)>();
    var objectsBySHA1 = new Dictionary<string, (ObjectType Type, byte[] Data)>();
    var objects = new List<PackObject>(objectCount);

    var offset = 12; // skip the 12-byte pack header ("PACK", version, object count)

    // Inflate exactly `expectedSize` bytes starting at `startOffset`, returning the data
    // and the number of compressed input bytes consumed. Replaces the previous two-pass
    // approach (probe-inflate just to learn the compressed length, then inflate again),
    // so each object is decompressed only once.
    static (byte[] Data, int Consumed) InflateAt(ReadOnlySpan<byte> data, int startOffset, int expectedSize)
    {
        var inflater = new ICSharpCode.SharpZipLib.Zip.Compression.Inflater(false);
        inflater.SetInput(data[startOffset..].ToArray());

        // +1 spare byte lets the inflater reach end-of-stream and consume the
        // zlib trailer, so TotalIn reflects the full compressed length.
        var buffer = new byte[expectedSize + 1];
        var produced = inflater.Inflate(buffer);

        if (produced != expectedSize)
        {
            throw new InvalidOperationException($"Decompression size mismatch at offset {startOffset}");
        }

        return (buffer[..expectedSize], (int)inflater.TotalIn);
    }

    // SHA-1 over the canonical git object form: "<type> <length>\0" + content.
    // Hashed incrementally so large blobs never need one contiguous header+data
    // buffer (the previous version stackalloc'ed header+data, which could
    // overflow the stack for large objects).
    static string CalculateObjectSHA1(ObjectType objectType, byte[] data)
    {
        var headerText = $"{objectType.ToString().ToLowerInvariant()} {data.Length}\0";

        // "<type> <decimal length>\0" is always tiny, so a fixed stackalloc is safe here.
        Span<byte> headerBytes = stackalloc byte[64];
        var headerLength = System.Text.Encoding.UTF8.GetBytes(headerText, headerBytes);

        using var hash = System.Security.Cryptography.IncrementalHash.CreateHash(
            System.Security.Cryptography.HashAlgorithmName.SHA1);

        hash.AppendData(headerBytes[..headerLength]);
        hash.AppendData(data);

        return Convert.ToHexStringLower(hash.GetHashAndReset());
    }

    for (var i = 0; i < objectCount; i++)
    {
        var startOffset = offset;

        // Object header: 3 type bits plus a variable-length size (little-endian 7-bit
        // groups; the first byte contributes only its low 4 bits).
        var currentByte = span[offset++];
        var objectType = (ObjectType)((currentByte >> 4) & 0x7);
        long size = currentByte & 0xF;
        var shift = 4;

        while ((currentByte & 0x80) != 0)
        {
            currentByte = span[offset++];
            size |= (long)(currentByte & 0x7F) << shift;
            shift += 7;
        }

        ObjectType resolvedType;
        byte[] resolvedData;

        if (objectType == ObjectType.OfsDelta)
        {
            // The base is identified by a big-endian "offset encoding" of the distance
            // back from this object's start (each continuation adds an implicit +1).
            currentByte = span[offset++];
            long negativeOffset = currentByte & 0x7F;

            while ((currentByte & 0x80) != 0)
            {
                currentByte = span[offset++];
                negativeOffset = ((negativeOffset + 1) << 7) | ((long)currentByte & 0x7F);
            }

            var baseOffset = startOffset - negativeOffset;

            if (!objectsByOffset.TryGetValue(baseOffset, out var baseObj))
            {
                throw new InvalidOperationException($"Base object at offset {baseOffset} not found for OfsDelta at {startOffset}");
            }

            // `size` here is the uncompressed size of the delta instructions, not of the result.
            var (deltaData, consumed) = InflateAt(span, offset, (int)size);
            offset += consumed;

            // A delta carries its base's (already resolved) type.
            resolvedType = baseObj.Type;
            resolvedData = ApplyDelta(baseObj.Data, deltaData);
        }
        else if (objectType == ObjectType.RefDelta)
        {
            // The base is identified by its 20-byte SHA-1; O(1) lookup among objects
            // already parsed from this pack.
            var baseSHA1 = Convert.ToHexStringLower(span.Slice(offset, 20));
            offset += 20;

            if (!objectsBySHA1.TryGetValue(baseSHA1, out var baseObj))
            {
                throw new InvalidOperationException($"Base object {baseSHA1} not found for RefDelta at {startOffset}");
            }

            var (deltaData, consumed) = InflateAt(span, offset, (int)size);
            offset += consumed;

            resolvedType = baseObj.Type;
            resolvedData = ApplyDelta(baseObj.Data, deltaData);
        }
        else
        {
            // Regular object (commit, tree, blob, tag): the payload is the object itself.
            var (data, consumed) = InflateAt(span, offset, (int)size);
            offset += consumed;

            resolvedType = objectType;
            resolvedData = data;
        }

        var sha1 = CalculateObjectSHA1(resolvedType, resolvedData);

        // Record under both keys so later deltas in this pack can resolve against it.
        objectsByOffset[startOffset] = (resolvedType, resolvedData, sha1);
        objectsBySHA1[sha1] = (resolvedType, resolvedData);
        objects.Add(new PackObject(resolvedType, resolvedData.Length, resolvedData, sha1));
    }

    return objects;
}

public static IReadOnlyList ParseAllObjects(
    ReadOnlyMemory packFileData,
    IReadOnlyList indexEntries)