Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions implement/GitCore.UnitTests/ParsePackFileTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,50 @@ public void Generate_idx_and_rev_files_from_pack_file()
result.ReverseIndexData.Length.Should().Be(expectedRevFileData.Length, "Generated .rev file should have the same size");
result.ReverseIndexData.Span.SequenceEqual(expectedRevFileData.Span).Should().BeTrue("Generated .rev file should match expected content");
}

[Fact]
public void ParseAllObjectsDirectly_produces_same_results_as_index_based_parsing()
{
    // Both parsing paths must agree on every object in the reference pack file.
    var testFiles = TestData.LoadTestDataFiles_2025_10_27();

    var packData =
        testFiles[["objects", "pack", "pack-f0af0a07967292ae02df043ff4169bee06f6c143.pack"]];

    var idxData =
        testFiles[["objects", "pack", "pack-f0af0a07967292ae02df043ff4169bee06f6c143.idx"]];

    // Reference result: parse via the pre-built .idx file.
    var viaIndex = PackFile.ParseAllObjects(packData, PackIndex.ParsePackIndexV2(idxData));

    // Result under test: sequential parsing without an index.
    var viaDirect = PackFile.ParseAllObjectsDirectly(packData);

    viaDirect.Count.Should().Be(viaIndex.Count,
        "Direct parsing should produce the same number of objects as index-based parsing");

    // Key both result sets by SHA1 so objects can be matched pairwise.
    var viaIndexBySha = PackFile.GetObjectsBySHA1(viaIndex);
    var viaDirectBySha = PackFile.GetObjectsBySHA1(viaDirect);

    viaDirectBySha.Keys.Should().BeEquivalentTo(viaIndexBySha.Keys,
        "Direct parsing should produce objects with the same SHA1 hashes");

    // Compare type, size, and payload object by object.
    foreach (var pair in viaDirectBySha)
    {
        var sha1 = pair.Key;
        var fromDirect = pair.Value;
        var fromIndex = viaIndexBySha[sha1];

        fromDirect.Type.Should().Be(fromIndex.Type,
            $"Object {sha1} should have the same type in both parsing methods");

        fromDirect.Size.Should().Be(fromIndex.Size,
            $"Object {sha1} should have the same size in both parsing methods");

        fromDirect.Data.Span.SequenceEqual(fromIndex.Data.Span).Should().BeTrue(
            $"Object {sha1} should have the same data in both parsing methods");
    }
}
}
20 changes: 6 additions & 14 deletions implement/GitCore/LoadFromUrl.cs
Original file line number Diff line number Diff line change
Expand Up @@ -190,12 +190,8 @@ private static IReadOnlyDictionary<FilePath, ReadOnlyMemory<byte>> LoadSubdirect
private static (GitObjects.CommitObject commit, IReadOnlyDictionary<string, PackFile.PackObject> objectsBySHA1)
ParsePackFileAndGetCommit(ReadOnlyMemory<byte> packFileData, string commitSha)
{
// Generate index for the pack file
var indexResult = PackIndex.GeneratePackIndexV2(packFileData);
var indexEntries = PackIndex.ParsePackIndexV2(indexResult.IndexData);

// Parse all objects from the pack file
var objects = PackFile.ParseAllObjects(packFileData, indexEntries);
// Parse all objects directly from the pack file (more efficient than generating index first)
var objects = PackFile.ParseAllObjectsDirectly(packFileData);
var objectsBySHA1 = PackFile.GetObjectsBySHA1(objects);

// Get the commit object
Expand Down Expand Up @@ -282,10 +278,8 @@ private static async Task<IReadOnlyDictionary<FilePath, ReadOnlyMemory<byte>>> L
var blobsPackFileData =
await GitSmartHttp.FetchSpecificObjectsAsync(gitUrl, missingBlobShas, httpClient);

// Parse the blobs pack file
var blobsIndexResult = PackIndex.GeneratePackIndexV2(blobsPackFileData);
var blobsIndexEntries = PackIndex.ParsePackIndexV2(blobsIndexResult.IndexData);
var blobObjects = PackFile.ParseAllObjects(blobsPackFileData, blobsIndexEntries);
// Parse the blobs pack file directly without generating index
var blobObjects = PackFile.ParseAllObjectsDirectly(blobsPackFileData);

foreach (var blobObject in blobObjects)
{
Expand Down Expand Up @@ -431,10 +425,8 @@ public static async Task<Repository> FetchBloblessCloneAsync(
var bloblessPackFileData =
await GitSmartHttp.FetchBloblessPackFileAsync(gitUrl, commitSha, depth, httpClient);

// Parse the blobless pack file
var indexResult = PackIndex.GeneratePackIndexV2(bloblessPackFileData);
var indexEntries = PackIndex.ParsePackIndexV2(indexResult.IndexData);
var objects = PackFile.ParseAllObjects(bloblessPackFileData, indexEntries);
// Parse the blobless pack file directly without generating index
var objects = PackFile.ParseAllObjectsDirectly(bloblessPackFileData);
var objectsBySha = PackFile.GetObjectsBySHA1(objects);

return new Repository(objectsBySha.ToImmutableDictionary());
Expand Down
173 changes: 173 additions & 0 deletions implement/GitCore/PackFile.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,179 @@ public static bool VerifyPackFileChecksum(ReadOnlyMemory<byte> packFileData)
return storedChecksum.Span.SequenceEqual(calculatedChecksum);
}

/// <summary>
/// Parses all objects directly from a pack file without requiring a pre-generated index.
/// Objects are read sequentially; already-parsed objects are tracked by pack offset and
/// by SHA1 so that OfsDelta and RefDelta entries can be resolved against earlier objects.
/// </summary>
/// <param name="packFileData">The bytes of the pack file including its trailing 20-byte checksum.</param>
/// <returns>A read-only list of parsed pack objects, in pack order.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when a zlib stream does not decompress to the declared size, or when a delta's
/// base object cannot be found among the objects parsed so far (e.g. a thin pack whose
/// base lives outside this pack is not supported).
/// </exception>
public static IReadOnlyList<PackObject> ParseAllObjectsDirectly(ReadOnlyMemory<byte> packFileData)
{
    var header = ParsePackFileHeader(packFileData);
    var objectCount = (int)header.ObjectCount;

    // The last 20 bytes of a pack file are the SHA-1 checksum of everything before them.
    var dataWithoutChecksum = packFileData[..^20];
    var span = dataWithoutChecksum.Span;

    // Lookup tables for delta resolution: OfsDelta bases are addressed by pack offset,
    // RefDelta bases by SHA1. Both refer to objects that appeared earlier in the pack.
    var objectsByOffset = new Dictionary<long, (ObjectType Type, byte[] Data)>();
    var objectsBySHA1 = new Dictionary<string, (ObjectType Type, byte[] Data)>();
    var objects = new List<PackObject>(objectCount);

    var offset = 12; // Pack header: 4-byte magic, 4-byte version, 4-byte object count.

    // Inflates the zlib stream beginning at startOffset, verifies it yields exactly
    // expectedSize bytes, and reports how many compressed bytes were consumed.
    // Decompressing once and returning the data (instead of probing for the compressed
    // length and then decompressing again) halves the inflate work per object.
    static (byte[] Data, int CompressedLength) InflateAt(
        ReadOnlySpan<byte> data, int startOffset, int expectedSize)
    {
        var inflater = new ICSharpCode.SharpZipLib.Zip.Compression.Inflater(false);
        inflater.SetInput(data[startOffset..].ToArray());

        var output = new byte[expectedSize];
        var produced = 0;

        // Inflate may legally return fewer bytes than requested per call; loop until
        // the stream ends or the expected amount has been produced.
        while (produced < expectedSize && !inflater.IsFinished)
        {
            var step = inflater.Inflate(output, produced, expectedSize - produced);

            if (step == 0)
                break;

            produced += step;
        }

        if (produced != expectedSize)
        {
            throw new InvalidOperationException($"Decompression size mismatch at offset {startOffset}");
        }

        return (output, (int)inflater.TotalIn);
    }

    // Computes the git object id: SHA1("<type> <size>\0" + content).
    // IncrementalHash streams header and content separately, avoiding the
    // concatenated buffer (a stackalloc of header+content size would overflow
    // the stack for large blobs).
    static string ComputeObjectSHA1(ObjectType objectType, byte[] data)
    {
        var headerBytes = System.Text.Encoding.UTF8.GetBytes(
            $"{objectType.ToString().ToLowerInvariant()} {data.Length}\0");

        using var hasher = System.Security.Cryptography.IncrementalHash.CreateHash(
            System.Security.Cryptography.HashAlgorithmName.SHA1);

        hasher.AppendData(headerBytes);
        hasher.AppendData(data);

        return Convert.ToHexStringLower(hasher.GetHashAndReset());
    }

    // Parse objects sequentially in pack order.
    for (var i = 0; i < objectCount; i++)
    {
        var startOffset = offset;

        // Object entry header: 3-bit type plus a variable-length size
        // (4 bits in the first byte, then 7 bits per continuation byte, little-endian).
        var currentByte = span[offset++];
        var objectType = (ObjectType)((currentByte >> 4) & 0x7);
        long size = currentByte & 0xF;
        var shift = 4;

        while ((currentByte & 0x80) != 0)
        {
            currentByte = span[offset++];
            size |= (long)(currentByte & 0x7F) << shift;
            shift += 7;
        }

        ObjectType resolvedType;
        byte[] resolvedData;

        if (objectType == ObjectType.OfsDelta)
        {
            // Delta against an earlier object, located by a big-endian base-128
            // negative offset with the +1 bias defined by the pack format.
            currentByte = span[offset++];
            long negativeOffset = currentByte & 0x7F;

            while ((currentByte & 0x80) != 0)
            {
                currentByte = span[offset++];
                negativeOffset = ((negativeOffset + 1) << 7) | ((long)currentByte & 0x7F);
            }

            var baseOffset = startOffset - negativeOffset;

            if (!objectsByOffset.TryGetValue(baseOffset, out var baseObj))
            {
                throw new InvalidOperationException(
                    $"Base object at offset {baseOffset} not found for OfsDelta at {startOffset}");
            }

            var (deltaData, compressedLength) = InflateAt(span, offset, (int)size);
            offset += compressedLength;

            // The reconstructed object has the base object's type.
            resolvedType = baseObj.Type;
            resolvedData = ApplyDelta(baseObj.Data, deltaData);
        }
        else if (objectType == ObjectType.RefDelta)
        {
            // Delta against a base object identified by its raw 20-byte SHA1.
            var baseSHA1 = Convert.ToHexStringLower(span.Slice(offset, 20));
            offset += 20;

            // NOTE: thin packs may reference a base outside this pack; that case is
            // intentionally unsupported here and fails with the exception below.
            if (!objectsBySHA1.TryGetValue(baseSHA1, out var baseObj))
            {
                throw new InvalidOperationException(
                    $"Base object {baseSHA1} not found for RefDelta at {startOffset}");
            }

            var (deltaData, compressedLength) = InflateAt(span, offset, (int)size);
            offset += compressedLength;

            resolvedType = baseObj.Type;
            resolvedData = ApplyDelta(baseObj.Data, deltaData);
        }
        else
        {
            // Non-delta object (commit, tree, blob, tag): the payload is the zlib stream itself.
            var (decompressed, compressedLength) = InflateAt(span, offset, (int)size);
            offset += compressedLength;

            resolvedType = objectType;
            resolvedData = decompressed;
        }

        var sha1 = ComputeObjectSHA1(resolvedType, resolvedData);

        // Register the object under both keys so later deltas can resolve against it.
        objectsByOffset[startOffset] = (resolvedType, resolvedData);
        objectsBySHA1[sha1] = (resolvedType, resolvedData);

        objects.Add(new PackObject(resolvedType, resolvedData.Length, resolvedData, sha1));
    }

    return objects;
}

public static IReadOnlyList<PackObject> ParseAllObjects(
ReadOnlyMemory<byte> packFileData,
IReadOnlyList<PackIndex.IndexEntry> indexEntries)
Expand Down
Loading